## 1. Data Collection

In [None]:
import os

import pandas as pd
import yfinance as yf

In [None]:
# Function to fetch stock data
def fetch_stock_data(ticker, start="2010-01-01", end="2025-01-01"):
    """Download price history for ``ticker`` and return it as a flat table.

    Args:
        ticker: Symbol passed straight through to ``yf.download``.
        start: Start date string forwarded to ``yf.download``.
        end: End date string forwarded to ``yf.download``.

    Returns:
        A DataFrame restricted to the Open, High, Low, Close and Volume
        columns, with the downloaded index moved into a regular column.
    """
    ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    raw = yf.download(ticker, start=start, end=end, multi_level_index=False)
    # Keep only the price/volume fields, then turn the index into a column
    return raw[ohlcv_columns].reset_index()

# Example usage - replace "AAPL" with any stock symbol
stock_symbol = "AAPL"  # You can change this to any stock
data = fetch_stock_data(stock_symbol)

# Save the raw download as Parquet (optional).
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./stock_data", exist_ok=True)
data.to_parquet(f"./stock_data/{stock_symbol}_stock_data.parquet", index=False)

# Display the last few rows
print(data.tail())

## 2. Data Integrity and Verification

In [None]:
# Count the missing values in every column
print(data.isna().sum())

# Show the last rows of the frame
print(data.tail())

In [None]:
# Inspect the dtype of each column; as a bare expression its value is
# presumably what the notebook renders as this cell's output.
data.dtypes  # Check data types

In [None]:
# Remove exact duplicate rows. Date is deliberately kept as a regular column
# here: drop_duplicates() ignores the index, so de-duplicating with Date as
# the index would treat two different days that happen to share identical
# Open/High/Low/Close/Volume values as duplicates and silently delete one.
data = data.drop_duplicates()

# Drop rows whose closing price is not strictly positive (a missing Close
# also fails this comparison and is removed).
data = data[data['Close'] > 0]

# Renumber the remaining rows so the index is a fresh 0..n-1 range
data = data.reset_index(drop=True)

In [None]:
# Order rows chronologically (sort_values sorts ascending by default)
data = data.sort_values('Date')

In [None]:
# Summarise the frame: column dtypes / non-null counts first.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
data.info()
print("\n")
print(data.describe())  # Get summary statistics

# Discard the current index and renumber rows from 0
data = data.reset_index(drop=True)

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing the cleaned data.
os.makedirs("./stock_data", exist_ok=True)
data.to_parquet("./stock_data/Cleaned_Stock_Data.parquet", index=False)