## 1. Data Collection

In [43]:
from pathlib import Path

import pandas as pd
import yfinance as yf

In [44]:
# Function to fetch stock data
def fetch_stock_data(ticker, start="2010-01-01", end="2025-01-01"):
    """Download price history for ``ticker`` and return it as a flat table.

    Parameters
    ----------
    ticker : str
        Symbol passed straight to ``yf.download``.
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        The Open, High, Low, Close and Volume columns, preceded by the
        former index (the dates) as an ordinary column, with a fresh
        integer index.

    Raises
    ------
    ValueError
        If the download comes back empty, so that an empty table is not
        silently carried into the later cleaning and saving steps.
    """
    raw = yf.download(ticker, start=start, end=end, multi_level_index=False)

    # Fail loudly here instead of letting an empty frame flow downstream.
    if raw is None or raw.empty:
        raise ValueError(
            f"No price data returned for {ticker!r} between {start} and {end}"
        )

    # Select the price/volume columns, then move the index into a column;
    # the non-inplace reset_index() returns a new frame.
    return raw[['Open', 'High', 'Low', 'Close', 'Volume']].reset_index()

# Example usage - replace "AAPL" with any other ticker symbol
stock_symbol = "AAPL"
data = fetch_stock_data(stock_symbol)

# Save the raw download as Parquet (optional). The output folder is created
# first because to_parquet does not create missing parent directories.
output_dir = Path("./stock_data")
output_dir.mkdir(parents=True, exist_ok=True)
data.to_parquet(output_dir / f"{stock_symbol}_stock_data.parquet", index=False)

# Display the last few rows
print(data.tail())

[*********************100%***********************]  1 of 1 completed

           Date        Open        High         Low       Close    Volume
3769 2024-12-24  255.209412  257.926411  255.009620  257.916443  23234700
3770 2024-12-26  257.906429  259.814335  257.347047  258.735504  27237100
3771 2024-12-27  257.546826  258.415896  252.782075  255.309296  42355300
3772 2024-12-30  251.952985  253.221595  250.474615  251.923019  35557500
3773 2024-12-31  252.162760  253.001833  249.156056  250.144974  39480700





## 2. Data Integrity and Verification

In [45]:
# Count the missing values in every column
missing_per_column = data.isna().sum()
print(missing_per_column)

# Show the most recent rows again
print(data.tail())

Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64
           Date        Open        High         Low       Close    Volume
3769 2024-12-24  255.209412  257.926411  255.009620  257.916443  23234700
3770 2024-12-26  257.906429  259.814335  257.347047  258.735504  27237100
3771 2024-12-27  257.546826  258.415896  252.782075  255.309296  42355300
3772 2024-12-30  251.952985  253.221595  250.474615  251.923019  35557500
3773 2024-12-31  252.162760  253.001833  249.156056  250.144974  39480700


In [46]:
# Inspect the dtype of every column; as the last expression in the cell,
# the resulting Series is displayed as the cell's output.
data.dtypes

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume             int64
dtype: object

In [47]:
# Remove rows that repeat an already-seen date, keeping the first occurrence.
# De-duplicating on Date (instead of on the price columns with Date moved to
# the index) keeps distinct days that happen to share identical
# Open/High/Low/Close/Volume values.
data = data.drop_duplicates(subset="Date", keep="first")

# Drop rows with a non-positive closing price and renumber the rows 0..n-1;
# Date stays an ordinary column throughout.
data = data[data["Close"] > 0].reset_index(drop=True)

In [48]:
# Order the rows chronologically, oldest first
data = data.sort_values("Date")

In [49]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()
print("\n")
print(data.describe())  # Get summary statistics

# Renumber the rows 0..n-1 before saving
data = data.reset_index(drop=True)

# Make sure the output folder exists; to_parquet does not create it.
Path("./stock_data").mkdir(parents=True, exist_ok=True)
data.to_parquet("./stock_data/Cleaned_Stock_Data.parquet", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3774 entries, 0 to 3773
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3774 non-null   datetime64[ns]
 1   Open    3774 non-null   float64       
 2   High    3774 non-null   float64       
 3   Low     3774 non-null   float64       
 4   Close   3774 non-null   float64       
 5   Volume  3774 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 177.0 KB
None


                                Date         Open         High          Low  \
count                           3774  3774.000000  3774.000000  3774.000000   
mean   2017-07-01 21:42:38.346582016    67.465510    68.176227    66.797248   
min              2010-01-04 00:00:00     5.789106     5.898343     5.725309   
25%              2013-10-02 06:00:00    17.465267    17.645444    17.316001   
50%              2017-07-01 12:00:00    35.354725    35.728740    34.