In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet plus seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

import sys
# Put the parent directory on the import path so the `config` package resolves.
# NOTE(review): presumably the notebook lives one level below the project root — confirm.
sys.path.append('..')
# Project settings used by this notebook: ticker list and data directory paths.
from config.config import TICKERS, RAW_DATA_PATH, PROCESSED_DATA_PATH

In [52]:
# Read one CSV per configured ticker into a dict keyed by ticker symbol.
# A ticker whose file is missing or unreadable is reported and skipped,
# so the final count may be smaller than len(TICKERS).
stock_data = {}

for ticker in TICKERS:
    filepath = f"{RAW_DATA_PATH}{ticker}.csv"
    try:
        # The 'Date' column is parsed as datetimes while reading.
        df = pd.read_csv(filepath, parse_dates=['Date'])
        n_rows, n_cols = df.shape
        stock_data[ticker] = df
        print(f"✓ Loaded {ticker}: {n_rows} rows, {n_cols} columns")
    except FileNotFoundError:
        print(f"✗ ERROR: File not found for {ticker}")
    except Exception as e:
        # Any other read/parse failure: report it and carry on with the next ticker.
        print(f"✗ ERROR loading {ticker}: {e}")

loaded_count = len(stock_data)
print(f"\nTotal stocks loaded: {loaded_count}")

✓ Loaded PG: 1254 rows, 6 columns
✓ Loaded KO: 1254 rows, 6 columns
✓ Loaded WMT: 1254 rows, 6 columns
✓ Loaded COST: 1255 rows, 6 columns
✓ Loaded JNJ: 1254 rows, 6 columns
✓ Loaded UNH: 1254 rows, 6 columns
✓ Loaded PFE: 1254 rows, 6 columns
✓ Loaded MRK: 1254 rows, 6 columns

Total stocks loaded: 8


In [53]:
# Fail early with a clear message if nothing was loaded; pd.concat would
# otherwise raise a less helpful "No objects to concatenate" ValueError.
if not stock_data:
    raise ValueError("stock_data is empty: no ticker files were loaded")

# Stack the per-ticker frames into a single long frame. Passing the dict to
# pd.concat uses its keys as the outer index level (named "Ticker");
# reset_index(level=0) then turns that level into an ordinary column.
stock_df = pd.concat(
    stock_data,
    names=["Ticker", "Row"]
).reset_index(level=0)

# Work on a copy so later cleaning steps never modify stock_df.
df_clean = stock_df.copy()

# Preview goes last: only the final expression of a cell is displayed.
stock_df.head()

In [54]:
# Count missing entries per column before any cleaning is applied.
missing_per_column = df_clean.isna().sum()
print("Missing Values: ", missing_per_column)

# Sort rows by the Date column, then renumber the index from zero.
df_clean = df_clean.sort_values(by='Date')
df_clean = df_clean.reset_index(drop=True)

print("Shape: ", df_clean.shape)
df_clean.head(10)


Missing Values:  Ticker    0
Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64
Shape:  (10033, 7)


Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,PG,2021-01-19 00:00:00-05:00,119.742797,120.32519,117.854444,117.889748,9184900
1,KO,2021-01-19 00:00:00-05:00,41.93336,42.191464,41.503186,41.735477,29114500
2,MRK,2021-01-19 00:00:00-05:00,68.03759,68.306258,67.296727,67.728218,8467316
3,WMT,2021-01-19 00:00:00-05:00,45.045907,45.179824,44.432381,44.656612,24123600
4,PFE,2021-01-19 00:00:00-05:00,28.803764,28.99182,28.694066,28.780258,33451300
5,COST,2021-01-19 00:00:00-05:00,342.856024,343.414292,333.469982,335.390717,4533800
6,UNH,2021-01-19 00:00:00-05:00,328.332176,329.023479,324.046101,324.626801,3361300
7,JNJ,2021-01-19 00:00:00-05:00,140.549417,141.451704,139.768592,141.226135,9057600
8,COST,2021-01-19 00:00:00-05:00,342.856024,343.414292,333.469982,335.390717,4533800
9,KO,2021-01-20 00:00:00-05:00,41.761296,42.002193,41.563417,41.881744,23115800


In [55]:
def clean_data(df, group_col="Ticker", ffill_limit=3):
    """Drop duplicate rows and repair short gaps of missing values.

    Steps:
      1. Remove fully duplicated rows and renumber the index from zero.
      2. If any value is missing, forward-fill at most ``ffill_limit``
         consecutive gaps, then drop rows that still contain a missing value.

    Forward-filling is done separately for each value of ``group_col`` when
    that column exists, so a gap in one ticker is never filled with the
    previous row of a different ticker in an interleaved frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    group_col : str or None, default "Ticker"
        Column whose values delimit independent series. If ``None`` or not
        present in ``df``, the whole frame is forward-filled as one series.
    ffill_limit : int, default 3
        Maximum number of consecutive missing values to forward-fill.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Rows removed by the final ``dropna`` leave gaps
        in the index.
    """
    # drop_duplicates returns a new object, so the caller's frame is untouched.
    df = df.drop_duplicates().reset_index(drop=True)

    missing_count = df.isna().sum().sum()
    if missing_count > 0:
        # The frame may hold many tickers, so the message reports the total
        # rather than relying on a `ticker` variable from the notebook scope.
        print(f"Found {missing_count} missing values")

        if group_col is not None and group_col in df.columns:
            # The group-wise ffill result is aligned on df's index.
            filled = df.groupby(group_col, sort=False).ffill(limit=ffill_limit)
            for col in filled.columns:
                df[col] = filled[col]
        else:
            # DataFrame.ffill replaces the deprecated fillna(method='ffill').
            df = df.ffill(limit=ffill_limit)

        # Anything still missing (leading gaps, runs longer than the limit) is dropped.
        df = df.dropna()
    return df

# Run the cleaning routine on the combined frame and preview the first 10 rows.
df_clean = df_clean.pipe(clean_data)
df_clean.head(10)

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,PG,2021-01-19 00:00:00-05:00,119.742797,120.32519,117.854444,117.889748,9184900
1,KO,2021-01-19 00:00:00-05:00,41.93336,42.191464,41.503186,41.735477,29114500
2,MRK,2021-01-19 00:00:00-05:00,68.03759,68.306258,67.296727,67.728218,8467316
3,WMT,2021-01-19 00:00:00-05:00,45.045907,45.179824,44.432381,44.656612,24123600
4,PFE,2021-01-19 00:00:00-05:00,28.803764,28.99182,28.694066,28.780258,33451300
5,COST,2021-01-19 00:00:00-05:00,342.856024,343.414292,333.469982,335.390717,4533800
6,UNH,2021-01-19 00:00:00-05:00,328.332176,329.023479,324.046101,324.626801,3361300
7,JNJ,2021-01-19 00:00:00-05:00,140.549417,141.451704,139.768592,141.226135,9057600
8,KO,2021-01-20 00:00:00-05:00,41.761296,42.002193,41.563417,41.881744,23115800
9,WMT,2021-01-20 00:00:00-05:00,44.718902,45.363571,44.500898,45.316856,23775600
