# In [None]:
# Yahoo Finance ETL - Exploratory Data Analysis

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Define data paths (relative to the working directory the notebook runs in)
raw_path = "../data/raw/extract.parquet"
curated_path = "../data/curated/prices.parquet"

# Load data
# The RAW extract is only previewed in this section, so a missing file is
# reported and its preview skipped rather than failing later with a NameError
# on an undefined df_raw.
df_raw = None
if os.path.exists(raw_path):
    df_raw = pd.read_parquet(raw_path)
    print(f"Loaded RAW data: {df_raw.shape[0]:,} rows")
else:
    print("Raw data not found. Run the ETL first.")

# The validation, plotting and correlation steps below all read df_curated,
# so stop here with an actionable error instead of an unrelated NameError.
if not os.path.exists(curated_path):
    raise FileNotFoundError(
        "Curated data not found. Run the transform step first."
    )
df_curated = pd.read_parquet(curated_path)
print(f"Loaded CURATED data: {df_curated.shape[0]:,} rows")


# Preview data
if df_raw is not None:
    print("\nRAW Data Sample:")
    display(df_raw.head())

print("\nCURATED Data Sample:")
display(df_curated.head())

# Basic validation: per-column null counts, then the distinct symbols present
print("\nMissing Values per Column:")
missing_counts = df_curated.isna().sum()
display(missing_counts)

print("\nUnique Symbols:")
symbols_present = df_curated['symbol'].unique()
print(symbols_present)

# Statistical summary as produced by DataFrame.describe()
summary_stats = df_curated.describe()
display(summary_stats)

# Plot sample stock trend
symbol = 'AAPL'  # Change to any ticker from your symbols.txt
is_selected = df_curated['symbol'] == symbol
df_stock = df_curated.loc[is_selected].sort_values('date')

# One entry per line drawn: (column to plot, legend label, line styling)
series_specs = [
    ('close', 'Close', {'linewidth': 2}),
    ('moving_avg_7', '7-day MA', {'linestyle': '--'}),
    ('moving_avg_30', '30-day MA', {'linestyle': '--'}),
]

fig, ax = plt.subplots(figsize=(12, 6))
for column, label, line_style in series_specs:
    ax.plot(df_stock['date'], df_stock[column], label=label, **line_style)

ax.set_title(f"{symbol} - Close Price and Moving Averages")
ax.set_xlabel("Date")
ax.set_ylabel("Price ($)")
ax.legend()
ax.grid(True)
plt.show()

# Daily return distribution for the ticker selected above (NaN rows dropped)
# NOTE(review): the x-axis label says "(%)" but the units of return_1d are not
# visible in this file -- confirm against the transform step.
daily_returns = df_stock['return_1d'].dropna()

plt.figure(figsize=(8, 5))
hist_ax = sns.histplot(daily_returns, bins=40, kde=True)
hist_ax.set_title(f"{symbol} - Daily Return Distribution")
hist_ax.set_xlabel("Daily Return (%)")
hist_ax.set_ylabel("Frequency")
plt.show()

# Correlation check: pairwise correlation of the price, volume and return
# columns, computed over every row of df_curated (not filtered by symbol).
corr_columns = [
    'open',
    'high',
    'low',
    'close',
    'adj_close',
    'volume',
    'return_1d',
]
corr = df_curated[corr_columns].corr()

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()