# 01 Data Exploration
- Load and clean data
- Exploratory Data Analysis (EDA)
- Visualizations: price, returns, volatility
- Outlier detection, missing values
- Stationarity tests (ADF)
- Value at Risk (VaR), Sharpe Ratio

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

# Use seaborn's white-grid theme for every figure below
sns.set(style='whitegrid')

# Load one processed CSV per ticker into `data`, keyed by ticker symbol.
# The 'Date' column is converted to datetime so the plots get a real time axis.
assets = ['TSLA', 'BND', 'SPY']
data = {}
for asset in assets:
    csv_path = f'../data/processed/{asset}_processed.csv'
    df = pd.read_csv(csv_path)
    df['Date'] = pd.to_datetime(df['Date'])
    data[asset] = df
loaded_names = ', '.join(data)
print('Data loaded for:', loaded_names)

# Per asset: count the missing entries in every column, then print the
# describe() summary of the frame.
for asset, df in data.items():
    null_counts = df.isna().sum()
    summary = df.describe()
    # sep='\n' puts each table on the line after its header
    print(f'\n{asset} missing values:', null_counts, sep='\n')
    print(f'\n{asset} statistics:', summary, sep='\n')

# Adjusted close of every asset drawn on one shared set of axes
fig, ax = plt.subplots(figsize=(12, 6))
for asset, df in data.items():
    ax.plot(df['Date'], df['Adj Close'], label=asset)
ax.set_title('Adjusted Close Price Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Adj Close')
ax.legend()
plt.show()

# Plot daily returns. Nothing is computed here: the 'Return' column is read
# straight from the processed CSVs.
fig, ax = plt.subplots(figsize=(12, 6))
for asset, df in data.items():
    ax.plot(df['Date'], df['Return'], label=asset)
ax.set_title('Daily Returns Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Return')
ax.legend()
plt.show()

# Plot the precomputed 'Volatility' column of each asset.
# NOTE(review): the title says 21-day rolling std, but the window is set
# wherever the processed CSVs are produced - confirm it there.
fig, ax = plt.subplots(figsize=(12, 6))
for asset, df in data.items():
    ax.plot(df['Date'], df['Volatility'], label=asset)
ax.set_title('21-Day Rolling Volatility')
ax.set_xlabel('Date')
ax.set_ylabel('Volatility')
ax.legend()
plt.show()

# Outlier detection: plot the precomputed 'z_score' column of each asset so
# that extreme observations stand out visually.
fig, ax = plt.subplots(figsize=(12, 6))
for asset, df in data.items():
    ax.plot(df['Date'], df['z_score'], label=asset)
ax.set_title('Z-Score of Daily Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Z-Score')
ax.legend()
plt.show()

# Augmented Dickey-Fuller test for stationarity
# Runs the test on the price level ('Adj Close') and on 'Return' for every
# asset. The ADF null hypothesis is a unit root, so a p-value below 0.05 is
# reported as 'Stationary' and anything else as 'Non-stationary'.
for asset, df in data.items():
    # `lead` reproduces the blank line that precedes only the first header
    # printed for each asset.
    for column, lead in (('Adj Close', '\n'), ('Return', '')):
        print(f'{lead}ADF Test for {asset} ({column}):')
        # Drop missing values for both series: adfuller cannot handle NaN,
        # and nothing earlier in this notebook removes them from the data.
        result = adfuller(df[column].dropna())
        print(f'ADF Statistic: {result[0]:.4f}, p-value: {result[1]:.4f}')
        print('Stationary' if result[1] < 0.05 else 'Non-stationary')

# Value at Risk (VaR) and Sharpe Ratio, collected per asset in `risk_metrics`.
# VaR_95 is the empirical 5th percentile of the 'Return' column; the Sharpe
# ratio is mean/std scaled by sqrt(252), with no risk-free rate subtracted.
# NOTE(review): 252 is presumably trading days per year - confirm the
# 'Return' column really is daily.
annualization = np.sqrt(252)
risk_metrics = {}
for asset, df in data.items():
    returns = df['Return'].dropna()
    var_95 = np.percentile(returns, 5)
    mean_over_std = returns.mean() / returns.std()
    sharpe = mean_over_std * annualization
    risk_metrics[asset] = {
        'VaR_95': var_95,
        'Sharpe': sharpe,
    }
    print(f'\n{asset} VaR (95%): {var_95:.4f}, Sharpe Ratio: {sharpe:.2f}')
