# Phase 2: Exploratory Data Analysis (EDA)
## Step 2.1: Overview & Descriptive Statistics
In this notebook, we perform a deep dive into the cleaned dataset (`step1_quality_checked.csv`) to understand distributions, correlations, and potential patterns before feature engineering.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add the project root to the import path so that `config` (imported below)
# can be resolved from this notebook.
# NOTE(review): the relative '../' assumes the kernel's working directory sits
# one level below the project root — confirm for your launch setup.
project_root = Path('../').resolve()
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from config import PROCESSED_DATA_DIR, LOGS_DIR

# Plotting / display defaults applied to the figures and tables below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', None)  # never truncate columns of wide frames

print("‚úÖ Libraries Loaded")

In [None]:
# Load Quality-Checked Data
# Reads the Phase 1 output into `df`, which the cells below operate on.
data_path = PROCESSED_DATA_DIR / "step1_quality_checked.csv"

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"‚ùå File not found: {data_path}. Please complete Phase 1 first.")
    # Without `df` the following cells cannot run; stop here with the real cause
    # instead of letting them fail later with an unrelated NameError.
    raise
else:
    print(f"‚úÖ Loaded Dataset from: {data_path}")
    print(f"Shape: {df.shape}")

In [None]:
# --- Step 2.1: Overview & Descriptive Statistics ---
# Prints column dtypes / non-null counts, then summary tables for numeric and
# object columns, and finally the class balance of `target_hit` in percent.
print("\n--- Dataset Info ---")
df.info()

print("\n--- Numerical Statistics (Summary) ---")
display(df.describe().round(4))

print("\n--- Categorical Statistics ---")
# describe(include=['O']) raises ValueError when no object column exists,
# so check for at least one before asking for the summary.
if df.select_dtypes(include=['O']).shape[1] > 0:
    display(df.describe(include=['O']))
else:
    print("No categorical (object) columns found.")

print("\n--- Target Balance Check (Refresher) ---")
if 'target_hit' in df.columns:
    # Share of each class, expressed as a percentage with two decimals
    print(df['target_hit'].value_counts(normalize=True).mul(100).round(2))


## Step 2.2: Distributions
Here we visualize the distribution of our target variables (`target_hit`, `stop_hit`) to check for imbalance, and examine key numeric features.

In [None]:
# 1. Target Distributions
# One count plot per outcome column, side by side, with the count written
# above each bar. A panel is left empty when its column is absent.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (column, title, palette) for each panel, left to right
target_panels = [
    ('target_hit', 'Target Hit Distribution', 'viridis'),
    ('stop_hit', 'Stop Hit Distribution', 'magma'),
]

for ax, (col, title, palette) in zip(axes, target_panels):
    if col not in df.columns:
        continue
    sns.countplot(x=col, data=df, ax=ax, palette=palette)
    ax.set_title(title)
    for p in ax.patches:
        height = p.get_height()
        # Bar heights come back as floats; format without decimals so the
        # label reads "1500" rather than "1500.0".
        ax.annotate(f'{height:.0f}', (p.get_x() + p.get_width() / 2., height), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# 2. Key Numeric Feature Distributions
# Select a few key features to visualize - modify this list as needed
key_numeric_cols = ['RSI', 'volume', 'close', 'time_to_event']
available_cols = [c for c in key_numeric_cols if c in df.columns]

if available_cols:
    # Size the grid from the list: a fixed 2x2 layout fails as soon as a fifth
    # feature is added to key_numeric_cols.
    n_plot_cols = 2
    n_plot_rows = (len(available_cols) + n_plot_cols - 1) // n_plot_cols  # ceiling division
    fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(15, 5 * n_plot_rows), squeeze=False)
    flat_axes = axes.ravel()

    # Histogram with a KDE overlay for each available feature
    for ax, col in zip(flat_axes, available_cols):
        sns.histplot(df[col], kde=True, bins=50, color='skyblue', ax=ax)
        ax.set_title(f'Distribution of {col}')

    # Hide the trailing panel left over when the feature count is odd
    for ax in flat_axes[len(available_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("Key numeric columns for visualization not found.")

## Step 2.3: Correlations (Matrix & Heatmap)
Understanding relationships between numeric features and the target variable.

In [None]:
# Filter only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# 1. Correlation with Target
# `corr_target` holds each numeric feature's correlation with target_hit,
# sorted from most positive to most negative.
if 'target_hit' in numeric_df.columns:
    corr_target = (
        numeric_df.drop(columns='target_hit')   # the target's self-correlation (1.0) is not a finding
        .corrwith(numeric_df['target_hit'])
        .dropna()                               # drop features whose coefficient is undefined (NaN)
        .sort_values(ascending=False)
    )

    if corr_target.empty:
        print("No numeric features available to correlate with target_hit.")
    else:
        plt.figure(figsize=(10, 8))
        sns.barplot(y=corr_target.index[:20], x=corr_target.values[:20], palette='coolwarm')
        plt.title('Top 20 Features Correlated with target_hit')
        plt.xlabel('Correlation Coefficient')
        plt.show()

# 2. General Correlation Heatmap (Feature-Feature)
# Using a subset of features to keep the plot readable
focus_features = ['target_hit', 'stop_hit', 'RSI', 'close', 'volume', 'time_to_event']
# Add some potential EMA features if they exist
extra_feats = [c for c in ['ema21', 'ema50', 'ema100', 'atr_1h'] if c in df.columns]
focus_features.extend(extra_feats)

# The outcome columns are optional for the heatmap; every other focus feature is required.
optional_feats = ('target_hit', 'stop_hit')
required_present = all(col in df.columns for col in focus_features if col not in optional_feats)

if required_present:
    # Select only the columns that exist, so a missing optional outcome column
    # does not raise a KeyError when indexing the frame.
    heatmap_cols = [col for col in focus_features if col in df.columns]
    corr_matrix = df[heatmap_cols].select_dtypes(include=np.number).corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='RdBu', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Matrix (Selected Features)')
    plt.show()
else:
    print("Some focus features for heatmap are missing.")

## Step 2.4: Pairplots / Boxplots / Histograms
Visualizing interactions between features and the target.

In [None]:
# 1. Boxplots: one panel per key feature, split by the target outcome
key_feats_for_boxplot = ['RSI', 'volume', 'time_to_event']
available_box_feats = [feat for feat in key_feats_for_boxplot if feat in df.columns]

# Draw only when the target column exists and at least one feature is available
if available_box_feats and 'target_hit' in df.columns:
    plt.figure(figsize=(15, 5))
    # Panels are numbered from 1, left to right, in a single row of three slots
    for position, feature in enumerate(available_box_feats, start=1):
        plt.subplot(1, 3, position)
        sns.boxplot(data=df, x='target_hit', y=feature, palette='Set2')
        plt.title(f'{feature} by Target Hit')
    plt.tight_layout()
    plt.show()

In [None]:
# 2. Pairplot (Subset)
# Only a handful of columns are plotted; the column list is deliberately kept
# short so the plot does not become large enough to crash the kernel.
pairplot_cols = ['target_hit', 'RSI', 'time_to_event']
if 'close' in df.columns:
    pairplot_cols += ['close']

available_pair_cols = [name for name in pairplot_cols if name in df.columns]

# At least two columns are needed for a pairplot to be drawn
if len(available_pair_cols) >= 2:
    # Colour by outcome only when the target column is among the plotted columns
    hue_col = 'target_hit' if 'target_hit' in available_pair_cols else None
    sns.pairplot(
        df[available_pair_cols],
        hue=hue_col,
        palette='viridis',
        diag_kind='kde',
    )
    plt.suptitle("Pairplot of Selected Features", y=1.02)
    plt.show()

## Step 2.5: Insights Summary
Summary of findings from the EDA phase.

In [None]:
# Text summary of the EDA: class balance, strongest correlations with the
# target, and a reminder about skewed features. Relies on `df`, `numeric_df`
# and `corr_target` created by the cells above.
print("--- Auto-Generated Insights Summary ---")

# 1. Data Balance
if 'target_hit' in df.columns:
    target_counts = df['target_hit'].value_counts(normalize=True)
    print(f"üîπ Target Balance: The dataset has {target_counts.get(1, 0):.1%} positive samples (1) and {target_counts.get(0, 0):.1%} negative samples (0).")

# 2. Top Correlations
if 'target_hit' in numeric_df.columns:
    # Exclude the target's own entry and undefined (NaN) coefficients so they
    # cannot appear in either list.
    ranked_corr = corr_target.drop(labels='target_hit', errors='ignore').dropna()
    # Filter by sign so the two lists never overlap or report the wrong
    # direction when only a few features are available.
    top_pos_corr = ranked_corr[ranked_corr > 0].nlargest(3).index.tolist()
    top_neg_corr = ranked_corr[ranked_corr < 0].nsmallest(3).index.tolist()
    print(f"üîπ Top Positive Correlations with Target: {top_pos_corr}")
    print(f"üîπ Top Negative Correlations with Target: {top_neg_corr}")

# 3. Distribution reminder (static text; no outlier statistics are computed in this cell)
print("üîπ Feature Distributions: Check the histograms above for skewness. Features like 'volume' and 'time_to_event' often require log-transformation.")

print("\nüìù Note: Use these insights to guide Phase 3 (Feature Engineering). For example, if 'RSI' shows good separation in boxplots, prioritize it for interaction features.")