#  🛠️ Feature Engineering for Stock Price Prediction
# 
# This notebook demonstrates feature engineering using the existing `src/data.py` and new `src/features.py` modules.
# 
# **Objectives:**
# - Load stock data using existing data functions
# - Create comprehensive technical features
# - Validate feature quality and relationships
# - Prepare features for modeling
# 
# **Dependencies:**
# - Uses existing `src/data.py` for data loading 
# - Uses new `src/features.py` for feature creation

# Setup and Imports

In [None]:
import json
import os
import sys
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# The project modules live in ../src; extend the path BEFORE importing them
sys.path.append('../src')

# Import existing data functions
from data import (
    load_raw_data,
    clean_data,
    calculate_returns,
    validate_data_quality,
    get_trading_days
)

# Import new feature functions
from features import (
    create_all_features,
    create_technical_indicators,
    create_price_features,
    create_volume_features,
    create_return_features,
    create_lag_features,
    validate_features,
    get_feature_importance_groups,
    process_stock_features
)

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style plus the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report that setup finished, stamped with the current local time
print("✅ All modules imported successfully!")
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"📅 Analysis Date: {run_timestamp}")

#  Load Stock Data (USES existing data.py functions)

In [None]:
print("📊 Loading stock data using existing data.py functions...")

# Tickers covered by this analysis; each is read from ../data/raw/<TICKER>.csv
tickers = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN']

# ticker -> loaded frame, filled only for tickers that pass the quality check
stock_data = {}

for ticker in tickers:
    try:
        data = load_raw_data(f'../data/raw/{ticker}.csv')

        # Only keep data whose quality report has a truthy 'overall' entry
        quality_check = validate_data_quality(data)
        if not quality_check['overall']:
            print(f"⚠️ {ticker}: Data quality issues detected")
            continue

        stock_data[ticker] = data
        first_day = data.index.min().date()
        last_day = data.index.max().date()
        print(f"✅ {ticker}: {len(data)} rows loaded ({first_day} to {last_day})")

    except FileNotFoundError:
        print(f"❌ {ticker}: Data file not found - run data download first")
    except Exception as e:
        # Any other failure is reported and the loop moves on to the next ticker
        print(f"❌ {ticker}: Error loading data - {e}")

print(f"\n📈 Successfully loaded {len(stock_data)} stocks for feature engineering")

# %%
# Basic Feature Creation - Technical Indicators
print("🔧 Creating technical indicators...")

# AAPL is the worked example for the step-by-step feature walkthrough
if 'AAPL' in stock_data:
    # Work on a copy so the loaded frame in stock_data is not the one passed on
    aapl_data = stock_data['AAPL'].copy()
    aapl_with_tech = create_technical_indicators(aapl_data)

    n_original = len(aapl_data.columns)
    n_with_tech = len(aapl_with_tech.columns)
    print(f"Original columns: {n_original}")
    print(f"With technical indicators: {n_with_tech}")
    print(f"New features added: {n_with_tech - n_original}")

    # Columns present in the result but absent from the input frame
    original_columns = set(aapl_data.columns)
    tech_columns = [
        name for name in aapl_with_tech.columns if name not in original_columns
    ]
    print(f"\nTechnical indicators created: {tech_columns[:10]}...")

    # Preview a handful of indicators next to the closing price
    display_cols = [
        'Close', 'SMA_20', 'EMA_12', 'RSI_14', 'MACD', 'BB_Upper', 'BB_Lower',
    ]
    print("\nSample technical indicators (last 5 days):")
    print(aapl_with_tech[display_cols].tail())

#  Price and Volume Features


In [None]:
print("💰 Creating price and volume features...")

if 'AAPL' in stock_data:
    # Each step is fed the frame produced by the previous step
    aapl_with_price = create_price_features(aapl_with_tech)
    aapl_with_volume = create_volume_features(aapl_with_price)
    aapl_with_returns = create_return_features(aapl_with_volume)

    # Report the column count after every stage
    stage_frames = (
        ("price", aapl_with_price),
        ("volume", aapl_with_volume),
        ("return", aapl_with_returns),
    )
    for stage, frame in stage_frames:
        print(f"After {stage} features: {len(frame.columns)} columns")

    # Preview a selection of the newly added price/volume/return columns
    price_vol_features = [
        'High_Low_Ratio', 'Daily_Range_Pct', 'Gap_Pct',
        'Volume_Ratio_20', 'Returns_1d', 'Volatility_10d',
    ]
    print("\nSample price/volume features (last 5 days):")
    print(aapl_with_returns[price_vol_features].tail())

#  Lag and Rolling Features

In [None]:
print("⏳ Creating lag and rolling window features...")


def _columns_containing(frame, fragment):
    """Return the column names of *frame* that contain *fragment*."""
    return [name for name in frame.columns if fragment in name]


if 'AAPL' in stock_data:
    # Lagged copies of price, volume and 1-day returns
    lag_source_columns = ['Close', 'Volume', 'Returns_1d']
    aapl_with_lags = create_lag_features(
        aapl_with_returns,
        lags=[1, 2, 3, 5, 10],
        columns=lag_source_columns,
    )

    # Rolling-window statistics come from features.py as well
    from features import create_rolling_features
    rolling_stats = ['mean', 'std', 'min', 'max']
    aapl_final = create_rolling_features(
        aapl_with_lags,
        windows=[5, 10, 20],
        columns=['Close', 'Volume'],
        statistics=rolling_stats,
    )

    print(f"After lag features: {len(aapl_with_lags.columns)} columns")
    print(f"Final feature count: {len(aapl_final.columns)} columns")

    # Show the first few columns of each family, matched by name fragment
    lag_features = _columns_containing(aapl_final, 'lag_')
    print(f"\nLag features created: {lag_features[:8]}...")

    rolling_features = _columns_containing(aapl_final, 'rolling_')
    print(f"\nRolling features created: {rolling_features[:8]}...")

# Complete Feature Creation for All Stocks

In [None]:
print("🏭 Creating complete feature sets for all stocks...")

# ticker -> full feature frame produced by the all-in-one builder
complete_features = {}

for ticker, raw_frame in stock_data.items():
    print(f"Processing {ticker}...")

    # One call builds technical, lag and rolling features for this ticker
    features_df = create_all_features(
        raw_frame,
        include_technical=True,
        include_lags=True,
        include_rolling=True,
        lag_periods=[1, 2, 3, 5],
        rolling_windows=[5, 10, 20],
    )

    # Tag every row with its ticker so frames stay identifiable
    features_df['Ticker'] = ticker
    complete_features[ticker] = features_df

    n_columns = len(features_df.columns)
    print(f"  ✅ {ticker}: {n_columns} features created")

print(f"\n🎯 Feature engineering completed for {len(complete_features)} stocks")

# Feature Validation and Quality Analysis

In [None]:
print("🔍 Validating feature quality...")

# ticker -> report returned by validate_features
validation_reports = {}

for ticker, features_df in complete_features.items():
    print(f"\n📊 Validating features for {ticker}:")

    report = validate_features(features_df)
    validation_reports[ticker] = report

    # Number of features whose missing-value entry is greater than zero
    missing_feature_count = sum(
        1 for n_missing in report['missing_values'].values() if n_missing > 0
    )
    infinite_values = report['infinite_values']
    constant_features = report['constant_features']

    print(f"  • Total features: {report['total_features']}")
    print(f"  • Total rows: {report['total_rows']}")
    print(f"  • Features with missing values: {missing_feature_count}")
    print(f"  • Features with infinite values: {len(infinite_values)}")
    print(f"  • Constant features: {len(constant_features)}")
    print(f"  • High correlation pairs: {len(report['high_correlation_pairs'])}")

    # Name the problematic features when there are any
    if infinite_values:
        print(f"  ⚠️ Infinite values in: {list(infinite_values.keys())}")

    if constant_features:
        print(f"  ⚠️ Constant features: {constant_features}")

# Feature Importance Groups Analysis

In [None]:
print("🎯 Analyzing feature groups...")

# Mapping of group name -> collection of feature names, as returned by
# features.py
feature_groups = get_feature_importance_groups()

print("Feature groups defined:")
for group_name, group_features in feature_groups.items():
    print(f"  • {group_name}: {len(group_features)} features")

# Check how many features of each group actually exist in the AAPL frame
if 'AAPL' in complete_features:
    aapl_features = complete_features['AAPL']
    available_features = set(aapl_features.columns)

    print("\nFeature availability analysis (AAPL example):")
    for group_name, group_features in feature_groups.items():
        available_in_group = [
            f for f in group_features if f in available_features]

        # An empty group has no meaningful coverage; report 0% instead of
        # raising ZeroDivisionError.
        group_size = len(group_features)
        coverage = (
            len(available_in_group) / group_size * 100 if group_size else 0.0
        )
        print(
            f"  • {group_name}: {len(available_in_group)}/{group_size} features ({coverage:.1f}% coverage)")

# Feature Visualization - Technical Indicators

In [None]:
print("📈 Visualizing technical indicators...")

if 'AAPL' in complete_features:
    aapl_features = complete_features['AAPL']

    # 3x2 dashboard: moving averages, RSI, MACD, Bollinger Bands,
    # volume, and returns/volatility
    fig, axes = plt.subplots(3, 2, figsize=(20, 15))

    # Price with moving averages
    ax1 = axes[0, 0]
    aapl_features[['Close', 'SMA_20', 'SMA_50',
                   'EMA_12', 'EMA_26']].plot(ax=ax1, alpha=0.8)
    ax1.set_title('AAPL: Price with Moving Averages', fontsize=14)
    ax1.set_ylabel('Price ($)')
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc='upper left')

    # RSI with the 70/30 reference lines
    ax2 = axes[0, 1]
    aapl_features['RSI_14'].plot(ax=ax2, color='purple', linewidth=2)
    ax2.axhline(y=70, color='red', linestyle='--',
                alpha=0.7, label='Overbought (70)')
    ax2.axhline(y=30, color='green', linestyle='--',
                alpha=0.7, label='Oversold (30)')
    ax2.set_title('AAPL: RSI (14-day)', fontsize=14)
    ax2.set_ylabel('RSI')
    ax2.set_ylim(0, 100)
    ax2.grid(True, alpha=0.3)
    ax2.legend()

    # MACD
    ax3 = axes[1, 0]
    # All three MACD series are drawn with matplotlib against the frame's
    # index so they share one x-axis. pandas' kind='bar' would place the bars
    # at integer positions 0..n-1 with one categorical tick label per row,
    # which does not line up with line plots drawn on the same axes.
    # NOTE(review): assumes a datetime-like index with roughly daily spacing,
    # so the default bar width (0.8 x-units) is sensible - confirm.
    macd_x = aapl_features.index
    ax3.plot(macd_x, aapl_features['MACD'], label='MACD')
    ax3.plot(macd_x, aapl_features['MACD_Signal'], label='MACD_Signal')
    ax3.bar(macd_x, aapl_features['MACD_Histogram'],
            alpha=0.3, color='gray', label='MACD_Histogram')
    ax3.set_title('AAPL: MACD Indicator', fontsize=14)
    ax3.set_ylabel('MACD')
    ax3.grid(True, alpha=0.3)
    ax3.legend()

    # Bollinger Bands with the band interior shaded
    ax4 = axes[1, 1]
    aapl_features[['Close', 'BB_Upper', 'BB_Middle', 'BB_Lower']].plot(
        ax=ax4, alpha=0.8)
    ax4.fill_between(aapl_features.index, aapl_features['BB_Upper'], aapl_features['BB_Lower'],
                     alpha=0.1, color='blue')
    ax4.set_title('AAPL: Bollinger Bands', fontsize=14)
    ax4.set_ylabel('Price ($)')
    ax4.grid(True, alpha=0.3)
    ax4.legend()

    # Volume analysis: raw volume on the left axis, 20-day ratio on the right
    ax5 = axes[2, 0]
    ax5_twin = ax5.twinx()
    aapl_features['Volume'].plot(ax=ax5, alpha=0.6, color='orange')
    aapl_features['Volume_Ratio_20'].plot(
        ax=ax5_twin, color='red', linewidth=2)
    ax5.set_title('AAPL: Volume and Volume Ratio', fontsize=14)
    ax5.set_ylabel('Volume', color='orange')
    ax5_twin.set_ylabel('Volume Ratio (20-day)', color='red')
    ax5.grid(True, alpha=0.3)

    # Returns and Volatility: returns on the left axis, volatility on the right
    ax6 = axes[2, 1]
    ax6_twin = ax6.twinx()
    aapl_features['Returns_1d'].plot(ax=ax6, alpha=0.7, color='blue')
    aapl_features['Volatility_10d'].plot(ax=ax6_twin, color='red', linewidth=2)
    ax6.set_title('AAPL: Daily Returns and 10-day Volatility', fontsize=14)
    ax6.set_ylabel('Daily Returns', color='blue')
    ax6_twin.set_ylabel('Volatility (10-day)', color='red')
    ax6.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Feature Correlation Analysis

In [None]:
print("🔗 Analyzing feature correlations...")

if 'AAPL' in complete_features:
    aapl_features = complete_features['AAPL']

    # Candidate features for the correlation study
    key_features = [
        'Close', 'SMA_20', 'RSI_14', 'MACD', 'BB_Width',
        'Volume_Ratio_20', 'Returns_1d', 'Volatility_10d',
        'High_Low_Ratio', 'Daily_Range_Pct', 'ATR_14',
    ]

    # Keep only the candidates that exist in the frame
    existing_columns = aapl_features.columns
    available_key_features = [
        name for name in key_features if name in existing_columns
    ]

    # A correlation matrix needs at least two features
    if len(available_key_features) > 1:
        corr_matrix = aapl_features[available_key_features].corr()

        # Heatmap with the diagonal and upper triangle masked out
        plt.figure(figsize=(12, 10))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(
            corr_matrix,
            mask=mask,
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": .5},
            fmt='.2f',
        )
        plt.title(
            'AAPL: Feature Correlation Matrix (Key Technical Indicators)',
            fontsize=16,
        )
        plt.tight_layout()
        plt.show()

        # Visit every unordered pair once (i < j) and keep those with |r| > 0.7
        labels = list(corr_matrix.columns)
        n_labels = len(labels)
        candidate_pairs = (
            (labels[i], labels[j], corr_matrix.iloc[i, j])
            for i in range(n_labels)
            for j in range(i + 1, n_labels)
        )
        high_corr_pairs = [
            pair for pair in candidate_pairs if abs(pair[2]) > 0.7
        ]

        if not high_corr_pairs:
            print("\n✅ No highly correlated features detected (|r| > 0.7)")
        else:
            print("\n🔍 Highly correlated feature pairs (|r| > 0.7):")
            for feat1, feat2, corr in high_corr_pairs:
                print(f"  • {feat1} ↔ {feat2}: r = {corr:.3f}")

# Feature Distributions Analysis

In [None]:
print("📊 Analyzing feature distributions...")

if 'AAPL' in complete_features:
    aapl_features = complete_features['AAPL']

    # Select various types of features for distribution analysis
    distribution_features = [
        'Returns_1d', 'RSI_14', 'Volume_Ratio_20', 'BB_Width',
        'Daily_Range_Pct', 'MACD', 'Volatility_10d', 'Williams_R'
    ]

    # Filter available features
    available_dist_features = [
        f for f in distribution_features if f in aapl_features.columns]

    if available_dist_features:
        # Clean each series once (infinities -> NaN, then drop NaN) and reuse
        # it for both the histogram and the printed statistics.
        cleaned_series = {
            feature: aapl_features[feature].replace(
                [np.inf, -np.inf], np.nan).dropna()
            for feature in available_dist_features
        }

        # Grid layout: 4 columns, as many rows as needed
        n_features = len(available_dist_features)
        n_cols = 4
        n_rows = (n_features + n_cols - 1) // n_cols

        # squeeze=False always yields a 2-D array of Axes, so flatten() gives
        # one Axes per grid slot even when the grid has a single row.
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
        axes = axes.flatten()

        for ax, feature in zip(axes, available_dist_features):
            feature_data = cleaned_series[feature]

            if len(feature_data) > 0:
                # Plot histogram
                ax.hist(feature_data, bins=50, alpha=0.7,
                        edgecolor='black', linewidth=0.5)
                ax.set_title(f'{feature} Distribution', fontsize=12)
                ax.set_xlabel(feature)
                ax.set_ylabel('Frequency')
                ax.grid(True, alpha=0.3)

                # Add statistics text in the top-left corner of the panel
                stats_text = f'Mean: {feature_data.mean():.3f}\nStd: {feature_data.std():.3f}'
                ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
                        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
            else:
                ax.text(0.5, 0.5, f'No valid data\nfor {feature}',
                        transform=ax.transAxes, ha='center', va='center')

        # Hide the grid slots that have no feature assigned
        for ax in axes[n_features:]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

        # Print distribution statistics
        print(
            f"\n📈 Distribution statistics for {len(available_dist_features)} features:")
        for feature in available_dist_features:
            feature_data = cleaned_series[feature]
            if len(feature_data) > 0:
                print(f"  • {feature:20s}: μ={feature_data.mean():8.4f}, σ={feature_data.std():8.4f}, "
                      f"min={feature_data.min():8.4f}, max={feature_data.max():8.4f}")

# Missing Values Analysis

In [None]:
print("🕳️ Analyzing missing values...")


def _label_bars(ax, bars, values, offset, template):
    """Write *template*-formatted *values* above the bars whose value is > 0."""
    for bar, value in zip(bars, values):
        if value > 0:
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + offset,
                    template.format(value), ha='center', va='bottom',
                    fontweight='bold')


# ticker -> summary of missing values in that ticker's feature frame
missing_analysis = {}

for ticker, features_df in complete_features.items():
    # Percentage of null entries per column, keeping only columns with any
    # nulls, ordered from most to least affected
    missing_pct = (features_df.isnull().sum() / len(features_df)) * 100
    missing_features = missing_pct[missing_pct > 0].sort_values(ascending=False)

    # The series is sorted descending, so its first entry is the worst column
    worst_pct = missing_features.iloc[0] if len(missing_features) > 0 else 0

    missing_analysis[ticker] = {
        'total_features': len(features_df.columns),
        'features_with_missing': len(missing_features),
        'max_missing_pct': worst_pct,
        'missing_details': missing_features.to_dict(),
    }

# Side-by-side bar charts: affected feature count and worst percentage
if missing_analysis:
    fig, axes = plt.subplots(1, 2, figsize=(18, 6))

    stocks = list(missing_analysis.keys())
    missing_counts = [
        missing_analysis[stock]['features_with_missing'] for stock in stocks
    ]
    max_missing = [
        missing_analysis[stock]['max_missing_pct'] for stock in stocks
    ]

    # Left panel: number of features with missing values per stock
    ax1 = axes[0]
    bars = ax1.bar(stocks, missing_counts, alpha=0.7, color='coral')
    ax1.set_title(
        'Number of Features with Missing Values by Stock', fontsize=14)
    ax1.set_ylabel('Features with Missing Values')
    ax1.set_xlabel('Stock Ticker')
    _label_bars(ax1, bars, missing_counts, 0.1, '{}')

    # Right panel: maximum missing percentage per stock
    ax2 = axes[1]
    bars2 = ax2.bar(stocks, max_missing, alpha=0.7, color='lightblue')
    ax2.set_title('Maximum Missing Value Percentage by Stock', fontsize=14)
    ax2.set_ylabel('Maximum Missing Percentage (%)')
    ax2.set_xlabel('Stock Ticker')
    _label_bars(ax2, bars2, max_missing, 0.5, '{:.1f}%')

    plt.tight_layout()
    plt.show()

# Text report, one section per ticker
print("\n📋 Missing values summary:")
for ticker, analysis in missing_analysis.items():
    affected = analysis['features_with_missing']
    print(f"\n{ticker}:")
    print(f"  • Total features: {analysis['total_features']}")
    print(f"  • Features with missing values: {affected}")
    if affected > 0:
        print(f"  • Maximum missing percentage: {analysis['max_missing_pct']:.2f}%")
        # First five entries of the details dict, which was built from the
        # descending-sorted series
        top_missing = dict(list(analysis['missing_details'].items())[:5])
        print(f"  • Top missing features: {top_missing}")

# Feature Data Export and Summary

In [None]:
print("💾 Exporting features and creating summary...")

# Imported once here rather than on every loop iteration
from features import clean_feature_names

# Create output directory for processed features
os.makedirs('../data/features', exist_ok=True)

# ticker -> details of the CSV written for that ticker
export_summary = {}

for ticker, features_df in complete_features.items():
    # Clean feature names before writing
    clean_features = clean_feature_names(features_df)

    # Export to CSV
    output_path = f'../data/features/{ticker}_features.csv'
    clean_features.to_csv(output_path)

    export_summary[ticker] = {
        'output_file': output_path,
        'total_features': len(clean_features.columns),
        'total_rows': len(clean_features),
        'date_range': f"{clean_features.index.min().date()} to {clean_features.index.max().date()}"
    }

    print(f"✅ {ticker}: Features exported to {output_path}")

# Create master summary report
summary_report = {
    'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'stocks_processed': list(complete_features.keys()),
    'total_stocks': len(complete_features),
    'feature_engineering_steps': [
        'Technical indicators (SMA, EMA, RSI, MACD, Bollinger Bands, etc.)',
        'Price features (ratios, ranges, gaps)',
        'Volume features (ratios, moving averages)',
        'Return features (simple, log returns, volatility)',
        'Lag features (1, 2, 3, 5 periods)',
        'Rolling window features (5, 10, 20 periods)'
    ],
    'export_summary': export_summary,
    'validation_reports': validation_reports
}

# Export summary report. An explicit encoding keeps the file independent of
# the platform default. default=str stringifies any value json cannot encode
# natively.
# NOTE(review): the validation reports presumably hold numpy/pandas values,
# which is why default=str is needed - confirm.
with open('../data/features/feature_engineering_summary.json', 'w',
          encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, default=str)

print("\n📊 Feature Engineering Summary:")
print(f"  • Stocks processed: {summary_report['total_stocks']}")
print(
    f"  • Feature engineering steps: {len(summary_report['feature_engineering_steps'])}")
print(f"  • Output files created: {len(export_summary)}")
print("  • Summary report: ../data/features/feature_engineering_summary.json")

# Next Steps and Recommendations

In [None]:
# Static closing text: what the notebook did, findings, and follow-up work
closing_summary = """
✅ COMPLETED IN THIS NOTEBOOK:
  • Loaded stock data using existing data.py functions (NO DUPLICATION)
  • Created comprehensive technical indicators
  • Generated price, volume, and return features  
  • Added lag and rolling window features
  • Validated feature quality and identified issues
  • Analyzed feature correlations and distributions
  • Exported processed features for modeling

📋 KEY FINDINGS:
  • Technical indicators successfully created for all stocks
  • Some features may have missing values due to calculation windows
  • Feature correlations identified for potential dimensionality reduction
  • All features exported to ../data/features/ directory

🚀 RECOMMENDED NEXT STEPS:
  1. Handle missing values (forward fill, interpolation, or removal)
  2. Feature selection based on correlation analysis
  3. Feature scaling/normalization for modeling
  4. Create train/validation/test splits with proper time-series considerations
  5. Begin baseline model training with processed features

📁 OUTPUT FILES:
  • ../data/features/{ticker}_features.csv - Feature datasets for each stock  
  • ../data/features/feature_engineering_summary.json - Complete analysis summary

🔧 READY FOR MODELING:
  The features are now ready for the next phase - baseline model training.
  Use the exported feature files as input to your modeling pipeline.
"""

# Heading, body, then the final completion line
print("\n🎯 Feature Engineering Complete - Next Steps:")
print(closing_summary)
print("🏁 Feature engineering notebook execution completed successfully!")