## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

# Technical indicators library ('ta'); installed on demand when it is missing.
try:
    import ta
    print("‚úÖ 'ta' library available")
except ImportError:
    print("‚ö†Ô∏è  Installing 'ta' library...")
    import subprocess
    import sys
    # Run pip through the interpreter that is executing this code so the
    # package is installed into the environment we then import from, rather
    # than into whichever `pip` executable happens to be first on PATH.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'ta'])
    import ta

# Set plotting style: matplotlib style sheet plus the seaborn colour palette
# NOTE(review): the 'seaborn-v0_8-*' style name presumably depends on the
# installed matplotlib version — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirm the setup finished and record the wall-clock time of this run
print("‚úÖ Libraries imported successfully!")
print(f"üìÖ Feature Engineering Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 2. Configuration

In [None]:
# Directories (relative paths, resolved against the current working directory)
PROCESSED_DATA_DIR = '../../data/processed'
FEATURES_DATA_DIR = '../../data/features'
FIGURES_DIR = '../../outputs/figures'
# Create both output directories up front: the feature CSV export and the
# plt.savefig calls later in this file fail if their target directory is
# missing.
os.makedirs(FEATURES_DATA_DIR, exist_ok=True)
os.makedirs(FIGURES_DIR, exist_ok=True)

# Asset universe: 25 tickers grouped by asset class
TICKERS = dict(
    indices=['GSPC', 'IXIC', 'DJI', 'FTSE', 'N225', 'HSI', 'GDAXI'],
    stocks=['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'JPM', 'XOM', 'JNJ', 'V', 'WMT', 'PG'],
    crypto=['BTC_USD', 'ETH_USD', 'BNB_USD', 'SOL_USD', 'ADA_USD'],
    commodities=['GC=F', 'CL=F'],
)

# Flat list of every ticker, in the order the groups are declared above
ALL_TICKERS = sum(TICKERS.values(), [])

# Feature engineering parameters
WINDOWS = [5, 10, 20, 50]  # Rolling-window lengths, in rows
LAG_PERIODS = [1, 5, 10, 20]  # Shift distances for lagged features, in rows

# Echo the configuration so the run output records the paths, the number of
# assets and the window/lag parameters that were used.
print(f"üìÇ Processed data directory: {PROCESSED_DATA_DIR}")
print(f"üìÇ Features data directory: {FEATURES_DATA_DIR}")
print(f"üìä Total assets: {len(ALL_TICKERS)}")
print(f"üîß Window sizes: {WINDOWS}")
print(f"üîß Lag periods: {LAG_PERIODS}")

## 3. Load Processed Data

In [None]:
# Read every ticker's processed CSV into `data`, keyed by ticker symbol.
# A ticker whose file cannot be read is reported and skipped, so one bad
# file does not stop the remaining assets from loading.
print("üì• Loading processed data...\n")

data = {}

for ticker in ALL_TICKERS:
    filepath = os.path.join(PROCESSED_DATA_DIR, f"{ticker}_processed.csv")
    try:
        # First CSV column becomes the index and is parsed as dates
        df = pd.read_csv(filepath, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"‚ùå {ticker:10s} - Error: {str(e)}")
    else:
        data[ticker] = df
        print(f"‚úÖ {ticker:10s} - {len(df):,} rows")

print(f"\n‚úÖ Loaded {len(data)} datasets")

## 4. Technical Indicators Functions

In [None]:
def add_momentum_indicators(df):
    """
    Append momentum indicator columns computed with the `ta` library.

    The input is copied first, so the caller's frame is not modified.
    Reads the 'Close', 'High' and 'Low' columns and adds RSI_14, MACD,
    MACD_Signal, MACD_Diff, Stoch_K, Stoch_D, ROC_10 and Williams_R.

    Parameters:
    -----------
    df : pd.DataFrame
        Price data with 'Close', 'High' and 'Low' columns.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the momentum columns appended.
    """
    enriched = df.copy()
    close = enriched['Close']
    high = enriched['High']
    low = enriched['Low']

    # Relative Strength Index over a 14-row window
    rsi = ta.momentum.RSIIndicator(close=close, window=14)
    enriched['RSI_14'] = rsi.rsi()

    # MACD line, signal line and their difference (library default settings)
    macd = ta.trend.MACD(close=close)
    enriched['MACD'] = macd.macd()
    enriched['MACD_Signal'] = macd.macd_signal()
    enriched['MACD_Diff'] = macd.macd_diff()

    # Stochastic oscillator and its signal line (library default settings)
    stochastic = ta.momentum.StochasticOscillator(high=high, low=low, close=close)
    enriched['Stoch_K'] = stochastic.stoch()
    enriched['Stoch_D'] = stochastic.stoch_signal()

    # Rate of change over a 10-row window
    roc = ta.momentum.ROCIndicator(close=close, window=10)
    enriched['ROC_10'] = roc.roc()

    # Williams %R with a 14-row lookback
    williams = ta.momentum.WilliamsRIndicator(high=high, low=low, close=close, lbp=14)
    enriched['Williams_R'] = williams.williams_r()

    return enriched

print("‚úÖ Momentum indicators function defined")

In [None]:
def add_trend_indicators(df):
    """
    Append trend indicator columns computed with the `ta` library.

    The input is copied first, so the caller's frame is not modified.
    Reads the 'Close', 'High' and 'Low' columns and adds SMA_5/10/20/50,
    EMA_5/10/20, ADX, ADX_Pos, ADX_Neg, Ichimoku_A and Ichimoku_B.

    Parameters:
    -----------
    df : pd.DataFrame
        Price data with 'Close', 'High' and 'Low' columns.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the trend columns appended.
    """
    enriched = df.copy()
    close = enriched['Close']
    high = enriched['High']
    low = enriched['Low']

    # Simple moving averages of the close
    for span in (5, 10, 20, 50):
        sma = ta.trend.SMAIndicator(close=close, window=span)
        enriched[f'SMA_{span}'] = sma.sma_indicator()

    # Exponential moving averages of the close
    for span in (5, 10, 20):
        ema = ta.trend.EMAIndicator(close=close, window=span)
        enriched[f'EMA_{span}'] = ema.ema_indicator()

    # Average Directional Index with its positive/negative components
    adx = ta.trend.ADXIndicator(high=high, low=low, close=close)
    enriched['ADX'] = adx.adx()
    enriched['ADX_Pos'] = adx.adx_pos()
    enriched['ADX_Neg'] = adx.adx_neg()

    # Ichimoku lines A and B (built from high/low only)
    ichimoku = ta.trend.IchimokuIndicator(high=high, low=low)
    enriched['Ichimoku_A'] = ichimoku.ichimoku_a()
    enriched['Ichimoku_B'] = ichimoku.ichimoku_b()

    return enriched

print("‚úÖ Trend indicators function defined")

In [None]:
def add_volatility_indicators(df, periods_per_year=252):
    """
    Add volatility technical indicators.

    The input is copied first, so the caller's frame is not modified.
    Reads the 'Close', 'High', 'Low' and 'Returns' columns and adds
    BB_High, BB_Low, BB_Mid, BB_Width, BB_Pct, ATR_14, Keltner_High,
    Keltner_Low, Keltner_Mid, HV_20 and HV_50.

    Parameters:
    -----------
    df : pd.DataFrame
        Price data with 'Close', 'High', 'Low' and 'Returns' columns.
    periods_per_year : int or float, default 252
        Number of rows per year used to annualize the rolling standard
        deviation of returns. The default of 252 matches the previous
        hard-coded behavior; pass a different value for series that are
        sampled on a different calendar.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the volatility columns appended.
    """
    df = df.copy()

    # Bollinger Bands: 20-row window, 2 standard deviations
    bollinger = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2)
    df['BB_High'] = bollinger.bollinger_hband()
    df['BB_Low'] = bollinger.bollinger_lband()
    df['BB_Mid'] = bollinger.bollinger_mavg()
    df['BB_Width'] = bollinger.bollinger_wband()
    df['BB_Pct'] = bollinger.bollinger_pband()

    # Average True Range (ATR) over a 14-row window
    df['ATR_14'] = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'],
                                                    close=df['Close'], window=14).average_true_range()

    # Keltner Channel (library default settings)
    keltner = ta.volatility.KeltnerChannel(high=df['High'], low=df['Low'], close=df['Close'])
    df['Keltner_High'] = keltner.keltner_channel_hband()
    df['Keltner_Low'] = keltner.keltner_channel_lband()
    df['Keltner_Mid'] = keltner.keltner_channel_mband()

    # Historical Volatility: rolling std of returns scaled by
    # sqrt(periods_per_year) to express it on an annual basis
    annualization = np.sqrt(periods_per_year)
    returns = df['Returns']
    df['HV_20'] = returns.rolling(window=20).std() * annualization
    df['HV_50'] = returns.rolling(window=50).std() * annualization

    return df

print("‚úÖ Volatility indicators function defined")

In [None]:
def add_volume_indicators(df):
    """
    Add volume technical indicators.

    The input is copied first, so the caller's frame is not modified.
    Reads the 'Close', 'High', 'Low' and 'Volume' columns and adds OBV,
    Volume_SMA_10, Volume_SMA_20, Volume_ROC, MFI and CMF.

    Parameters:
    -----------
    df : pd.DataFrame
        Price data with 'Close', 'High', 'Low' and 'Volume' columns.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the volume columns appended. Volume_ROC is NaN
        (never +/-inf) where the volume 10 rows earlier was zero.
    """
    df = df.copy()

    # On-Balance Volume (OBV)
    df['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume']).on_balance_volume()

    # Volume Moving Averages
    df['Volume_SMA_10'] = df['Volume'].rolling(window=10).mean()
    df['Volume_SMA_20'] = df['Volume'].rolling(window=20).mean()

    # Volume Rate of Change over 10 rows. Dividing by a zero volume yields
    # +/-inf, which is not counted by isnull() and breaks scaling/model
    # fitting downstream, so those entries are recorded as missing instead.
    volume_roc = df['Volume'].pct_change(periods=10)
    df['Volume_ROC'] = volume_roc.replace([np.inf, -np.inf], np.nan)

    # Money Flow Index (MFI)
    df['MFI'] = ta.volume.MFIIndicator(high=df['High'], low=df['Low'],
                                        close=df['Close'], volume=df['Volume']).money_flow_index()

    # Chaikin Money Flow
    df['CMF'] = ta.volume.ChaikinMoneyFlowIndicator(high=df['High'], low=df['Low'],
                                                      close=df['Close'], volume=df['Volume']).chaikin_money_flow()

    return df

print("‚úÖ Volume indicators function defined")

## 5. Advanced Features Functions

In [None]:
def add_lagged_features(df, lag_periods=(1, 5, 10, 20)):
    """
    Add lagged returns, prices and volumes.

    The input is copied first, so the caller's frame is not modified.
    For every lag L, adds Returns_Lag_L, Close_Lag_L and Volume_Lag_L, each
    being the source column shifted down by L rows (the first L rows of a
    lagged column are therefore NaN).

    Parameters:
    -----------
    df : pd.DataFrame
        Data with 'Returns', 'Close' and 'Volume' columns.
    lag_periods : iterable of int, default (1, 5, 10, 20)
        Shift distances, in rows. The default is an immutable tuple so it
        cannot be altered between calls.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the lagged columns appended.
    """
    df = df.copy()

    for lag in lag_periods:
        for source in ('Returns', 'Close', 'Volume'):
            df[f'{source}_Lag_{lag}'] = df[source].shift(lag)

    return df

print("‚úÖ Lagged features function defined")

In [None]:
def add_rolling_statistics(df, windows=(5, 10, 20, 50)):
    """
    Add rolling statistical features of the 'Returns' column.

    The input is copied first, so the caller's frame is not modified.
    For every window W, adds Returns_Mean_W, Returns_Std_W, Returns_Skew_W,
    Returns_Kurt_W, Returns_Min_W, Returns_Max_W, Returns_Q25_W and
    Returns_Q75_W.

    Parameters:
    -----------
    df : pd.DataFrame
        Data with a 'Returns' column.
    windows : iterable of int, default (5, 10, 20, 50)
        Rolling window lengths, in rows. The default is an immutable tuple
        so it cannot be altered between calls.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the rolling-statistic columns appended.
    """
    df = df.copy()
    returns = df['Returns']

    for window in windows:
        # Build the rolling view once and reuse it for every statistic
        rolling = returns.rolling(window=window)

        # Rolling mean and std
        df[f'Returns_Mean_{window}'] = rolling.mean()
        df[f'Returns_Std_{window}'] = rolling.std()

        # Rolling skewness and kurtosis
        df[f'Returns_Skew_{window}'] = rolling.skew()
        df[f'Returns_Kurt_{window}'] = rolling.kurt()

        # Rolling min and max
        df[f'Returns_Min_{window}'] = rolling.min()
        df[f'Returns_Max_{window}'] = rolling.max()

        # Rolling quantiles
        df[f'Returns_Q25_{window}'] = rolling.quantile(0.25)
        df[f'Returns_Q75_{window}'] = rolling.quantile(0.75)

    return df

print("‚úÖ Rolling statistics function defined")

In [None]:
def add_fourier_features(df, n_components=5, period=252):
    """
    Add Fourier (sine/cosine) features to capture cyclical patterns.

    The input is copied first, so the caller's frame is not modified.
    The phase is driven by row position (0, 1, 2, ...), not by the dates in
    the index. For each harmonic i in 1..n_components, adds Fourier_Sin_i
    and Fourier_Cos_i, which complete i cycles every `period` rows.

    Parameters:
    -----------
    df : pd.DataFrame
        Any frame; only its length is used.
    n_components : int, default 5
        Number of harmonics to generate.
    period : int or float, default 252
        Length of the base cycle in rows. The default of 252 (trading days)
        matches the previous hard-coded behavior.

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with 2 * n_components Fourier columns appended.

    Raises:
    -------
    ValueError
        If `period` is not strictly positive.
    """
    if period <= 0:
        raise ValueError(f"period must be positive, got {period!r}")

    df = df.copy()

    # Create time index (rows since start)
    time_idx = np.arange(len(df))

    for i in range(1, n_components + 1):
        # Same angle feeds both the sine and the cosine term
        angle = 2 * np.pi * i * time_idx / period
        df[f'Fourier_Sin_{i}'] = np.sin(angle)
        df[f'Fourier_Cos_{i}'] = np.cos(angle)

    return df

print("‚úÖ Fourier features function defined")

In [None]:
def add_time_features(df):
    """
    Append calendar features derived from the frame's datetime index.

    The input is copied first, so the caller's frame is not modified.
    Adds, in this order: DayOfWeek (0=Monday ... 6=Sunday), Month (1-12),
    Quarter (1-4), Year, DayOfMonth, IsMonthEnd (0/1) and IsQuarterEnd
    (0/1).

    Parameters:
    -----------
    df : pd.DataFrame
        Frame whose index exposes datetime attributes (e.g. a
        DatetimeIndex).

    Returns:
    --------
    pd.DataFrame
        Copy of `df` with the calendar columns appended.
    """
    enriched = df.copy()
    stamps = enriched.index

    # Column name -> values, listed in the order the columns are appended
    calendar_parts = {
        'DayOfWeek': stamps.dayofweek,
        'Month': stamps.month,
        'Quarter': stamps.quarter,
        'Year': stamps.year,
        'DayOfMonth': stamps.day,
        # Boolean flags are stored as 0/1 integers
        'IsMonthEnd': stamps.is_month_end.astype(int),
        'IsQuarterEnd': stamps.is_quarter_end.astype(int),
    }
    for column, values in calendar_parts.items():
        enriched[column] = values

    return enriched

print("‚úÖ Time features function defined")

## 6. Apply Feature Engineering to All Assets

In [None]:
# Run every feature-engineering step on each loaded asset
print("üîß Applying feature engineering to all assets...\n")

features_data = {}
feature_counts = {}

# Steps in application order: technical indicators first, then the
# advanced features. Each step takes a frame and returns an enriched copy.
feature_steps = (
    add_momentum_indicators,
    add_trend_indicators,
    add_volatility_indicators,
    add_volume_indicators,
    lambda frame: add_lagged_features(frame, LAG_PERIODS),
    lambda frame: add_rolling_statistics(frame, WINDOWS),
    lambda frame: add_fourier_features(frame, n_components=5),
    add_time_features,
)

for ticker, df in data.items():
    print(f"Processing {ticker}...", end=' ')

    try:
        # Start with original data
        df_features = df.copy()
        original_cols = len(df_features.columns)

        for step in feature_steps:
            df_features = step(df_features)

        # Store the result only once every step has succeeded
        new_cols = len(df_features.columns)
        features_data[ticker] = df_features
        feature_counts[ticker] = new_cols - original_cols

        print(f"‚úÖ Added {new_cols - original_cols} features (Total: {new_cols} columns)")

    except Exception as e:
        # A failing asset is reported and skipped; the others still run
        print(f"‚ùå Error: {str(e)}")

print(f"\n‚úÖ Feature engineering complete for {len(features_data)} assets")
print(f"üìä Average features added: {np.mean(list(feature_counts.values())):.0f} per asset")

## 7. Feature Summary and Statistics

In [None]:
# Print a feature overview for one representative asset
sample_ticker = 'GSPC'

if sample_ticker in features_data:
    df_sample = features_data[sample_ticker]
    column_names = df_sample.columns
    column_total = len(column_names)

    print(f"üìä Feature Summary for {sample_ticker}:\n")
    print(f"Total features: {column_total}")
    print(f"Total rows: {len(df_sample)}")

    # Category sizes; the "~" entries are hand-written approximations
    category_lines = (
        "\nFeature categories:",
        "  - Original OHLCV: 5",
        "  - Basic returns: 5",
        "  - Momentum indicators: ~10",
        "  - Trend indicators: ~15",
        "  - Volatility indicators: ~10",
        "  - Volume indicators: ~8",
        f"  - Lagged features: {len(LAG_PERIODS) * 3}",
        f"  - Rolling statistics: {len(WINDOWS) * 8}",
        "  - Fourier features: 10",
        "  - Time features: 7",
    )
    for line in category_lines:
        print(line)

    print("\nFirst 10 feature names:")
    for position, col in enumerate(column_names[:10], start=1):
        print(f"  {position}. {col}")

    # Numbering continues from the position of the tenth-from-last column
    print("\nLast 10 feature names:")
    for position, col in enumerate(column_names[-10:], start=column_total - 9):
        print(f"  {position}. {col}")

In [None]:
# Report the share of missing cells in each asset's feature frame
print("üîç Missing Values Analysis:\n")

for ticker, df in features_data.items():
    missing_cells = df.isnull().sum().sum()
    # df.size is rows * columns, i.e. the total number of cells
    missing_pct = (missing_cells / df.size) * 100
    print(f"{ticker:10s}: {missing_pct:5.2f}% missing values")

print("\n‚ö†Ô∏è  Note: Missing values expected due to rolling windows and lags")

## 8. Visualize Key Features

In [None]:
# Plot four stacked panels of technical indicators for the S&P 500 ('GSPC')
if 'GSPC' in features_data:
    df_viz = features_data['GSPC'].copy()
    dates = df_viz.index

    fig, axes = plt.subplots(4, 1, figsize=(18, 14))
    price_ax, rsi_ax, macd_ax, volume_ax = axes

    # Panel 1: close price with the three Bollinger band lines
    price_ax.plot(dates, df_viz['Close'], label='Close', linewidth=1.5, color='blue')
    for band_col, band_label, band_color in (
        ('BB_High', 'BB High', 'red'),
        ('BB_Low', 'BB Low', 'green'),
        ('BB_Mid', 'BB Mid', 'orange'),
    ):
        price_ax.plot(dates, df_viz[band_col], label=band_label,
                      linewidth=1, linestyle='--', color=band_color)
    price_ax.set_title('S&P 500 with Bollinger Bands', fontsize=14, fontweight='bold')
    price_ax.set_ylabel('Price ($)', fontsize=12)
    price_ax.legend(loc='best')
    price_ax.grid(True, alpha=0.3)

    # Panel 2: RSI with the 70/30 reference levels
    rsi_ax.plot(dates, df_viz['RSI_14'], label='RSI (14)', linewidth=1.5, color='purple')
    for level, level_color, level_label in (
        (70, 'red', 'Overbought (70)'),
        (30, 'green', 'Oversold (30)'),
    ):
        rsi_ax.axhline(y=level, color=level_color, linestyle='--', alpha=0.5, label=level_label)
    rsi_ax.set_title('Relative Strength Index (RSI)', fontsize=14, fontweight='bold')
    rsi_ax.set_ylabel('RSI', fontsize=12)
    rsi_ax.legend(loc='best')
    rsi_ax.grid(True, alpha=0.3)

    # Panel 3: MACD line, signal line and histogram bars
    macd_ax.plot(dates, df_viz['MACD'], label='MACD', linewidth=1.5, color='blue')
    macd_ax.plot(dates, df_viz['MACD_Signal'], label='Signal', linewidth=1.5, color='red')
    macd_ax.bar(dates, df_viz['MACD_Diff'], label='Histogram', alpha=0.3, color='gray')
    macd_ax.set_title('MACD (Moving Average Convergence Divergence)', fontsize=14, fontweight='bold')
    macd_ax.set_ylabel('MACD', fontsize=12)
    macd_ax.legend(loc='best')
    macd_ax.grid(True, alpha=0.3)

    # Panel 4: volume bars on the left axis, OBV on a twin right axis
    obv_ax = volume_ax.twinx()
    volume_ax.bar(dates, df_viz['Volume'], alpha=0.3, color='blue', label='Volume')
    obv_ax.plot(dates, df_viz['OBV'], color='red', linewidth=1.5, label='OBV')
    volume_ax.set_title('Volume and On-Balance Volume (OBV)', fontsize=14, fontweight='bold')
    volume_ax.set_xlabel('Date', fontsize=12)
    volume_ax.set_ylabel('Volume', fontsize=12, color='blue')
    obv_ax.set_ylabel('OBV', fontsize=12, color='red')
    volume_ax.legend(loc='upper left')
    obv_ax.legend(loc='upper right')
    volume_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    figure_path = os.path.join(FIGURES_DIR, '04_technical_indicators.png')
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úÖ Technical indicators visualization saved")

## 9. Feature Correlation Analysis

In [None]:
# Correlation heatmap and strongly-correlated pairs for the S&P 500 ('GSPC')
if 'GSPC' in features_data:
    df_corr = features_data['GSPC'].copy()

    # Hand-picked subset of features to keep the heatmap readable
    important_features = ['Close', 'Returns', 'RSI_14', 'MACD', 'BB_Width', 'ATR_14',
                          'ADX', 'Volume', 'OBV', 'HV_20', 'SMA_20', 'EMA_20']

    # Keep only the features actually present in the frame
    available_features = [name for name in important_features if name in df_corr.columns]

    # Pairwise correlation of the selected columns
    corr_matrix = df_corr[available_features].corr()

    # Heatmap on a fixed [-1, 1] colour scale centred at 0
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, vmin=-1, vmax=1, square=True, ax=ax)
    ax.set_title('Feature Correlation Matrix (S&P 500)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    heatmap_path = os.path.join(FIGURES_DIR, '04_feature_correlations.png')
    plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úÖ Feature correlation visualization saved")

    # List each unordered pair once (upper triangle, diagonal excluded)
    print("\nüîó Highly Correlated Feature Pairs (|corr| > 0.8):")
    labels = corr_matrix.columns
    for i, first in enumerate(labels):
        for j in range(i + 1, len(labels)):
            strength = corr_matrix.iloc[i, j]
            # Negated test so NaN correlations are skipped as well
            if not abs(strength) > 0.8:
                continue
            print(f"{first:15s} <-> {labels[j]:15s}: {strength:6.3f}")

## 10. Save Engineered Features

In [None]:
# Write one '<ticker>_features.csv' file per asset into FEATURES_DATA_DIR
print("üíæ Saving engineered features...\n")

for ticker in features_data:
    filepath = os.path.join(FEATURES_DATA_DIR, f"{ticker}_features.csv")
    features_data[ticker].to_csv(filepath)
    print(f"‚úÖ {ticker:10s} ‚Üí {filepath}")

print(f"\n‚úÖ All features saved to {FEATURES_DATA_DIR}")

In [None]:
# Build a one-row-per-asset summary table and save it as CSV
summary_data = []

for ticker, df in features_data.items():
    column_count = len(df.columns)
    row_count = len(df)
    # Total number of missing cells across the whole frame
    missing_total = df.isnull().sum().sum()
    first_date = df.index.min()
    last_date = df.index.max()

    summary_data.append({
        'Ticker': ticker,
        'Total_Features': column_count,
        'Total_Rows': row_count,
        'Missing_Values': missing_total,
        'Missing_Pct': (missing_total / (row_count * column_count)) * 100,
        'Start_Date': first_date.strftime('%Y-%m-%d'),
        'End_Date': last_date.strftime('%Y-%m-%d')
    })

summary_df = pd.DataFrame(summary_data)
summary_path = os.path.join(FEATURES_DATA_DIR, '_features_summary.csv')
summary_df.to_csv(summary_path, index=False)

print("\nüìä Feature Engineering Summary:")
print(summary_df.to_string(index=False))
print(f"\nüíæ Summary saved to {FEATURES_DATA_DIR}/_features_summary.csv")

## 11. Summary and Next Steps

In [None]:
# Closing report: completed work, generated files, statistics and next steps
banner = "=" * 80

print(banner)
print("üìä FEATURE ENGINEERING - SUMMARY")
print(banner)

print("\n‚úÖ COMPLETED TASKS:")
for task_line in (
    "   1. ‚úÖ Technical indicators (Momentum, Trend, Volatility, Volume)",
    "   2. ‚úÖ Lagged features (autoregressive components)",
    "   3. ‚úÖ Rolling statistics (mean, std, skew, kurtosis)",
    "   4. ‚úÖ Fourier features (cyclical patterns)",
    "   5. ‚úÖ Time features (day, month, quarter, etc.)",
):
    print(task_line)

print("\nüìÅ FILES GENERATED:")
print(f"   - {len(features_data)} feature files in {FEATURES_DATA_DIR}/")
print("   - Feature summary: _features_summary.csv")
print("   - Visualizations: 04_technical_indicators.png, 04_feature_correlations.png")

# Column count of every engineered frame, averaged for the statistics line
column_counts = [len(frame.columns) for frame in features_data.values()]

print("\nüìä STATISTICS:")
print(f"   - Total assets: {len(features_data)}")
print(f"   - Average features per asset: {np.mean(column_counts):.0f}")
print("   - Feature categories: 6 (Technical, Lagged, Rolling, Fourier, Time, Original)")

print("\nüîë KEY FEATURES ADDED:")
for feature_line in (
    "   - Momentum: RSI, MACD, Stochastic, ROC, Williams %R",
    "   - Trend: SMA, EMA, ADX, Ichimoku",
    "   - Volatility: Bollinger Bands, ATR, Keltner Channel, Historical Vol",
    "   - Volume: OBV, MFI, CMF, Volume Moving Averages",
    "   - Advanced: Lagged returns, Rolling stats, Fourier, Time features",
):
    print(feature_line)

print("\nüéØ NEXT STEPS:")
for step_line in (
    "   1. Baseline models (ARIMA, LSTM, Prophet)",
    "   2. Feature selection and dimensionality reduction",
    "   3. Generative model development (TimeGAN, Diffusion)",
    "   4. Model training and evaluation",
):
    print(step_line)

print("\n" + banner)
print("‚úÖ Feature engineering complete! Ready for modeling.")
print(banner)