# Financial ML System - Feature Engineering

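This notebook computes technical indicators, derives forward-looking market-regime labels, saves the resulting feature set for model training, and exports the indicator code as a reusable module.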

## 1. Setup

In [None]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session (this also hides pandas/NumPy
# deprecation and runtime warnings raised by later cells).
warnings.filterwarnings('ignore')

# Hard-coded checkout location of the project.
# NOTE(review): the /content prefix looks like a Google Colab layout - confirm before running elsewhere.
PROJECT_ROOT = Path('/content/financial-ml-system')
# Must run before the `src.*` imports below so the project package can be resolved.
sys.path.insert(0, str(PROJECT_ROOT))

# Project-local helpers: data directory, regime label constants and the config accessor.
from src.utils.constants import DATA_DIR, REGIME_BEAR, REGIME_NEUTRAL, REGIME_BULL, REGIME_NAMES
from src.utils.config_loader import config

print("Setup complete")

## 2. Load Processed Data

In [None]:
# Pick the instrument to process; 'SPY' is passed as the fallback value.
ticker = config.get('data.default_ticker', 'SPY')
processed_dir = DATA_DIR / 'processed'
data_file = processed_dir / f"{ticker}_processed.csv"

# Abort early with a pointer to the upstream notebook when its output is missing.
if not data_file.exists():
    raise FileNotFoundError(f"Run 01_data_pipeline.ipynb first to create {data_file}")

# The first CSV column becomes the index and is parsed as dates.
data = pd.read_csv(data_file, parse_dates=True, index_col=0)
print(f"Loaded {len(data)} rows")
data.head()

## 3. Technical Indicators Module

In [None]:
class TechnicalIndicators:
    """Stateless collection of classic technical-analysis indicators.

    Every method is a ``@staticmethod`` that takes pandas Series sharing one
    index and returns Series aligned to that index. Rolling-window indicators
    are NaN until a full window of observations is available.
    """

    @staticmethod
    def sma(data: pd.Series, period: int) -> pd.Series:
        """Return the simple moving average of *data* over *period* rows."""
        return data.rolling(window=period).mean()

    @staticmethod
    def ema(data: pd.Series, period: int) -> pd.Series:
        """Return the exponential moving average with span *period*."""
        return data.ewm(span=period, adjust=False).mean()

    @staticmethod
    def rsi(data: pd.Series, period: int = 14) -> pd.Series:
        """Return the Relative Strength Index using simple rolling averages.

        Args:
            data: Price series.
            period: Number of price changes averaged per value.

        Returns:
            RSI in [0, 100]. The first ``period`` rows are NaN because
            ``period`` real price changes are required. A window with no
            losses yields 100; a completely flat window (no gains and no
            losses) yields NaN.
        """
        delta = data.diff()
        # clip() keeps NaN, so the undefined first change is not silently
        # counted as a zero-sized move inside the first window.
        gain = delta.clip(lower=0).rolling(window=period).mean()
        loss = delta.clip(upper=0).abs().rolling(window=period).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    @staticmethod
    def macd(data: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9) -> tuple:
        """Return ``(macd_line, signal_line, histogram)`` for *data*.

        The MACD line is the fast EMA minus the slow EMA, the signal line is
        an EMA of the MACD line, and the histogram is their difference.
        """
        ema_fast = data.ewm(span=fast, adjust=False).mean()
        ema_slow = data.ewm(span=slow, adjust=False).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal, adjust=False).mean()
        histogram = macd_line - signal_line
        return macd_line, signal_line, histogram

    @staticmethod
    def bollinger_bands(data: pd.Series, period: int = 20, std: float = 2) -> tuple:
        """Return ``(upper, middle, lower)`` Bollinger Bands.

        The middle band is the *period*-row SMA; the outer bands sit *std*
        rolling standard deviations above and below it.
        """
        window = data.rolling(window=period)
        middle = window.mean()
        band_offset = window.std() * std
        return middle + band_offset, middle, middle - band_offset

    @staticmethod
    def atr(high: pd.Series, low: pd.Series, close: pd.Series, period: int = 14) -> pd.Series:
        """Return the Average True Range as a simple rolling mean.

        The true range is the largest of high-low, |high - previous close|
        and |low - previous close|. On the first row the previous close is
        missing, so the true range falls back to high-low.
        """
        prev_close = close.shift()
        ranges = pd.concat(
            [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
            axis=1,
        )
        true_range = ranges.max(axis=1)
        return true_range.rolling(window=period).mean()

    @staticmethod
    def stochastic(high: pd.Series, low: pd.Series, close: pd.Series,
                   period: int = 14, d_period: int = 3) -> tuple:
        """Return the Stochastic Oscillator as ``(%K, %D)``.

        Args:
            high, low, close: Price series sharing one index.
            period: Look-back window for the high/low range.
            d_period: Window of the simple moving average that smooths %K
                into %D (defaults to the conventional 3).

        Returns:
            ``(k, d)``. %K is undefined where the look-back range is zero
            (highest high equals lowest low), because of the division by zero.
        """
        lowest_low = low.rolling(window=period).min()
        highest_high = high.rolling(window=period).max()
        k = 100 * ((close - lowest_low) / (highest_high - lowest_low))
        d = k.rolling(window=d_period).mean()
        return k, d

    @staticmethod
    def obv(close: pd.Series, volume: pd.Series) -> pd.Series:
        """Return On-Balance Volume.

        Volume is added on up-closes, subtracted on down-closes and ignored
        on unchanged closes; the first row contributes 0.
        """
        signed_volume = np.sign(close.diff()) * volume
        return signed_volume.fillna(0).cumsum()

print("TechnicalIndicators class defined")

## 4. Calculate All Features

In [None]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* extended with technical-indicator columns.

    Reads the 'Close', 'High', 'Low', 'Volume' and 'Returns' columns of *df*.
    SMA, RSI, Bollinger and ATR windows come from the project config; the
    remaining windows are fixed in the code below. The input frame is not
    modified.
    """
    features = df.copy()
    indicators = TechnicalIndicators()

    print("Calculating technical indicators...")

    close = df['Close']

    # Trend: one SMA plus a price/SMA ratio for every configured period.
    for period in config.get('features.sma_periods', [5, 20, 50]):
        moving_avg = indicators.sma(close, period)
        features[f'SMA_{period}'] = moving_avg
        features[f'SMA_Ratio_{period}'] = close / moving_avg

    # Momentum oscillator.
    features['RSI'] = indicators.rsi(close, config.get('features.rsi_period', 14))

    # MACD line, signal line and histogram (indicator defaults).
    for column, series in zip(('MACD', 'MACD_Signal', 'MACD_Hist'), indicators.macd(close)):
        features[column] = series

    # Bollinger envelope plus its relative width and the price position inside it.
    band_upper, band_middle, band_lower = indicators.bollinger_bands(
        close, config.get('features.bb_period', 20)
    )
    band_span = band_upper - band_lower
    features['BB_Upper'] = band_upper
    features['BB_Middle'] = band_middle
    features['BB_Lower'] = band_lower
    features['BB_Width'] = band_span / band_middle
    features['BB_Position'] = (close - band_lower) / band_span

    # Average True Range, absolute and relative to price.
    high, low = df['High'], df['Low']
    true_range_avg = indicators.atr(high, low, close, config.get('features.atr_period', 14))
    features['ATR'] = true_range_avg
    features['ATR_Pct'] = true_range_avg / close

    # Stochastic oscillator (indicator defaults).
    for column, series in zip(('Stoch_K', 'Stoch_D'), indicators.stochastic(high, low, close)):
        features[column] = series

    # Volume: 20-row average, current volume relative to it, and OBV.
    volume = df['Volume']
    volume_avg = indicators.sma(volume, 20)
    features['Volume_SMA'] = volume_avg
    features['Volume_Ratio'] = volume / volume_avg
    features['OBV'] = indicators.obv(close, volume)

    # 20-row rolling standard deviation of returns.
    features['Volatility'] = df['Returns'].rolling(window=20).std()

    # 10-row rate of change (relative) and momentum (absolute).
    features['ROC'] = close.pct_change(periods=10)
    features['Momentum'] = close - close.shift(10)

    print(f"Created {len(features.columns)} features")
    return features

features_df = create_features(data)
features_df.head()

## 5. Create Market Regime Labels

In [None]:
def create_regime_labels(df: pd.DataFrame, forward_window: int = 20,
                         threshold: float = 0.02) -> pd.DataFrame:
    """Label each row with a market regime derived from its forward return.

    Args:
        df: Frame with at least 'Close' and 'Returns' columns.
        forward_window: Number of rows to look ahead.
        threshold: Absolute forward return that separates the regimes; the
            default reproduces the original +/-2% cut-offs.

    Returns:
        A copy of *df* with three extra columns:

        * 'Forward_Return': close[t + forward_window] / close[t] - 1
        * 'Forward_Volatility': standard deviation of 'Returns' over the
          ``forward_window`` rows following t (informational only; it is not
          used for labelling)
        * 'Regime': REGIME_BEAR below ``-threshold``, REGIME_BULL above
          ``+threshold``, REGIME_NEUTRAL otherwise (int)

        Rows without a defined forward return (the last ``forward_window``
        rows) are removed.
    """
    result = df.copy()

    # Look-ahead targets: both columns use information from after row t.
    result['Forward_Return'] = result['Close'].pct_change(forward_window).shift(-forward_window)
    result['Forward_Volatility'] = result['Returns'].rolling(forward_window).std().shift(-forward_window)

    # Drop unlabeled rows BEFORE classifying. Comparisons against NaN are
    # False, so np.select would otherwise assign the default (neutral) label
    # to rows whose forward return is unknown instead of leaving them NaN.
    result = result.dropna(subset=['Forward_Return']).copy()

    conditions = [
        result['Forward_Return'] < -threshold,  # Bear
        result['Forward_Return'] > threshold,   # Bull
    ]
    choices = [REGIME_BEAR, REGIME_BULL]
    result['Regime'] = np.select(conditions, choices, default=REGIME_NEUTRAL).astype(int)

    return result

# Attach regime labels, then report how the classes are balanced.
features_df = create_regime_labels(features_df)

print("\nRegime Distribution:")
regime_counts = features_df['Regime'].value_counts().sort_index()
print(regime_counts)

print("\nRegime Names:")
for regime, name in REGIME_NAMES.items():
    in_regime = features_df['Regime'] == regime
    count = in_regime.sum()
    pct = count / len(features_df) * 100
    print(f"{name}: {count} ({pct:.1f}%)")

## 6. Visualize Features

In [None]:
# Plot price with moving averages
fig, ax = plt.subplots(figsize=(14, 7))

ax.plot(features_df.index, features_df['Close'], label='Close', linewidth=2)
# Draw exactly the SMA columns that create_features built: they are driven by
# the same config key, so a non-default period list no longer raises KeyError.
for period in config.get('features.sma_periods', [5, 20, 50]):
    ax.plot(features_df.index, features_df[f'SMA_{period}'], label=f'SMA {period}', alpha=0.7)

ax.set_title('Price with Moving Averages', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure results/ exists.
results_dir = PROJECT_ROOT / 'results'
results_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(results_dir / 'moving_averages.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Plot RSI beneath the price series.
# The 2:1 panel heights are passed through gridspec_kw because the direct
# `height_ratios` keyword of plt.subplots only exists in Matplotlib >= 3.6;
# gridspec_kw produces the same layout on older and newer versions alike.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8),
                               gridspec_kw={'height_ratios': [2, 1]})

# Top panel: closing price.
ax1.plot(features_df.index, features_df['Close'], label='Close')
ax1.set_title('Price', fontsize=14, fontweight='bold')
ax1.set_ylabel('Price')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Bottom panel: RSI with the conventional 70/30 overbought/oversold guides.
ax2.plot(features_df.index, features_df['RSI'], label='RSI', color='purple')
ax2.axhline(y=70, color='r', linestyle='--', alpha=0.5, label='Overbought')
ax2.axhline(y=30, color='g', linestyle='--', alpha=0.5, label='Oversold')
ax2.set_title('RSI', fontsize=14, fontweight='bold')
ax2.set_xlabel('Date')
ax2.set_ylabel('RSI')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'results' / 'rsi_indicator.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Overlay the closing price with its Bollinger envelope
fig, ax = plt.subplots(figsize=(14, 7))
dates = features_df.index

ax.plot(dates, features_df['Close'], label='Close', linewidth=2)
# The three bands share one dashed style; draw them upper -> middle -> lower.
band_series = (
    ('BB_Upper', 'BB Upper'),
    ('BB_Middle', 'BB Middle'),
    ('BB_Lower', 'BB Lower'),
)
for column, legend_label in band_series:
    ax.plot(dates, features_df[column], label=legend_label, linestyle='--', alpha=0.7)
# Light shading between the outer bands.
ax.fill_between(dates, features_df['BB_Upper'], features_df['BB_Lower'], alpha=0.1)

ax.set_title('Bollinger Bands', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
bollinger_png = PROJECT_ROOT / 'results' / 'bollinger_bands.png'
plt.savefig(bollinger_png, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Scatter the closing price over time, one colour per regime label
fig, ax = plt.subplots(figsize=(14, 7))

for regime, name in REGIME_NAMES.items():
    # Rows carrying this regime label.
    regime_rows = features_df[features_df['Regime'] == regime]
    ax.scatter(regime_rows.index, regime_rows['Close'], label=name, alpha=0.6, s=20)

ax.set_title('Market Regimes Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
regimes_png = PROJECT_ROOT / 'results' / 'market_regimes.png'
plt.savefig(regimes_png, dpi=300, bbox_inches='tight')
plt.show()

## 7. Feature Correlation Analysis

In [None]:
# Leave raw OHLCV inputs and the look-ahead target columns out of the study
excluded_cols = {'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close',
                 'Forward_Return', 'Forward_Volatility'}
feature_cols = [col for col in features_df.columns if col not in excluded_cols]

# Correlate only rows where every selected column is populated
correlation_data = features_df[feature_cols].dropna()
corr_matrix = correlation_data.corr()

# Heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(PROJECT_ROOT / 'results' / 'feature_correlation.png', dpi=300, bbox_inches='tight')
plt.show()

print("Features with high correlation (>0.9):")
# Restrict to the strict upper triangle so each pair is reported once and the
# diagonal (self-correlation) is skipped; argwhere walks it in row-major order.
strong_mask = np.abs(corr_matrix.to_numpy()) > 0.9
high_corr_pairs = [
    (corr_matrix.index[row], corr_matrix.columns[col], corr_matrix.iloc[row, col])
    for row, col in np.argwhere(np.triu(strong_mask, k=1))
]
for feat1, feat2, corr in high_corr_pairs:
    print(f"{feat1} <-> {feat2}: {corr:.3f}")

## 8. Prepare Final Dataset

In [None]:
# Keep only fully populated rows (rolling-window warm-up rows contain NaN)
features_clean = features_df.dropna(axis=0, how='any')

print(f"Original data: {len(features_df)} rows")
print(f"After removing NaN: {len(features_clean)} rows")
print(f"Features: {len(features_clean.columns)} columns")

# Descriptive statistics for every remaining column
summary_stats = features_clean.describe()
print("\nFeature Summary:")
print("-" * 50)
print(summary_stats)

## 9. Save Features

In [None]:
# Persist the cleaned feature table next to the processed price data
processed_dir = DATA_DIR / 'processed'
output_file = processed_dir / f"{ticker}_features.csv"
# NOTE(review): the saved frame still contains the look-ahead columns
# 'Forward_Return' and 'Forward_Volatility' - confirm the training notebook
# excludes them from the model inputs.
features_clean.to_csv(output_file)

print(f"Features saved to: {output_file}")
print(f"Shape: {features_clean.shape}")
print(f"Date range: {features_clean.index.min()} to {features_clean.index.max()}")

## 10. Create Feature Engineering Module

In [None]:
# Export the core indicators as an importable module (src/features/technical.py).
# The exported rsi() keeps the undefined first price change as NaN (clip preserves
# NaN) instead of counting it as a zero-sized move, so its first value is based on
# `period` real changes.
# NOTE(review): stochastic() and obv() from the notebook class are not part of the
# exported module - confirm whether downstream code needs them.
feature_module = '''"""Feature engineering module."""

import numpy as np
import pandas as pd

class TechnicalIndicators:
    """Calculate technical indicators."""
    
    @staticmethod
    def sma(data: pd.Series, period: int) -> pd.Series:
        return data.rolling(window=period).mean()
    
    @staticmethod
    def ema(data: pd.Series, period: int) -> pd.Series:
        return data.ewm(span=period, adjust=False).mean()
    
    @staticmethod
    def rsi(data: pd.Series, period: int = 14) -> pd.Series:
        delta = data.diff()
        gain = delta.clip(lower=0).rolling(window=period).mean()
        loss = delta.clip(upper=0).abs().rolling(window=period).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))
    
    @staticmethod
    def macd(data: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
        ema_fast = data.ewm(span=fast, adjust=False).mean()
        ema_slow = data.ewm(span=slow, adjust=False).mean()
        macd_line = ema_fast - ema_slow
        signal_line = macd_line.ewm(span=signal, adjust=False).mean()
        histogram = macd_line - signal_line
        return macd_line, signal_line, histogram
    
    @staticmethod
    def bollinger_bands(data: pd.Series, period: int = 20, std: float = 2):
        sma = data.rolling(window=period).mean()
        rolling_std = data.rolling(window=period).std()
        upper = sma + (rolling_std * std)
        lower = sma - (rolling_std * std)
        return upper, sma, lower
    
    @staticmethod
    def atr(high: pd.Series, low: pd.Series, close: pd.Series, period: int = 14):
        high_low = high - low
        high_close = np.abs(high - close.shift())
        low_close = np.abs(low - close.shift())
        ranges = pd.concat([high_low, high_close, low_close], axis=1)
        true_range = ranges.max(axis=1)
        return true_range.rolling(window=period).mean()
'''

module_path = PROJECT_ROOT / 'src' / 'features' / 'technical.py'
# open(..., 'w') does not create missing parent directories.
module_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the generated source does not depend on the platform default.
with open(module_path, 'w', encoding='utf-8') as f:
    f.write(feature_module)

print("Created: src/features/technical.py")

## Feature Engineering Complete

Next: Open `03_svm_training.ipynb`