In [1]:
# Cell 1: Imports & Configuration
import logging
import math
import os
import warnings
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Database
import pymysql
from sqlalchemy import create_engine

# Metrics and Preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Models
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from statsmodels.tsa.arima.model import ARIMA

# Silence all library warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")
print("Libraries imported.")

# Database Configuration
# Each setting can be overridden through an environment variable so that real
# credentials do not have to be written into the notebook. The fallbacks are
# the values this notebook has always used.
db_config = {
    'host': os.environ.get('TRADING_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('TRADING_DB_USER', 'root'),
    'password': os.environ.get('TRADING_DB_PASSWORD', ''),
    'database': os.environ.get('TRADING_DB_NAME', 'trading_system')
}
# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
db_url = (
    f"mysql+pymysql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}/{db_config['database']}"
)
engine = create_engine(db_url)

# Path Definition
base_dir = Path.cwd().parent
output_dir = base_dir / "4_Results" / "Ablation_Runs"
# parents=True: also create "4_Results" when it does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Results will be saved to: {output_dir}")

Libraries imported.
Results will be saved to: c:\Users\18kyu\Desktop\Unishit\IR\4_Results\Ablation_Runs


In [2]:
# Cell 2: Load and Validate 'model_features' table from database
print("Loading 'model_features' table from database.")

try:
    df = pd.read_sql("SELECT * FROM model_features", con=engine)

    # Validate schema and size BEFORE touching any column: the parsing,
    # sorting and reporting below index these columns directly, so checking
    # afterwards would surface a missing column as a bare KeyError.
    required_cols = ['ticker', 'date', 'adj_close', 'target_return']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"ERROR: Missing required columns: {missing_cols}")
    if df.empty:
        raise ValueError("ERROR: 'model_features' table returned no rows.")

    df['date'] = pd.to_datetime(df['date'])
    # Rows are ordered by ticker, then date.
    df = df.sort_values(by=['ticker', 'date'])

    print(f"Successfully loaded {len(df):,} rows.")
    print(f"Data date range: {df['date'].min().date()} to {df['date'].max().date()}")
    print(f"Number of unique tickers: {df['ticker'].nunique()}")

    # Display available columns to verify
    print("\n Available columns in model_features:")
    print(df.columns.tolist())

    # Check for NaN values in critical columns
    print("\n=== Data Quality Check ===")
    target_nan_count = df['target_return'].isna().sum()
    print(f"NaN in target_return: {target_nan_count} ({target_nan_count/len(df)*100:.2f}%)")
    print(f"NaN in adj_close: {df['adj_close'].isna().sum()}")

    # Display sample data
    print("\n=== Sample Data (First Ticker) ===")
    first_ticker = df['ticker'].iloc[0]
    # The sentiment columns are optional for this preview; show only those present.
    sample_cols = ['ticker', 'date', 'adj_close', 'target_return', 'textblob_polarity', 'finbert_compound']
    sample_cols = [col for col in sample_cols if col in df.columns]
    print(df[df['ticker'] == first_ticker].head(3)[sample_cols].to_string())

except Exception as e:
    # Report the failure, then re-raise so later cells never run on a
    # missing or partial frame.
    print(f"ERROR loading data: {e}")
    raise

Loading 'model_features' table from database.
Successfully loaded 1,855,010 rows.
Data date range: 2010-04-11 to 2020-05-31
Number of unique tickers: 4028

 Available columns in model_features:
['ticker', 'date', 'adj_close', 'volume', 'textblob_polarity', 'vader_compound', 'finbert_compound', 'news_count', 'has_news', 'days_since_news', 'sentiment_freshness', 'textblob_polarity_volatility', 'vader_compound_volatility', 'finbert_compound_volatility', 'prev_close', 'return', 'log_return', 'ma_3', 'ma_5', 'ma_10', 'volatility', 'rsi', 'momentum_3', 'momentum_5', 'finbert_momentum_3', 'vader_momentum_3', 'textblob_momentum_3', 'target_return']

=== Data Quality Check ===
NaN in target_return: 0 (0.00%)
NaN in adj_close: 0

=== Sample Data (First Ticker) ===
  ticker       date  adj_close  target_return  textblob_polarity  finbert_compound
0      A 2010-04-11    21.7974       0.030384           0.053013          0.038326
1      A 2010-04-18    22.4597       0.038843           0.053013     

In [3]:
# Cell 3: Define Feature Sets
print("\n Defining feature sets for model training.")

# 1. Price_Only - Technical features
price_features = [
    'prev_close',
    'ma_3',
    'ma_5',
    'ma_10',
    'volatility',
    'rsi',
    'momentum_3',
    'momentum_5',
    'volume'
]

# Basic sentiment scores, defined once and shared by the Base and Full sets
# so the two lists cannot drift apart.
sentiment_score_features = [
    'textblob_polarity',    # TextBlob sentiment
    'vader_compound',       # VADER sentiment
    'finbert_compound'      # FinBERT sentiment
]

# 2. Price_Base_Sentiment - Price + basic sentiment scores
base_sentiment_features = price_features + sentiment_score_features

# Features that only the Full set adds on top of Base.
extended_sentiment_features = [
    # News metadata
    'news_count',           # Number of news articles
    'has_news',             # Binary: any news this period
    'days_since_news',      # Days since last news
    'sentiment_freshness',  # Sentiment freshness score

    # Sentiment volatility
    'textblob_polarity_volatility',
    'vader_compound_volatility',
    'finbert_compound_volatility',

    # Sentiment momentum
    'finbert_momentum_3',
    'vader_momentum_3',
    'textblob_momentum_3'
]

# 3. Price_Full_Sentiment - Price + all sentiment features
full_sentiment_features = base_sentiment_features + extended_sentiment_features

# Store feature sets
feature_sets = {
    "Price_Only": price_features,
    "Price_Base_Sentiment": base_sentiment_features,
    "Price_Full_Sentiment": full_sentiment_features
}

# Summary of feature sets
print(f"Total feature sets: {len(feature_sets)}")
print(f"- Price_Only: {len(price_features)} features")
print(f"- Price_Base_Sentiment: {len(base_sentiment_features)} features")
print(f"- Price_Full_Sentiment: {len(full_sentiment_features)} features")

# Check if all features exist
print("\n Validating features against available columns.")
all_features = set(full_sentiment_features)
available_cols = set(df.columns)
missing_features = all_features - available_cols

if missing_features:
    print("The following features are not in the data:")
    for feat in sorted(missing_features):
        print(f"  - {feat}")
    print("\n Need to update your feature lists.")
else:
    print("All features are available in the dataset.")


def _availability_line(feat):
    """Return the breakdown line for one feature, reflecting whether it is really present."""
    status = "is available." if feat in available_cols else "is MISSING."
    return f"{feat} {status}"


# Show feature breakdown
print("\n Feature Breakdown")
print("Price_Only features:")
for feat in price_features:
    print(_availability_line(feat))

print("\nSentiment features added in Base:")
for feat in sentiment_score_features:
    print(_availability_line(feat))

print("\nAdditional features in Full:")
for feat in extended_sentiment_features:
    print(_availability_line(feat))


 Defining feature sets for model training.
Total feature sets: 3
- Price_Only: 9 features
- Price_Base_Sentiment: 12 features
- Price_Full_Sentiment: 22 features

 Validating features against available columns.
All features are available in the dataset.

 Feature Breakdown
Price_Only features:
prev_close is available.
ma_3 is available.
ma_5 is available.
ma_10 is available.
volatility is available.
rsi is available.
momentum_3 is available.
momentum_5 is available.
volume is available.

Sentiment features added in Base:
textblob_polarity is available.
vader_compound is available.
finbert_compound is available.

Additional features in Full:
news_count is available.
has_news is available.
days_since_news is available.
sentiment_freshness is available.
textblob_polarity_volatility is available.
vader_compound_volatility is available.
finbert_compound_volatility is available.
finbert_momentum_3 is available.
vader_momentum_3 is available.
textblob_momentum_3 is available.


In [None]:
# Cell 4: Define Helper Functions
def calculate_metrics(y_true, y_pred, model_name, ticker, feature_set_name):
    """Score one model's predictions for one ticker.

    Pairs in which either value is NaN or infinite are dropped before scoring.

    Args:
        y_true: Array-like of observed target values (list, ndarray, Series).
        y_pred: Array-like of predicted values with the same number of elements.
        model_name: Model label; combined with ``feature_set_name`` into the
            ``model`` field of the result.
        ticker: Ticker identifier stored unchanged in the result.
        feature_set_name: Feature-set label stored in the result.

    Returns:
        A dict with keys ``ticker``, ``model``, ``feature_set``, ``rmse``,
        ``mae``, ``r2`` and ``directional_accuracy`` (a percentage), or
        ``None`` when no finite pair remains.

    Raises:
        ValueError: If the two inputs do not contain the same number of elements.
    """
    # Coerce to flat float arrays so lists and column vectors behave like the
    # 1-D ndarrays the training functions pass in (a (n, 1) prediction would
    # otherwise broadcast against a (n,) target).
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {y_true.size} vs {y_pred.size}"
        )

    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    y_true_clean = y_true[mask]
    y_pred_clean = y_pred[mask]

    if len(y_true_clean) == 0:
        return None

    rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
    mae = mean_absolute_error(y_true_clean, y_pred_clean)
    r2 = r2_score(y_true_clean, y_pred_clean)

    # Directional accuracy: share of pairs whose signs match. np.sign returns
    # 0 for zero, so a zero prediction only matches a zero target.
    correct_direction = np.sign(y_true_clean) == np.sign(y_pred_clean)
    directional_accuracy = np.mean(correct_direction) * 100

    return {
        'ticker': ticker,
        'model': f"{model_name}_{feature_set_name}",
        'feature_set': feature_set_name,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'directional_accuracy': directional_accuracy
    }

print("\n Helper functions defined.")


 Helper functions defined.


In [8]:
# Cell 5: Model Training Functions
def _fit_and_score(models, train_df, test_df, feature_cols, feature_set_name, ticker):
    """Standardise the features, then fit and score each estimator in ``models``.

    Args:
        models: Mapping of model label -> unfitted estimator exposing
            ``fit(X, y)`` and ``predict(X)``.
        train_df: Training rows; must contain ``feature_cols`` and ``target_return``.
        test_df: Test rows with the same columns.
        feature_cols: Names of the feature columns to use.
        feature_set_name: Feature-set label recorded with each result.
        ticker: Ticker identifier recorded with each result.

    Returns:
        A list of metric dicts from ``calculate_metrics``, one per model that
        fitted and scored successfully. A model that raises is logged and
        skipped so one failure does not abort the whole run.
    """
    # The scaler is fitted on the training split only and then applied to the
    # test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_df[feature_cols])
    X_test = scaler.transform(test_df[feature_cols])
    y_train = train_df['target_return'].values
    y_test = test_df['target_return'].values

    results = []
    for name, model in models.items():
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            metrics = calculate_metrics(y_test, y_pred, name, ticker, feature_set_name)
        except Exception as exc:
            # Best-effort per model, but leave a trace instead of dropping the
            # failure silently.
            logging.getLogger(__name__).warning(
                "%s failed for ticker %s (%s): %s", name, ticker, feature_set_name, exc
            )
            continue
        if metrics:
            results.append(metrics)
    return results


# 1. Linear Models
def run_linear_models(train_df, test_df, feature_cols, feature_set_name, ticker):
    """Fit and score LinearRegression, Ridge and Lasso; returns a list of metric dicts."""
    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(random_state=42),
        "Lasso": Lasso(random_state=42)
    }
    return _fit_and_score(models, train_df, test_df, feature_cols, feature_set_name, ticker)


# 2. Tree-Based Models
def run_tree_models(train_df, test_df, feature_cols, feature_set_name, ticker):
    """Fit and score a random forest and two XGBoost configurations; returns a list of metric dicts."""
    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
        "XGBoost_Light": xgb.XGBRegressor(n_estimators=50, max_depth=3, learning_rate=0.1, objective='reg:squarederror', random_state=42),
        "XGBoost_Medium": xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, objective='reg:squarederror', random_state=42),
    }
    # The features are standardised here as well, exactly as for the other
    # model families.
    return _fit_and_score(models, train_df, test_df, feature_cols, feature_set_name, ticker)


# 3. SVM Models
def run_svm_models(train_df, test_df, feature_cols, feature_set_name, ticker):
    """Fit and score an RBF-kernel SVR; returns a list with at most one metric dict."""
    models = {"SVR": SVR(kernel='rbf', C=1.0, gamma='scale')}
    return _fit_and_score(models, train_df, test_df, feature_cols, feature_set_name, ticker)


# 4. MLP Models
def run_mlp_models(train_df, test_df, feature_cols, feature_set_name, ticker):
    """Fit and score two MLP architectures (small and medium); returns a list of metric dicts."""
    configs = [
        {'hidden_layer_sizes': (50,), 'name': 'MLP_Small'},
        {'hidden_layer_sizes': (64, 32), 'name': 'MLP_Medium'},
    ]
    models = {
        config['name']: MLPRegressor(
            hidden_layer_sizes=config['hidden_layer_sizes'],
            max_iter=500, random_state=42, early_stopping=True
        )
        for config in configs
    }
    return _fit_and_score(models, train_df, test_df, feature_cols, feature_set_name, ticker)

# 5. ARIMA Models
def run_arima_models(train_df, test_df, ticker, order=(5, 1, 0), exog_cols=None):
    """Fit ARIMA and ARIMAX on ``target_return`` and score their forecasts.

    Two models are fitted on the training target series:
      1. ARIMA without exogenous regressors, recorded under "Price_Only".
      2. ARIMAX with ``exog_cols`` as regressors, recorded under
         "Price_Full_Sentiment".
    Each model forecasts ``len(test_df)`` steps ahead in a single call and is
    scored with ``calculate_metrics``.

    Args:
        train_df: Training rows; must contain ``target_return`` and the
            exogenous columns.
        test_df: Test rows with the same columns.
        ticker: Ticker identifier recorded with each result.
        order: ARIMA ``(p, d, q)`` order; also embedded in the model label,
            e.g. "ARIMA(5,1,0)".
        exog_cols: Columns used as exogenous regressors for ARIMAX. When
            ``None``, every entry of the module-level
            ``full_sentiment_features`` that is not in ``price_features``.

    Returns:
        A list with up to two metric dicts. A model that raises is logged and
        skipped.
    """
    # NOTE(review): the target is already a return series and the default
    # order has d=1, which differences it once more - confirm this is intended.
    y_train = train_df['target_return']
    y_test = test_df['target_return']
    order_label = "({},{},{})".format(*order)

    def _fit_forecast_score(label, feature_set_name, with_exog):
        """Fit one ARIMA variant, forecast the test horizon and return its metrics (or None)."""
        try:
            exog_train = exog_test = None
            if with_exog:
                columns = exog_cols
                if columns is None:
                    columns = [c for c in full_sentiment_features if c not in price_features]
                exog_train = train_df[columns]
                exog_test = test_df[columns]
            model_fit = ARIMA(y_train, order=order, exog=exog_train).fit()
            forecast = model_fit.forecast(steps=len(y_test), exog=exog_test)
            return calculate_metrics(
                y_test.values, np.asarray(forecast, dtype=float), label, ticker, feature_set_name
            )
        except Exception as exc:
            # Best-effort per model, but leave a trace instead of dropping the
            # failure silently.
            logging.getLogger(__name__).warning(
                "%s failed for ticker %s (%s): %s", label, ticker, feature_set_name, exc
            )
            return None

    candidates = [
        # 1. ARIMA (Price Only)
        (f"ARIMA{order_label}", "Price_Only", False),
        # 2. ARIMAX (Full Sentiment Features)
        (f"ARIMAX{order_label}", "Price_Full_Sentiment", True),
    ]
    results = []
    for label, feature_set_name, with_exog in candidates:
        metrics = _fit_forecast_score(label, feature_set_name, with_exog)
        if metrics:
            results.append(metrics)
    return results

print("All model-training functions defined.")

All model-training functions defined.


In [9]:
# Cell 6: Main Training Loop
print("Starting Model Training Loop.")

# Split and filter settings for the per-ticker chronological split.
TRAIN_FRACTION = 0.8    # leading share of each ticker's rows used for training
MIN_TRAIN_ROWS = 50     # tickers with fewer training rows are skipped
MIN_TEST_ROWS = 5       # tickers with fewer test rows are skipped
# Tree and MLP models are not run for these feature sets.
# NOTE(review): presumably skipped to limit runtime - confirm.
FEATURE_SETS_WITHOUT_TREE_MLP = {"Price_Base_Sentiment"}

all_results = []
# Get the unique, sorted list of all tickers
ticker_list = sorted(df['ticker'].unique())
print(f"Will train models for {len(ticker_list)} tickers.")

# Group once instead of filtering the whole frame for every ticker, which
# rescans all rows on each iteration.
ticker_groups = df.groupby('ticker', sort=True)

for ticker, ticker_df in tqdm(ticker_groups, total=ticker_groups.ngroups, desc="Processing All Tickers"):
    # Make the chronological order explicit so the split does not depend on how
    # `df` was ordered by an earlier cell (the stable sort keeps ties in place).
    ticker_df = ticker_df.sort_values('date', kind='stable')

    # 1. Split data for this ticker: earliest rows train, latest rows test
    split_idx = int(len(ticker_df) * TRAIN_FRACTION)
    train_df = ticker_df.iloc[:split_idx]
    test_df = ticker_df.iloc[split_idx:]

    # Skip tickers with too little data
    if len(train_df) < MIN_TRAIN_ROWS or len(test_df) < MIN_TEST_ROWS:
        continue

    # 2. Iterate through our 3 feature sets
    for set_name, feature_cols in feature_sets.items():

        # 3. Run all model functions
        all_results.extend(run_linear_models(train_df, test_df, feature_cols, set_name, ticker))
        all_results.extend(run_svm_models(train_df, test_df, feature_cols, set_name, ticker))

        if set_name not in FEATURE_SETS_WITHOUT_TREE_MLP:
            all_results.extend(run_tree_models(train_df, test_df, feature_cols, set_name, ticker))
            all_results.extend(run_mlp_models(train_df, test_df, feature_cols, set_name, ticker))

    # 4. Run ARIMA models
    all_results.extend(run_arima_models(train_df, test_df, ticker))

print(f"\n Model Training Complete. Total results generated: {len(all_results)}.")

Starting Model Training Loop.
Will train models for 4028 tickers.


Processing All Tickers:   0%|          | 0/4028 [00:00<?, ?it/s]


 Model Training Complete. Total results generated: 95064.


In [11]:
# Cell 7: Save Results to CSV
print("Saving all model results.")

if not all_results:
    print("No results were generated. Skipping save.")
else:
    # 1. Build the results table and stamp every row with the time of this save
    results_df = pd.DataFrame(all_results)
    results_df['timestamp'] = datetime.now().strftime('%Y%m%d_%H%M%S')

    # 2. Write the table to the ablation output folder as CSV
    csv_path = output_dir / "03A_IR_Model_Performance.csv"
    results_df.to_csv(csv_path, index=False)
    print(f"Successfully saved {len(results_df)} results to {csv_path}")

    # 3. Mirror the table into the database; any existing table is replaced.
    #    A database failure is reported but does not undo the CSV written above.
    print("Saving results to 'results_ir_models' table in database.")
    try:
        results_df.to_sql('results_ir_models', con=engine, if_exists='replace',
                          index=False, chunksize=1000)
    except Exception as e:
        print(f"Error writing results to database: {e}")
    else:
        print("Successfully wrote results to database!")

Saving all model results.
Successfully saved 95064 results to c:\Users\18kyu\Desktop\Unishit\IR\4_Results\Ablation_Runs\03A_IR_Model_Performance.csv
Saving results to 'results_ir_models' table in database.
Successfully wrote results to database!


In [12]:
# Cell 8: Clean Results for R Analysis
print("\n Cleaning Results for R analysis.")

# Re-read the per-ticker results from the CSV written by the save cell
csv_path = output_dir / "03A_IR_Model_Performance.csv"
df_raw_results = pd.read_csv(csv_path)

# 1. One row per model: each metric averaged over tickers, plus the number of
#    result rows behind that average.
summary_columns = {
    'rmse_mean': ('rmse', 'mean'),
    'mae_mean': ('mae', 'mean'),
    'r2_mean': ('r2', 'mean'),
    'directional_accuracy_mean': ('directional_accuracy', 'mean'),
    'model_count': ('ticker', 'count'),
}
per_model = df_raw_results.groupby('model')
model_summary = per_model.agg(**summary_columns).reset_index()

# 2. Port the logic from clean_ablation_data
def extract_model_info(model_name):
    """Split a "<model>_<feature set>" label into its descriptive parts.

    The feature-set suffix is located first and removed, and only the
    remaining model part is split on "_". Parsing the whole label by "_"
    mis-read names such as "ARIMA(5,1,0)_Price_Only", where the order is not a
    separate "_" segment: "Price" became the config and the feature set was
    reported as "Other".

    Args:
        model_name: Label such as "MLP_Small_Price_Only",
            "Ridge_Price_Base_Sentiment" or "ARIMAX(5,1,0)_Price_Full_Sentiment".

    Returns:
        Tuple ``(model_family, config, sentiment_type)``:
          * ``model_family`` - "ARIMA+ARIMAX" for any ARIMA variant, otherwise
            the first "_" segment ("LinearRegression" becomes "Linear Regression").
          * ``config`` - the ARIMA order (e.g. "(5,1,0)") or the MLP/XGBoost
            size segment; "Standard" when the name carries none.
          * ``sentiment_type`` - "Full Sentiment", "Base Sentiment",
            "Price Only", or "Other" when no known feature set is present.
    """
    # Checked in this order; the first feature set found in the name wins.
    feature_set_labels = (
        ('Price_Full_Sentiment', 'Full Sentiment'),
        ('Price_Base_Sentiment', 'Base Sentiment'),
        ('Price_Only', 'Price Only'),
    )

    sentiment_type = 'Other'
    model_part = model_name
    for feature_set, label in feature_set_labels:
        cut = model_name.find('_' + feature_set)
        if cut != -1:
            model_part = model_name[:cut]
            sentiment_type = label
            break

    parts = model_part.split('_')
    model_type = parts[0]

    if 'ARIMA' in model_type:
        model_family = 'ARIMA+ARIMAX'
        paren = model_type.find('(')
        if paren != -1:
            # Order embedded in the type, e.g. "ARIMA(5,1,0)" -> "(5,1,0)".
            config = model_type[paren:]
        elif len(parts) > 1:
            # Order given as its own segment, e.g. "ARIMA_(5,1,0)".
            config = parts[1]
        else:
            config = 'Standard'
    elif 'MLP' in model_type or 'XGBoost' in model_type:
        model_family = model_type
        # Guard against a bare "MLP" / "XGBoost" label with no size segment.
        config = parts[1] if len(parts) > 1 else 'Standard'
    else:
        model_family = model_type.replace('LinearRegression', 'Linear Regression')
        config = 'Standard'

    return model_family, config, sentiment_type

# Parse every model label once, then spread the resulting
# (family, config, sentiment) tuples over three columns.
model_info = model_summary['model'].apply(extract_model_info)
for position, column in enumerate(('model_family', 'model_config', 'sentiment_type')):
    model_summary[column] = [info[position] for info in model_info]


def _variation_label(row):
    """Return the family name, suffixed with the config unless the config is 'Standard'."""
    if row['model_config'] == 'Standard':
        return row['model_family']
    return f"{row['model_family']}_{row['model_config']}"


model_summary['model_variation'] = model_summary.apply(_variation_label, axis=1)

# 3. Calculate Improvements (relative to Price_Only)
# Baseline table: mean RMSE / R² of the Price Only rows, keyed by model variation.
is_price_only = model_summary['sentiment_type'] == 'Price Only'
base_models = model_summary[is_price_only].copy()
base_models = base_models.set_index('model_variation')[['rmse_mean', 'r2_mean']]

def calculate_improvements(row):
    """Return (RMSE improvement %, R² improvement %) of a summary row versus its Price Only baseline.

    The baseline is looked up in the module-level ``base_models`` table by the
    row's ``model_variation``. Rows that are themselves Price Only, or whose
    variation has no baseline entry, yield ``(0, 0)``.
    """
    variation = row['model_variation']
    is_baseline_row = row['sentiment_type'] == 'Price Only'
    if is_baseline_row or variation not in base_models.index:
        return 0, 0

    baseline_rmse = base_models.loc[variation, 'rmse_mean']
    baseline_r2 = base_models.loc[variation, 'r2_mean']

    # RMSE: positive when the error went down. R²: positive when the score
    # went up; abs() keeps the sign meaningful for a negative baseline R².
    rmse_gain = ((baseline_rmse - row['rmse_mean']) / baseline_rmse) * 100
    r2_gain = ((row['r2_mean'] - baseline_r2) / abs(baseline_r2)) * 100
    return rmse_gain, r2_gain

# Row-wise (rmse %, r2 %) tuples, unpacked into two percentage columns.
improvements = model_summary.apply(calculate_improvements, axis=1)
for position, column in enumerate(('rmse_improvement_pct', 'r2_improvement_pct')):
    model_summary[column] = [imp[position] for imp in improvements]

# 1/0 flags: did the metric improve over the Price Only baseline?
for flag_column, pct_column in (('rmse_improved', 'rmse_improvement_pct'),
                                ('r2_improved', 'r2_improvement_pct')):
    model_summary[flag_column] = model_summary[pct_column].gt(0).astype(int)

# Write the cleaned summary next to the other result files.
summary_csv_path = base_dir / "4_Results" / "cleaned_ablation_results.csv"
model_summary.to_csv(summary_csv_path, index=False)

print(f"Successfully cleaned and saved summary to {summary_csv_path}")
print("\n Phase 3A Complete.")


 Cleaning Results for R analysis.
Successfully cleaned and saved summary to c:\Users\18kyu\Desktop\Unishit\IR\4_Results\cleaned_ablation_results.csv

 Phase 3A Complete.


In [13]:
print("--- Model Performance Validation ---")

# Load the summary file you just created in Cell 8
summary_path = base_dir / "4_Results" / "cleaned_ablation_results.csv"

# One entry per printed table:
# (heading, column to sort by, ascending?, rows to show, columns to show)
report_sections = [
    # Best R-squared first (higher is better)
    ("\n=== Top 10 Models by R-squared (R²) ===",
     'r2_mean', False, 10, ['model', 'r2_mean', 'directional_accuracy_mean']),
    # Largest RMSE improvement first (higher is better)
    ("\n=== Top 10 Models by RMSE Improvement (vs. Price_Only) ===",
     'rmse_improvement_pct', False, 10, ['model', 'rmse_improvement_pct', 'r2_improvement_pct']),
    # Worst R-squared first
    ("\n=== Bottom 5 Models by R-squared (R²) ===",
     'r2_mean', True, 5, ['model', 'r2_mean']),
]

try:
    summary_df = pd.read_csv(summary_path)

    print("Successfully loaded cleaned summary file.")
    print(f"Total model variations tested: {len(summary_df)}")

    for heading, sort_column, ascending, row_limit, shown_columns in report_sections:
        print(heading)
        ranked = summary_df.sort_values(by=sort_column, ascending=ascending)
        print(ranked.head(row_limit)[shown_columns].to_string())

except FileNotFoundError:
    print(f"ERROR: Could not find the summary file at {summary_path}")
except Exception as e:
    print(f"An error occurred: {e}")

--- Model Performance Validation ---
Successfully loaded cleaned summary file.
Total model variations tested: 24

=== Top 10 Models by R-squared (R²) ===
                                  model     r2_mean  directional_accuracy_mean
0               ARIMA(5,1,0)_Price_Only   -0.149811                  50.782327
19                       SVR_Price_Only   -0.805193                  47.981469
18             SVR_Price_Full_Sentiment   -0.860055                  47.973120
17             SVR_Price_Base_Sentiment   -1.928091                  47.917084
20   XGBoost_Light_Price_Full_Sentiment   -3.010012                  49.120065
22  XGBoost_Medium_Price_Full_Sentiment   -5.533572                  48.913564
12    RandomForest_Price_Full_Sentiment  -29.887717                  48.692338
13              RandomForest_Price_Only -102.311530                  48.607447
4                      Lasso_Price_Only -108.777695                  52.770582
16                     Ridge_Price_Only -198.111618     