LightGBM


In [35]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import joblib
import lightgbm as lgb


# Directory holding the merged result CSVs (one file per horizon plus the train file).
DATA_DIR = 'C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Data'

# Single date kept from every horizon file; also used in the log messages below
# so the printed date can never drift from the date actually filtered on.
TEST_DATE = '2020-05-01'

# Dictionary mapping each forecast horizon to its filtered test frame
test_data = {}
horizons = [1, 5, 21]

# Option-only column names. Defined once (it does not depend on the horizon).
# NOTE: this list is not applied below; the full frame is stored for each horizon.
option_columns = [
    'cp_flag',
    'stock_price',
    'moneyness',
    'time_to_expiry',
    'strike_price',
    'rf',
    'delta',
    'gamma',
    'vega',
    'theta',
    'iv_ahbs',
    'iv_ahbs_error',
    'iv_bs',
    'iv_bs_error',
    'iv_cw',
    'iv_cw_error',
    'impl_volatility'
]

# Load each horizon's file and keep only the rows dated TEST_DATE
for horizon in horizons:
    file_path = f'{DATA_DIR}/merged_results_h{horizon}.csv'
    df = pd.read_csv(file_path)

    # 'date' is compared as a string, exactly as it is read from the CSV
    df = df[df['date'] == TEST_DATE]

    # Store in dictionary
    test_data[horizon] = df

    # Confirm the result for the horizon that was just loaded
    print(f"\nFiltered Option-only data for horizon {horizon} on {TEST_DATE}:")
    print(f"Shape: {test_data[horizon].shape}")
    print(test_data[horizon].head())

# Load the training data (not filtered by date); the two metadata columns are
# dropped when present (errors='ignore' tolerates their absence).
train_data = pd.read_csv(f'{DATA_DIR}/merged_results_train.csv')
train_data = train_data.drop(columns=['date', 'moneyness_category'], errors='ignore')
print(f"\nTrain data shape: {train_data.shape}")



Filtered Option-only data for horizon 1 on 2020-10-10:
Shape: (1281, 176)
                                   ID     iv_cw        date  cp_flag  \
990379  2020-05-01_AAPL 200529C292500  0.450584  2020-05-01        1   
990380  2020-05-01_AAPL 200529C295000  0.447356  2020-05-01        1   
990381  2020-05-01_AAPL 200529C297500  0.444564  2020-05-01        1   
990382  2020-05-01_AAPL 200529C300000  0.442207  2020-05-01        1   
990383  2020-05-01_AAPL 200529C305000  0.438777  2020-05-01        1   

        option_price  stock_price  moneyness  time_to_expiry  strike_price  \
990379         8.425      292.425   0.999744              27         292.5   
990380         7.225      292.425   0.991271              27         295.0   
990381         6.200      292.425   0.982941              27         297.5   
990382         5.275      292.425   0.974750              27         300.0   
990383         3.625      292.425   0.958770              27         305.0   

        volume  ...   i

In [36]:
###########################################
# PART 1: LIGHTGBM MODEL DEFINITION
###########################################

def create_lgb_model(model_type):
    """
    Build the LightGBM parameter dictionary for a named configuration.

    Despite the name, no model object is created here: the function only
    returns the parameter dict.

    Parameters:
    model_type (str): Either 'LGB1' (standard configuration) or 'LGB2'
        (more leaves, lower learning rate, plus explicit depth and
        leaf-size settings).

    Returns:
    dict: A freshly built LightGBM parameter dictionary.

    Raises:
    ValueError: If model_type is neither 'LGB1' nor 'LGB2'.
    """
    # Entries common to both configurations; they come first in each result.
    shared = {
        'objective': 'regression',
        'metric': 'mse',
        'boosting_type': 'gbdt',
    }

    if model_type == 'LGB1':
        # Standard LightGBM configuration
        return {
            **shared,
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
        }

    if model_type == 'LGB2':
        # More complex configuration with different hyperparameters
        return {
            **shared,
            'num_leaves': 63,
            'learning_rate': 0.01,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.7,
            'bagging_freq': 4,
            'min_data_in_leaf': 20,
            'max_depth': 10,
            'verbose': -1,
        }

    raise ValueError("Invalid model type. Choose from 'LGB1' or 'LGB2'.")

def train_and_evaluate_model(params, X_train, y_train, X_test, y_test, num_boost_round=100):
    """
    Fit a LightGBM booster and report its mean squared error on the test split.

    Note that the test split is also passed as the validation set, so the
    early-stopping callback monitors the same data the returned MSE is
    computed on.

    Parameters:
    params: LightGBM parameters dictionary
    X_train: Training features
    y_train: Training target values
    X_test: Test features (also used as the early-stopping validation set)
    y_test: Test target values
    num_boost_round: Upper bound on the number of boosting iterations

    Returns:
    tuple: (booster, test_mse) - the trained LightGBM model and its mean
        squared error on (X_test, y_test)
    """
    # Wrap both splits as LightGBM datasets; the holdout references the fit set.
    fit_set = lgb.Dataset(X_train, label=y_train)
    holdout_set = lgb.Dataset(X_test, label=y_test, reference=fit_set)

    # Early-stopping callback with a patience of 20 rounds.
    stop_early = lgb.early_stopping(stopping_rounds=20, verbose=True)

    booster = lgb.train(
        params,
        fit_set,
        num_boost_round=num_boost_round,
        valid_sets=[holdout_set],
        callbacks=[stop_early]
    )

    # Score the fitted booster on the test split.
    test_mse = mean_squared_error(y_test, booster.predict(X_test))

    return booster, test_mse

In [37]:
###########################################
# PART 2: DATA PREPARATION
###########################################

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_data(option_only: pd.DataFrame, save_paths: bool = True):
    """
    Build scaled train/test feature frames and per-target label series.

    Every column except the excluded IV/target columns is treated as a
    feature. A single 80/20 index split (random_state=42) is shared by the
    features and by all three targets (iv_bs_error, iv_ahbs_error,
    iv_cw_error). The StandardScaler is fitted on the training rows only.

    Parameters:
    option_only (pd.DataFrame): Input DataFrame containing option data.
    save_paths (bool): When True, write the fitted scaler to 'scaler.pkl' and
        the feature list to 'feature_columns.pkl' in the working directory.

    Returns:
    tuple: (
        X_train_scaled (pd.DataFrame),
        X_test_scaled (pd.DataFrame),
        y_train_dict (dict keyed by 'bs', 'ahbs', 'cw'),
        y_test_dict (dict keyed by 'bs', 'ahbs', 'cw'),
        scaler (StandardScaler),
        feature_columns (list)
    )
    """
    # Targets and related IV columns that must not be used as inputs.
    non_feature_cols = (
        'iv_bs_error', 'iv_ahbs', 'iv_ahbs_error',
        'iv_bs', 'iv_cw', 'iv_cw_error',
        'impl_volatility',
    )
    feature_columns = [name for name in option_only.columns if name not in non_feature_cols]
    features = option_only[feature_columns]

    # Short key -> target column holding that model's IV error.
    target_columns = {'bs': 'iv_bs_error', 'ahbs': 'iv_ahbs_error', 'cw': 'iv_cw_error'}
    targets = {key: option_only[column] for key, column in target_columns.items()}

    # One split of the row index, reused for features and every target.
    train_idx, test_idx = train_test_split(
        option_only.index, test_size=0.2, random_state=42
    )
    y_train_dict = {key: series.loc[train_idx] for key, series in targets.items()}
    y_test_dict = {key: series.loc[test_idx] for key, series in targets.items()}

    # Fit on the training rows only, then apply the same transform to both splits.
    scaler = StandardScaler().fit(features.loc[train_idx])

    def _scaled_frame(row_index):
        # Re-wrap the scaled array so column names and row labels are kept.
        return pd.DataFrame(
            scaler.transform(features.loc[row_index]),
            columns=feature_columns,
            index=row_index
        )

    X_train_scaled = _scaled_frame(train_idx)
    X_test_scaled = _scaled_frame(test_idx)

    # Optionally persist the artefacts needed to score new data later.
    if save_paths:
        joblib.dump(scaler, 'scaler.pkl')
        joblib.dump(feature_columns, 'feature_columns.pkl')

    return X_train_scaled, X_test_scaled, y_train_dict, y_test_dict, scaler, feature_columns


In [38]:
###########################################
# PART 3: PREDICTION FUNCTION
###########################################

def predict_and_add_to_test_data(models, test_data, scaler, feature_columns, error_type):
    """
    Generate error predictions for a test set and append them, together with
    the corrected implied volatilities, to a copy of that test set.

    For every usable model two columns are added:
      * iv_{error_type}_pred_{model_name}      - the predicted error
      * iv_{error_type}_corrected_{model_name} - iv_{error_type} + predicted error

    Parameters:
    models (dict): Maps model name -> trained model exposing .predict().
        Entries whose value is None (as stored by load_lgb_models when a
        model could not be loaded) are skipped with a message instead of
        raising AttributeError.
    test_data (pandas.DataFrame): Test dataset to make predictions on; it must
        contain every column in feature_columns and the iv_{error_type} column.
    scaler (StandardScaler): Fitted scaler used to transform the test features
    feature_columns (list): Column names to use as features, in the order the
        scaler and the models expect
    error_type (str): Type of error being predicted ('bs', 'ahbs', or 'cw')

    Returns:
    pandas.DataFrame: Copy of test_data with the added prediction columns;
        the input frame itself is not modified.

    Raises:
    KeyError: If a feature column or the iv_{error_type} column is missing.
    """
    # Work on a copy so the caller's frame is left untouched
    result_df = test_data.copy()

    # Scale the selected features once; every model reuses the same matrix
    X_test_scaled = scaler.transform(test_data[feature_columns])

    # Column holding the uncorrected model IV that the predicted error adjusts
    original_column = f'iv_{error_type}'

    for model_name, model in models.items():
        if model is None:
            # A failed load is recorded as None upstream; skip it rather than crash
            print(f"Skipping {error_type} {model_name}: model not loaded")
            continue

        predictions = model.predict(X_test_scaled)

        # Predicted error
        result_df[f'iv_{error_type}_pred_{model_name}'] = predictions

        # Corrected value = original value + predicted error
        result_df[f'iv_{error_type}_corrected_{model_name}'] = result_df[original_column] + predictions

    return result_df

In [39]:
###########################################
# PART 4: FEATURE IMPORTANCE ANALYSIS
###########################################

def analyze_feature_importance(model, feature_columns):
    """
    Tabulate a LightGBM model's split-based feature importance.

    Parameters:
    model: Trained LightGBM model exposing feature_importance()
    feature_columns (list): Feature names, in the same order as the
        importance values returned by the model

    Returns:
    pandas.DataFrame: Columns 'Feature' and 'Importance', sorted from most
        to least important; the original positional index is kept.
    """
    # Importance values requested with importance_type='split'.
    split_scores = model.feature_importance(importance_type='split')

    # Pair each feature name with its score, then rank highest first.
    ranking = pd.DataFrame({
        'Feature': feature_columns,
        'Importance': split_scores
    }).sort_values('Importance', ascending=False)

    return ranking


In [40]:
import os

def load_lgb_models(model_paths_dict):
    """
    Load saved LightGBM boosters from a nested {error_type: {model_name: path}} mapping.

    Loading is best-effort: a missing file or any exception raised while
    loading is reported with print() and recorded as None, so one bad model
    does not stop the others from loading.

    Parameters:
    model_paths_dict (dict): Maps error type -> {model name -> model file path}

    Returns:
    dict: Same nesting as the input, with each path replaced by the loaded
        lgb.Booster, or by None when the model could not be loaded.
    """
    registry = {}

    for error_type in model_paths_dict:
        slot = registry[error_type] = {}

        for model_name, path in model_paths_dict[error_type].items():
            try:
                # Normalise Windows separators before touching the filesystem
                path = path.replace('\\', '/')

                if os.path.exists(path):
                    print(f"Loading {error_type} {model_name}...")
                    booster = lgb.Booster(model_file=path)
                    print(f"✓ Model loaded successfully")
                    slot[model_name] = booster
                else:
                    print(f"✗ {error_type} {model_name}: File not found at {path}")
                    slot[model_name] = None

            except Exception as e:
                # Any failure is logged and recorded as None for this model
                print(f"✗ Failed to load {error_type} {model_name}: {str(e)}")
                slot[model_name] = None

    return registry

# Example usage
# Example usage
if __name__ == "__main__":
    # Saved model files, keyed first by error type and then by model name.
    # Paths use forward slashes (raw strings) to avoid Unicode escape errors on Windows.
    model_paths_dict = {
        'bs': {
            'LGB1': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB1_bs_model.txt",
            'LGB2': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB2_bs_model.txt",
        },
        'ahbs': {
            'LGB1': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB1_ahbs_model.txt",
            'LGB2': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB2_ahbs_model.txt",
        },
        'cw': {
            'LGB1': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB1_cw_model.txt",
            'LGB2': r"C:/Users/maxva/OneDrive - Tilburg University/Msc. Data Science/Master Thesis/Code/Models/LigthGBM Forecast/Firm Characteristics/LGB2_cw_model.txt",
        },
    }

    # Load every model; entries that fail to load come back as None
    loaded_models = load_lgb_models(model_paths_dict)

    # One status line per (error type, model) pair
    print("\n=== Model Loading Summary ===")
    for error_type, models_by_name in loaded_models.items():
        for model_name, model in models_by_name.items():
            if model is None:
                status = "✗ Failed"
            else:
                status = "✓ Loaded"
            print(f"{error_type} {model_name}: {status}")

Loading bs LGB1...
✓ Model loaded successfully
Loading bs LGB2...
✓ Model loaded successfully
Loading ahbs LGB1...
✓ Model loaded successfully
Loading ahbs LGB2...
✓ Model loaded successfully
Loading cw LGB1...
✓ Model loaded successfully
Loading cw LGB2...
✓ Model loaded successfully

=== Model Loading Summary ===
bs LGB1: ✓ Loaded
bs LGB2: ✓ Loaded
ahbs LGB1: ✓ Loaded
ahbs LGB2: ✓ Loaded
cw LGB1: ✓ Loaded
cw LGB2: ✓ Loaded


In [41]:
test_data_per_horizon = {}
import joblib

# Horizon scored in this cell. Set explicitly: relying on the `horizon` variable
# leaked from the data-loading loop would point at the LAST horizon loaded and
# overwrite that horizon's test set with the horizon-1 results.
horizon = 1

# Scaler fitted at training time.
# NOTE(review): joblib.load unpickles the file - only load files you trust.
scaler_path = r"C:\Users\maxva\OneDrive - Tilburg University\Msc. Data Science\Master Thesis\Code\Models\LigthGBM Forecast\Firm Characteristics\scaler.pkl"
scaler = joblib.load(scaler_path)

# 1) Copy base test set for the chosen horizon
base_df = test_data[horizon].copy()

# 2) Define features (remove target + metadata columns). Prediction/corrected
#    columns from an earlier run of this cell are excluded too, so re-running
#    the cell yields the same feature list.
exclude_cols = [
    'iv_bs_error','iv_ahbs','iv_ahbs_error','iv_bs','iv_cw','iv_cw_error',
    'impl_volatility','moneyness_category','ID','date','new_id',
    'open_interest','option_price','prediction_horizon',
    'test_date','train_date','volume', 'iv_ahbs_corrected_LGB1', 'iv_ahbs_corrected_LGB2', 'iv_ahbs_pred_LGB1', 'iv_ahbs_pred_LGB2',
    'iv_bs_corrected_LGB1', 'iv_bs_corrected_LGB2',
    'iv_bs_pred_LGB1', 'iv_bs_pred_LGB2', 'iv_cw_corrected_LGB1', 'iv_cw_corrected_LGB2', 'iv_cw_pred_LGB1', 'iv_cw_pred_LGB2'
]
feature_columns = [c for c in base_df.columns if c not in exclude_cols]

# 3) Add model predictions and corrections
df = base_df.copy()
for error_type in ['bs', 'ahbs', 'cw']:
    print(f"\n-- Predicting corrections for {error_type.upper()} --")

    # Keep only the models that actually loaded (failed loads are stored as None)
    models_for_type = {
        name: model
        for name, model in loaded_models[error_type].items()
        if model is not None
    }

    # The helper returns a new frame with the pred/corrected columns appended
    df = predict_and_add_to_test_data(
        models=models_for_type,
        test_data=df,
        scaler=scaler,
        feature_columns=feature_columns,
        error_type=error_type
    )

    # Preview the new columns for the models that were scored
    cols = ['impl_volatility', f'iv_{error_type}']
    for model_name in models_for_type:
        cols += [f'iv_{error_type}_pred_{model_name}', f'iv_{error_type}_corrected_{model_name}']
    print(df[cols].head(3))

# 4) Store final result under the same horizon the base set was read from
test_data[horizon] = df.copy()
print(f"\n✓ test_data[{horizon}] updated.")



-- Predicting corrections for BS --
        impl_volatility     iv_bs  iv_bs_pred_LGB1  iv_bs_corrected_LGB1  \
990379         0.333655  0.466571        -0.109720              0.356850   
990380         0.328039  0.466571        -0.119451              0.347120   
990381         0.320756  0.466571        -0.127325              0.339246   

        iv_bs_pred_LGB2  iv_bs_corrected_LGB2  
990379        -0.112547              0.354023  
990380        -0.116486              0.350085  
990381        -0.119476              0.347094  

-- Predicting corrections for AHBS --
        impl_volatility   iv_ahbs  iv_ahbs_pred_LGB1  iv_ahbs_corrected_LGB1  \
990379         0.333655  0.466143          -0.100951                0.365192   
990380         0.328039  0.463298          -0.104567                0.358731   
990381         0.320756  0.460615          -0.105047                0.355569   

        iv_ahbs_pred_LGB2  iv_ahbs_corrected_LGB2  
990379          -0.093235                0.372908  
99

In [48]:
# Write the current `df` to a CSV in the working directory, omitting the row index.
# NOTE(review): presumably `df` is the prediction-augmented horizon-1 frame from the
# previous cell (per the file name) - confirm, since this cell's execution count is out of order.
df.to_csv('test_data_horizon_1_for_graph.csv', index=False)


In [43]:
import matplotlib.pyplot as plt

def plot_extended_model_comparisons(df):
    models = {
        'BS': ('iv_bs', 'iv_bs_corrected_NN3', 'iv_bs_corrected_LGB1'),
        'AHBS': ('iv_ahbs', 'iv_ahbs_corrected_NN3', 'iv_ahbs_corrected_LGB1'),
        'Carr and Wu': ('iv_cw', 'iv_cw_corrected_NN3', 'iv_cw_corrected_LGB1')
    }

    fig, axs = plt.subplots(3, len(models), figsize=(5 * len(models), 12))

    for idx, (model_name, (base_col, nn3_col, lgb1_col)) in enumerate(models.items()):
        # Top row: Base model vs Market
        axs[0, idx].scatter(df['moneyness'], df['impl_volatility'], color='blue', label='Market IV', s=10, alpha=0.6)
        axs[0, idx].scatter(df['moneyness'], df[base_col], color='orange', label=f'{model_name} IV', s=10, alpha=0.6)
        axs[0, idx].set_title(f'{model_name}')
        axs[0, idx].set_xlabel('Moneyness')
        axs[0, idx].set_ylabel('Implied Volatility')
        axs[0, idx].legend()

        # Middle row: NN3 Corrected model vs Market
        axs[1, idx].scatter(df['moneyness'], df['impl_volatility'], color='blue', label='Market IV', s=10, alpha=0.6)
        axs[1, idx].scatter(df['moneyness'], df[nn3_col], color='orange', label=f'{model_name} + NN3', s=10, alpha=0.6)
        axs[1, idx].set_title(f'{model_name} + NN3')
        axs[1, idx].set_xlabel('Moneyness')
        axs[1, idx].set_ylabel('Implied Volatility')
        axs[1, idx].legend()

        # Bottom row: LGB1 Corre
