In [1]:
# NOTE(review): no versions are pinned here, so a fresh environment may resolve
# different releases than the ones this notebook was last run with — consider pinning.
%pip install lightgbm joblib

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from typing import List
import numpy as np
import joblib

from utils import (
    FEATURES_DATA_FILE_PATH,
    MODEL_FILE_PATH
)

TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-05


In [3]:
# Read the full feature dataset from parquet, then make sure the 'date'
# column holds pandas datetimes.
full_features_df = pd.read_parquet(FEATURES_DATA_FILE_PATH)
full_features_df['date'] = full_features_df['date'].pipe(pd.to_datetime)


# Model 1

In [4]:
# Discrete target levels that model predictions are snapped to before scoring.
valid_targets = np.array([0, 0.25, 0.5, 0.75, 1])


def round_to_nearest_valid_target(y_pred, valid_targets):
    """Snap each prediction to the closest entry of ``valid_targets``.

    Args:
        y_pred: Array-like of numeric predictions.
        valid_targets: Non-empty array-like of allowed target values.

    Returns:
        numpy.ndarray with the same shape as ``y_pred`` whose entries are all
        drawn from ``valid_targets``. When a prediction is equidistant from
        two targets, the one listed first in ``valid_targets`` wins.

    Raises:
        ValueError: If ``valid_targets`` is empty while ``y_pred`` is not.
    """
    targets = np.asarray(valid_targets)
    preds = np.asarray(y_pred)
    # Build a (..., n_targets) table of absolute distances in one vectorised
    # step; argmin returns the first minimum, which keeps the tie-breaking of
    # the previous per-element min() implementation.
    nearest = np.abs(preds[..., np.newaxis] - targets).argmin(axis=-1)
    return targets[nearest]


In [5]:
def generate_identifiers_training_features() -> List[str]:
    """Return the identifier feature columns: the encoded symbol and the timestamp."""
    identifier_columns = ['symbol_encoded', 'timestamp']
    return identifier_columns

def generate_market_data_training_features() -> List[str]:
    """Return the base market-data feature column names."""
    market_columns = ['size_factor', 'liquidity_factor', 'close']
    return market_columns

def generate_market_data_7_training_features() -> List[str]:
    """Return the market-data feature column names that carry the ``_7`` suffix."""
    stems = ('ma', 'lag', 'momentum', 'pct_chg', 'ema', 'close_lag')
    return [f'{stem}_7' for stem in stems]

def generate_market_data_14_training_features() -> List[str]:
    """Return the market-data feature column names that carry the ``_14`` suffix."""
    stems = ('ma', 'lag', 'momentum', 'pct_chg', 'ema', 'close_lag')
    return [f'{stem}_14' for stem in stems]

def generate_market_data_30_training_features() -> List[str]:
    """Return the market-data feature column names that carry the ``_30`` suffix."""
    stems = ('ma', 'lag', 'momentum', 'pct_chg', 'ema', 'close_lag')
    return [f'{stem}_30' for stem in stems]

def generate_sentiment_data_training_features() -> List[str]:
    """Return the feature column names this notebook groups under sentiment data."""
    sentiment_columns = ['is_active', 'is_open_source']
    return sentiment_columns

def generate_fear_greed_data_training_features() -> List[str]:
    """Return the fear/greed bucket column followed by its three interaction columns."""
    interaction_suffixes = ('ma_1', 'ema_1', 'close_lag_3')
    interaction_columns = [
        f'f_g_bucket_interact_{suffix}' for suffix in interaction_suffixes
    ]
    return ['fear_greed_bucket'] + interaction_columns


def generate_macro_indicators_training_features() -> List[str]:
    """Return the macro-indicator feature column names (GDP, interest rate, inflation)."""
    macro_columns = [
        'weighted_global_gdp',
        'overall_interest_rate',
        'interest_rate_trend',
        'overall_inflation_rate',
        'inflation_rate_trend',
    ]
    return macro_columns


In [6]:
from collections import defaultdict

def _build_candidate_feature_sets():
    """Assemble the feature list for every candidate model, keyed by experiment id.

    The A-series combines identifiers/market data with fear-greed and macro
    columns, the B-series adds windowed market features, C1 adds the sentiment
    columns, D1 adds the macro columns on top of that, and D2 uses an explicit
    subset of individual columns.
    """
    identifiers = generate_identifiers_training_features()
    market = generate_market_data_training_features()
    window_7 = generate_market_data_7_training_features()
    window_14 = generate_market_data_14_training_features()
    window_30 = generate_market_data_30_training_features()
    sentiment = generate_sentiment_data_training_features()
    fear_greed = generate_fear_greed_data_training_features()
    macro = generate_macro_indicators_training_features()

    # Shared prefixes; every entry below is a fresh list built from these.
    base = identifiers + market
    all_windows = base + window_7 + window_14 + window_30

    return {
        'A1': list(identifiers),
        'A2': list(market),
        'A3': list(base),
        'A4': base + fear_greed,
        'A5': base + macro,
        'A6': base + fear_greed + macro,

        'B1': base + window_30,
        'B2': base + window_7,
        'B3': base + window_14,
        'B4': base + ['ma_30'],
        'B5': base + ['momentum_30'],
        'B6': base + ['pct_chg_30'],
        'B7': list(all_windows),

        'C1': all_windows + sentiment,

        'D1': all_windows + sentiment + macro,
        'D2': base + [
            'pct_chg_30', 'close_lag_30', 'close_lag_14',
            'close_lag_7', 'overall_interest_rate',
            'momentum_30', 'f_g_bucket_interact_ma_1',
            'f_g_bucket_interact_close_lag_3',
        ],
    }


models = _build_candidate_feature_sets()

# Train one LightGBM regressor per candidate feature set, score it on a
# held-out split after snapping predictions to the valid target levels, and
# remember the best candidate under each metric.
#
# Both "best" scores start at the worst possible value so the first evaluated
# model always becomes the incumbent. R² can be negative, so 0 is not a safe
# floor: with it, best_model would stay None whenever no candidate beats 0.
best_mse_model_key = None
best_r2_model_key = None
best_model = None
best_r2 = float('-inf')
best_mse = float('inf')

# Dictionary to store cumulative feature importances
feature_importance_dict = defaultdict(float)

# The split depends only on the number of rows and the seed, so draw the row
# positions once and reuse them for every feature set instead of re-splitting
# inside the loop. Every candidate is therefore scored on the same rows.
# NOTE(review): rows are split at random although the frame carries a 'date'
# column and lag/rolling features — if that lets future rows inform training,
# the scores below are optimistic; confirm a random split is intended.
train_rows, valid_rows = train_test_split(
    np.arange(len(full_features_df)), test_size=0.2, random_state=42
)
y_train = full_features_df["target"].iloc[train_rows]
y_valid = full_features_df["target"].iloc[valid_rows]


for key, features in models.items():
    feature_frame = full_features_df[features]
    X_train = feature_frame.iloc[train_rows]
    X_valid = feature_frame.iloc[valid_rows]

    # NOTE(review): no early-stopping callback is passed to fit(), so each
    # candidate trains all n_estimators rounds; eval_set is only scored.
    model = lgb.LGBMRegressor(
        n_estimators=10000,
        learning_rate=0.01,
        max_depth=20,
        num_leaves=40,
        colsample_bytree=0.5,
        verbose=-1
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='l2'
    )

    # Make predictions and evaluate the model
    y_pred = model.predict(X_valid)
    y_pred_rounded = round_to_nearest_valid_target(y_pred, valid_targets)

    # Calculate error metrics for the model (all on the rounded predictions)
    mse = mean_squared_error(y_valid, y_pred_rounded)
    r2 = r2_score(y_valid, y_pred_rounded)
    mae = mean_absolute_error(y_valid, y_pred_rounded)
    nmse = mse / np.var(y_valid)  # Normalized Mean Squared Error

    print(f"Model {key} Performance Metrics:")
    print("-------------------=========================-----------------")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Normalized MSE (NMSE): {nmse}")
    print("-------------------=========================-----------------")

    # Print actual, predicted, and rounded predicted values
    print(f"Actual vs Predicted vs Rounded for Model {key}:")
    comparison_df = pd.DataFrame({
        'Actual': y_valid,
        'Predicted': y_pred,
        'Predicted_Rounded': y_pred_rounded
    })
    print(comparison_df.head(30))  # Print the first 30 rows for comparison
    print("-------------------=========================-----------------")

    # Collect and print feature importance scores; importances come back in
    # the same order as the columns the model was fitted on.
    print(f"Feature importances for Model {key}:")
    for feature, score in zip(features, model.feature_importances_):
        print(f"{feature}: {score}")
        feature_importance_dict[feature] += score  # Accumulate importance across models

    print("-------------------=========================-----------------\n")

    if mse < best_mse:
        best_mse = mse
        best_mse_model_key = key

    # The model kept for saving is the one with the highest R².
    if r2 > best_r2:
        best_r2 = r2
        best_r2_model_key = key
        best_model = model


# print the best model
print(f"MSE: The best model is {best_mse_model_key} with an MSE of {best_mse}")
print(f"R2: The best model is {best_r2_model_key} with an R2 of {best_r2}")


# Sort and print cumulative feature importances across all models
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("\nTop features by cumulative importance across all models:")
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")

Model A1 Performance Metrics:
Mean Squared Error (MSE): 0.07309198306863438
R-squared (R2): 0.030533354006834657
Mean Absolute Error (MAE): 0.21102843995205123
Normalized MSE (NMSE): 0.9694666870325365
Actual vs Predicted vs Rounded for Model A1:
         Actual  Predicted  Predicted_Rounded
18591      0.75   0.304872               0.25
609332     0.00   0.299923               0.25
573013     0.00   0.141417               0.25
836323     0.75   0.171823               0.25
987873     0.25   0.181233               0.25
906689     0.50   0.239710               0.25
764345     0.00   0.016053               0.00
964614     0.00   0.211029               0.25
153814     0.00   0.032390               0.00
65156      0.00   0.094973               0.00
599070     0.00   0.273582               0.25
1061563    0.50   0.094437               0.00
456798     0.75   0.137429               0.25
323325     0.50   0.208296               0.25
1008974    0.00   0.101277               0.00
1036514    0.75  

In [7]:
# Save the best model
# Guard against pickling a placeholder: best_model is still None when the
# selection loop never promoted a candidate.
if best_model is None:
    raise RuntimeError("No model was selected during evaluation; nothing to save.")
joblib.dump(best_model, MODEL_FILE_PATH)

['../data/model.pkl']