In [1]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from typing import List
import numpy as np

from utils import (
    TRAIN_TARGETS_PARQUET_FILE_PATH,
    FEATURES_DATA_FILE_PATH
)

TRAIN_START_DATE updated to: 2020-06-01, TRAIN_END_DATE updated to: 2024-07-05


In [2]:

# Historical targets: read the parquet file and parse 'date' into datetimes
# so it can be used as a join key below.
train_df = pd.read_parquet(TRAIN_TARGETS_PARQUET_FILE_PATH).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Full feature table: same datetime conversion of 'date' as for the targets.
full_features_df = pd.read_parquet(FEATURES_DATA_FILE_PATH).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Inner join on ('symbol', 'date'): only rows present in both tables are kept.
train_df = train_df.merge(full_features_df, on=['symbol', 'date'], how='inner')



# Model 1

In [3]:
# Discrete levels that model predictions are snapped to.
valid_targets = np.array([0, 0.25, 0.5, 0.75, 1])


def round_to_nearest_valid_target(y_pred, valid_targets):
    """Snap every prediction to the closest entry of ``valid_targets``.

    Args:
        y_pred: Array-like of numeric predictions (list, ndarray, pandas
            Series, ...). Any shape is accepted.
        valid_targets: Non-empty 1-D array-like of allowed values.

    Returns:
        np.ndarray with the same shape as ``y_pred`` whose entries are all
        taken from ``valid_targets``. On an exact tie the entry that appears
        first in ``valid_targets`` wins (same rule as the built-in ``min``).

    Raises:
        ValueError: If ``valid_targets`` is empty.
    """
    predictions = np.asarray(y_pred, dtype=float)
    targets = np.asarray(valid_targets)
    if targets.size == 0:
        raise ValueError("valid_targets must contain at least one value")

    # Broadcasting (..., 1) against (k,) yields one distance per
    # (prediction, target) pair without a Python-level loop.
    distances = np.abs(predictions[..., np.newaxis] - targets)

    # argmin returns the first minimum along the target axis, which keeps the
    # "first listed target wins" tie-breaking.
    return targets[distances.argmin(axis=-1)]


In [4]:
def generate_identifiers_training_features() -> List[str]:
    """Return the names of the identifier feature columns."""
    identifier_columns: List[str] = ['symbol_encoded', 'timestamp']
    return identifier_columns

def generate_market_data_training_features() -> List[str]:
    """Return the names of the base market-data feature columns.

    'close' is deliberately not listed: the training cell in this notebook
    uses ``train_df["close"]`` as the regression target.
    """
    market_columns: List[str] = ['size_factor', 'liquidity_factor']
    return market_columns

# Stems of the windowed market-data columns; the window size is appended as
# a "_<window>" suffix to form the actual column name.
_WINDOWED_FEATURE_STEMS = ('ma', 'lag', 'momentum', 'pct_chg', 'ema', 'close_lag')


def _windowed_market_features(window: int) -> List[str]:
    """Build the windowed market-data column names for the given window suffix."""
    return [f'{stem}_{window}' for stem in _WINDOWED_FEATURE_STEMS]


def generate_market_data_7_training_features() -> List[str]:
    """Return the windowed market-data feature columns with suffix ``_7``."""
    return _windowed_market_features(7)


def generate_market_data_14_training_features() -> List[str]:
    """Return the windowed market-data feature columns with suffix ``_14``."""
    return _windowed_market_features(14)


def generate_market_data_30_training_features() -> List[str]:
    """Return the windowed market-data feature columns with suffix ``_30``."""
    return _windowed_market_features(30)

def generate_sentiment_data_training_features() -> List[str]:
    """Return the names of the feature columns this notebook groups as 'sentiment'."""
    sentiment_columns: List[str] = ['is_active', 'is_open_source']
    return sentiment_columns

def generate_fear_greed_data_training_features() -> List[str]:
    """Return the names of the fear/greed feature columns.

    Only the ``f_g_bucket_interact_*`` columns are listed; the plain
    'fear_greed_bucket' column is currently disabled.
    """
    fear_greed_columns: List[str] = [
        'f_g_bucket_interact_ma_1',
        'f_g_bucket_interact_ema_1',
        'f_g_bucket_interact_close_lag_7',
    ]
    return fear_greed_columns


def generate_macro_indicators_training_features() -> List[str]:
    """Return the names of the macro-indicator feature columns."""
    macro_columns: List[str] = [
        'weighted_global_gdp',
        'overall_interest_rate',
        'interest_rate_trend',
        'overall_inflation_rate',
        'inflation_rate_trend',
    ]
    return macro_columns


In [5]:
# Model A: identifier columns plus the base market-data columns.
# (Sentiment columns are not part of A; they are first added in Model C.)
model_a_features = [
    *generate_identifiers_training_features(),
    *generate_market_data_training_features(),
]

# Model B: everything in A plus the 7/14/30 windowed market-data columns.
model_b_features = [
    *model_a_features,
    *generate_market_data_7_training_features(),
    *generate_market_data_14_training_features(),
    *generate_market_data_30_training_features(),
]

# Model C: everything in B plus the sentiment columns.
model_c_features = [
    *model_b_features,
    *generate_sentiment_data_training_features(),
]

# Model D: everything in C plus the macro-indicator columns.
model_d_features = [
    *model_c_features,
    *generate_macro_indicators_training_features(),
]

In [6]:
# Feature-set variants to compare; each one is trained and scored in turn.
models = {'A': model_a_features, 'B': model_b_features, 'C': model_c_features, 'D': model_d_features}

# Hyperparameters shared by every variant (loop-invariant, so defined once).
LGBM_PARAMS = dict(
    n_estimators=10000,  # upper bound only; early stopping decides the actual count
    learning_rate=0.01,
    max_depth=20,
    num_leaves=40,
    colsample_bytree=0.5,
    verbose=-1,
)

# Stop boosting once the validation L2 has not improved for this many rounds.
# Without this the eval_set passed to fit() was computed but never acted on,
# and all 10000 trees were always built for each of the four variants.
EARLY_STOPPING_ROUNDS = 200

for key, features in models.items():
    # NOTE(review): this is a random row split, while 'timestamp' is one of the
    # features. If rows are time-ordered observations, a chronological split may
    # be needed to avoid look-ahead leakage -- confirm.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_df[features], train_df["close"], test_size=0.2, random_state=42
    )

    model = lgb.LGBMRegressor(**LGBM_PARAMS)

    # NOTE(review): the same validation split drives early stopping and the
    # metrics printed below, so those metrics are slightly optimistic; use a
    # separate hold-out split if unbiased numbers are required.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='l2',
        callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)],
    )

    # Predict on the validation split (uses the best iteration found by early
    # stopping) and snap each prediction to the nearest allowed target level.
    # NOTE(review): the regression target is train_df["close"], yet predictions
    # are snapped to valid_targets (0..1) -- confirm "close" is the intended
    # target column and lives on that scale.
    y_pred = model.predict(X_valid)
    y_pred_rounded = round_to_nearest_valid_target(y_pred, valid_targets)

    # Error metrics are computed on the rounded predictions.
    mse = mean_squared_error(y_valid, y_pred_rounded)
    r2 = r2_score(y_valid, y_pred_rounded)
    mae = mean_absolute_error(y_valid, y_pred_rounded)
    nmse = mse / np.var(y_valid)  # Normalized Mean Squared Error

    print(f"Model {key} Performance Metrics:")
    print("-------------------=========================-----------------")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Normalized MSE (NMSE): {nmse}")
    print("-------------------=========================-----------------")

    # Side-by-side view of actual, raw predicted and rounded predicted values.
    print(f"Actual vs Predicted vs Rounded for Model {key}:")
    comparison_df = pd.DataFrame({
        'Actual': y_valid,
        'Predicted': y_pred,
        'Predicted_Rounded': y_pred_rounded
    })
    print(comparison_df.head(30))  # Print the first 30 rows for comparison
    print("-------------------=========================-----------------")
