In [1]:
import os
import pickle
from pathlib import Path

import lightgbm as lgb
import numpy as np
import plotly.express as px
import polars as pl
from scipy.stats import pearsonr

# Root directory of the dataset. The previous hard-coded location is kept as
# the default so existing runs behave the same; set the JS_DATA_DIR environment
# variable to point the notebook at another copy without editing the code.
data_path = os.environ.get("JS_DATA_DIR", "/Users/Sigrid/Desktop/JS")
feature_tags = pl.read_csv(Path(data_path, 'features.csv'))

# Fraction of each training partition (taken from the start of the frame) used
# for fitting; the remaining rows of that partition are used for validation.
frac_train = 0.8
# Partition ids that are read one after another by the training loop.
train_raw_data_num = ["5", "6", "7", "8", "9"]
# A partition that is never trained on, used only for testing.
test_raw_data_num = "1"
test_data = pl.read_parquet(Path(data_path, "train.parquet", f"partition_id={test_raw_data_num}", "part-0.parquet"))

def sample_weighted_r2(y_pred, y_truth, weight):
    """
    Sample-weighted zero-mean R-squared.

    Computes ``1 - sum(w * (y_truth - y_pred)**2) / sum(w * y_truth**2)``.
    The denominator uses the raw second moment of ``y_truth`` (no mean is
    subtracted), which is what makes the score "zero-mean".

    All inputs are flattened to 1-D float arrays before the arithmetic so that
    a column vector of shape (N, 1) paired with a vector of shape (N,) is
    compared element by element instead of being broadcast to an (N, N)
    matrix. Plain Python sequences are accepted as well.

    Args:
        y_pred: Array-like of predicted values.
        y_truth: Array-like of true values.
        weight: Array-like of sample weights, or None for uniform weights.

    Returns:
        The weighted zero-mean R-squared as a float. A zero denominator
        (all-zero weights or targets) is not guarded against; the result
        then follows numpy's division rules (nan or inf).

    Raises:
        ValueError: If the flattened inputs have incompatible lengths.
    """
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    y_truth = np.asarray(y_truth, dtype=np.float64).ravel()

    # Fall back to uniform weights when none are supplied.
    if weight is None:
        weight = np.ones_like(y_pred)
    else:
        weight = np.asarray(weight, dtype=np.float64).ravel()

    residual_ss = np.sum(weight * (y_truth - y_pred) ** 2)
    total_ss = np.sum(weight * y_truth ** 2)

    return 1 - residual_ss / total_ss

# LightGBM configuration shared by every training call in this notebook:
# a gradient-boosted-tree regressor that reports RMSE on its evaluation sets.
params_rmse = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,          # leaf budget per tree
    learning_rate=0.05,
    feature_fraction=0.9,   # share of features sampled per tree
)

In [2]:
# NOTE(review): the two lines below look like scratch notes mapping a
# time-decay rate to the score it produced -- confirm before relying on them.
## 0.025: 0.0038
## 0.01: 0.0042

# Rate of the exponential down-weighting applied to older dates in a partition.
time_decay_rate = 0.01
# Boosting rounds added per partition, and the early-stopping patience.
rounds_per_partition = 10
early_stopping_patience = 5

train_feature_list = ["time_id", "symbol_id"] + [f"feature_{idx:02d}" for idx in range(79)]

# Base name WITHOUT an extension: the extensions are appended below. It used to
# end in '.txt', which produced 'lgbm_....txt.txt' and 'eval_....txt.csv'.
model_name = 'jane_lgbm_baseline'
model_file = 'lgbm_' + model_name + '.txt'
eval_file = 'eval_' + model_name + '.csv'

# initialize the model; each partition continues training from the previous one
model = None

# `evals_result` ends up holding the curves of the last partition only, while
# `training_loss` / `validation_loss` accumulate the curves of every partition.
evals_result = {}
training_loss = []
validation_loss = []

for i in train_raw_data_num:
    training_data = pl.read_parquet(Path(data_path, "train.parquet", f"partition_id={i}", "part-0.parquet"))

    # Time decay: rows on the partition's latest date keep their weight, and
    # rows d dates earlier are scaled by exp(-time_decay_rate * d).
    dates_before_latest = pl.col('date_id').max() - pl.col('date_id')
    training_data = (
        training_data
        .with_columns((-time_decay_rate * dates_before_latest).exp().alias('time_decay'))
        .with_columns((pl.col('weight') * pl.col('time_decay')).alias('weight_new'))
        .rename({'weight': 'weight_old', 'weight_new': 'weight'})
    )
    print("Size of training data (GB):", training_data.estimated_size("gb"))

    #################################################################################################
    ####################   Preprocess the training data and select features   #######################
    #################################################################################################
    training_data = training_data.fill_null(0)
    training_data_subset = training_data.select([col for col in training_data.columns if col in train_feature_list])
    #################################################################################################
    label = training_data.select(pl.col("responder_6"))
    weight = training_data.select(pl.col("weight"))
    del training_data  # save memory

    # Split the data into training and validation sets (first rows train, rest validate).
    split_index = int(frac_train * training_data_subset.shape[0])
    # Labels and weights are flattened to 1-D before being handed to LightGBM.
    training_data_loader = lgb.Dataset(
        training_data_subset[:split_index],
        label=label[:split_index].to_numpy().ravel(),
        weight=weight[:split_index].to_numpy().ravel(),
    )
    validate_data_loader = lgb.Dataset(
        training_data_subset[split_index:],
        label=label[split_index:].to_numpy().ravel(),
        reference=training_data_loader,
        weight=weight[split_index:].to_numpy().ravel(),
    )

    # record_evaluation resets the dict it is given on every lgb.train call, so
    # a fresh dict is used per partition and its curves are appended to the
    # running lists; otherwise only the last partition would survive.
    partition_evals = {}
    model = lgb.train(
        params_rmse,
        training_data_loader,
        init_model=model,
        num_boost_round=rounds_per_partition,
        valid_sets=[training_data_loader, validate_data_loader],
        valid_names=['train', 'val'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=early_stopping_patience),
            lgb.record_evaluation(partition_evals),
        ],
    )
    training_loss.extend(partition_evals['train']['rmse'])
    validation_loss.extend(partition_evals['val']['rmse'])
    evals_result = partition_evals

model.save_model(model_file)
# One row per recorded boosting round, across all partitions in training order.
eval_df = pl.DataFrame({'train': training_loss, 'val': validation_loss})
eval_df.write_csv(eval_file)

test_data = pl.read_parquet(Path(data_path, "train.parquet", f"partition_id={test_raw_data_num}", "part-0.parquet"))
print("Size of test data (GB):", test_data.estimated_size("gb"))
# Sort first, then derive the feature frame from the sorted frame, so that the
# predictions line up row by row with the labels and weights read below.
test_data = test_data.sort(['symbol_id', 'date_id', 'time_id'])
# Same null handling as the training features (nulls replaced by 0).
test_data_subset = test_data.select(
    [col for col in test_data.columns if col in train_feature_list]
).fill_null(0)

# load saved model to make predictions
model = lgb.Booster(model_file=model_file)
y_pred = model.predict(test_data_subset)
score = sample_weighted_r2(
    y_pred,
    test_data.get_column("responder_6").to_numpy(),
    test_data.get_column("weight").to_numpy(),
)
print(score)
print(y_pred)

Size of training data (GB): 1.8544514616951346




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.339748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19610
[LightGBM] [Info] Number of data points in the train set: 4278560, number of used features: 81
[LightGBM] [Info] Start training from score -0.006785
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10]	train's rmse: 0.845973	val's rmse: 0.846929
Size of training data (GB): 2.149294967763126




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.614510 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19617
[LightGBM] [Info] Number of data points in the train set: 4963129, number of used features: 81
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	train's rmse: 0.818328	val's rmse: 1.06698
Size of training data (GB): 2.1955207837745547
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.709914 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19615
[LightGBM] [Info] Number of data points in the train set: 5068448, number of used features: 81
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[30]	train's rmse: 0.78408	val's rmse: 0.820134
Size of training data (GB): 2.126415732316673
[LightGBM] 