# Load data

In [2]:
import pandas as pd

df_dataset = pd.read_csv("preprocessed_dataset.csv",index_col=0).drop(columns=["5d_pct_price_var","10d_pct_price_var"])

# Train - test separation

In [3]:
df_train = df_dataset.iloc[:-900]
df_test = df_dataset.iloc[-900:]

x_train = df_train.drop(columns=["1d_pct_price_var","observation_date"])
y_train = df_train["1d_pct_price_var"]
x_test = df_test.drop(columns=["1d_pct_price_var","observation_date"])
y_test = df_test["1d_pct_price_var"]

# Model Hyperparameters Selection

In [4]:
# # !pip install lightgbm
# from lightgbm import LGBMRegressor
# from sklearn.model_selection import GridSearchCV
# import warnings
# warnings.filterwarnings('ignore', category=FutureWarning)


# estimator = LGBMRegressor(verbose=-1,device='gpu')
# param_grid = {
#     "n_estimators": [5,100,500,1000],
#     "max_depth": [5,7,10],
#     "max_bin":[50,80,100],
#     "num_leaves":[10,20,30],
#     "baggin_freq":[1,2,5],
#     "bagging_fraction":[0.2,0.5,0.8],
#     "feature_fraction":[0.2,0.5,0.8],
#     # "min_child_samples": [40, 50, 100], 
#     "learning_rate": [0.01, 0.02,0.05,0.2],
#     "lambda_l2":[0, 1, 2, 3, 5, 10]
# }

# grid_search = GridSearchCV(
#     estimator=estimator,
#     param_grid=param_grid,
#     scoring='r2',
#     # scoring='neg_mean_squared_error',
#     cv=4,
#     verbose=1
# )

# grid_search.fit(x_train, y_train)

# print("LGBM")
# print("Best score:", grid_search.best_score_)
# print("Best parameters:", grid_search.best_params_)


In [None]:
# !pip install optuna lightgbm

import optuna
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

def objective(trial):
    params = {
            "n_estimators": trial.suggest_int("n_estimators", 5, 500, step=50),  # Integer steps
            "max_depth": trial.suggest_int("max_depth", 5, 10),  # Integer range
            "max_bin": trial.suggest_int("max_bin", 50, 100, step=10),  # Integer step
            "num_leaves": trial.suggest_int("num_leaves", 10, 35, step=5),  # Integer step
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),  # Integer range
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9),  # Continuous
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9),  # Continuous
            "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),  # Integer range
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 10),  # Integer range
            "device": "gpu",  # Always use GPU
            "verbose": -1
            }
    # params = {
    #     "n_estimators": trial.suggest_categorical("n_estimators", [5, 100, 500]),
    #     "max_depth": trial.suggest_categorical("max_depth", [5, 7, 10]),
    #     "max_bin": trial.suggest_categorical("max_bin", [50, 80, 100]),
    #     "num_leaves": trial.suggest_categorical("num_leaves", [10, 20, 30]),
    #     "bagging_freq": trial.suggest_categorical("bagging_freq", [1, 2, 5]),
    #     "bagging_fraction": trial.suggest_categorical("bagging_fraction", [0.2, 0.5, 0.8]),
    #     "feature_fraction": trial.suggest_categorical("feature_fraction", [0.2, 0.5, 0.8]),
    #     "min_child_samples": trial.suggest_categorical("min_child_samples", [40, 50, 100]),
    #     "learning_rate": trial.suggest_categorical("learning_rate", [0.02, 0.05, 0.2]),
    #     "lambda_l2": trial.suggest_categorical("lambda_l2", [0, 3, 7, 10]),
    #     "device": "gpu",  # Use GPU
    #     "verbose": -1
    # }
    
    model = LGBMRegressor(**params)
    model.fit(x_train,y_train)
    # # Use 4-fold CV with R² scoring
    # scores = cross_val_score(model, x_train, y_train, cv=4, scoring='r2', n_jobs=-1)
    # return np.mean(scores)
    score = np.mean(np.sign(model.predict(x_train)) == np.sign(y_train))
    
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, show_progress_bar=True)

print("\nLGBM Optuna Results")
print("Best CV R2 score:", study.best_value)
print("Best hyperparameters:", study.best_params)


[I 2025-07-02 16:49:33,700] A new study created in memory with name: no-name-a05d4464-a6e8-44ce-8138-38aaada75556


  0%|          | 0/20 [00:00<?, ?it/s]



[I 2025-07-02 16:56:38,235] Trial 0 finished with value: 0.6654166666666667 and parameters: {'n_estimators': 55, 'max_depth': 8, 'max_bin': 60, 'num_leaves': 20, 'bagging_freq': 5, 'bagging_fraction': 0.6233993911375824, 'feature_fraction': 0.28940831420336854, 'min_child_samples': 90, 'learning_rate': 0.06998629355893156, 'lambda_l2': 4}. Best is trial 0 with value: 0.6654166666666667.




[I 2025-07-02 17:28:56,851] Trial 1 finished with value: 0.7920833333333334 and parameters: {'n_estimators': 305, 'max_depth': 8, 'max_bin': 70, 'num_leaves': 15, 'bagging_freq': 1, 'bagging_fraction': 0.602702235314461, 'feature_fraction': 0.219872369519919, 'min_child_samples': 56, 'learning_rate': 0.0576769493759758, 'lambda_l2': 7}. Best is trial 1 with value: 0.7920833333333334.




In [6]:
from lightgbm import LGBMRegressor

params =  {'n_estimators': 305, 
           'max_depth': 8, 
           'max_bin': 70, 
           'num_leaves': 15, 
           'bagging_freq': 1, 
           'bagging_fraction': 0.602702235314461, 
           'feature_fraction': 0.219872369519919, 
           'min_child_samples': 56, 
           'learning_rate': 0.0576769493759758, 
           'lambda_l2': 7,
           'device':'gpu',
           'verbose': -1}

lgbm_reg = LGBMRegressor(**params)
lgbm_reg.fit(x_train,y_train)



In [7]:
import numpy as np

def sign_accuracy(y_true, y_pred):
    return np.mean(np.sign(y_true) == np.sign(y_pred))

train_sign_accuracy = sign_accuracy(y_train,lgbm_reg.predict(x_train))
test_sign_accuracy = sign_accuracy(y_test,lgbm_reg.predict(x_test))
# train_sign_accuracy = sign_accuracy(y_train,grid_search.predict(x_train))
# test_sign_accuracy = sign_accuracy(y_test,grid_search.predict(x_test))
print("Train dataset performance: ",train_sign_accuracy)
print("Test dataset performance: ",test_sign_accuracy)

Train dataset performance:  0.7995833333333333
Test dataset performance:  0.46555555555555556


In [8]:
def sign_accuracy(y_true, y_pred):
    return np.mean(np.sign(y_true) == np.sign(y_pred))

# 1. Get numeric predictions (already continuous from LightGBM)
train_preds = grid_search.predict(x_train)
test_preds = grid_search.predict(x_test)

# 2. Calculate 90th percentile threshold of abs(preds) on train set
train_abs_preds = np.abs(train_preds)
threshold = np.percentile(train_abs_preds, 90)

# 3. Mask for predictions with abs value above threshold
train_mask = train_abs_preds > threshold
test_mask = np.abs(test_preds) > threshold

# 4. Calculate sign accuracy for only the confident predictions
train_sign_accuracy = sign_accuracy(y_train[train_mask], train_preds[train_mask])
test_sign_accuracy = sign_accuracy(y_test[test_mask], test_preds[test_mask])

print("Train Sign Accuracy (Top 10% most confident preds):", train_sign_accuracy)
print("Test Sign Accuracy (Top 10% most confident preds):", test_sign_accuracy)


NameError: name 'grid_search' is not defined

In [None]:
from scipy.stats import binom_test

# Example inputs
n_total = 900        # Total number of samples (adjust to your case)
n_correct = int(test_sign_accuracy * n_total)  # Number of correct sign predictions (52% of total)

# Perform one-sided binomial test (greater than 50%)
p_value = binom_test(n_correct, n_total, p=0.5, alternative='greater')

print(f"Sign Accuracy: {n_correct / n_total:.2%}")
print(f"P-value: {p_value:.5f}")
    

In [None]:
# params = {
#     "n_estimators": 10,
#     "max_depth": -1,
#     "min_child_samples": 10,
#     "learning_rate": 0.02,
#     "lambda_l2": 1
# }

# # Model for 5th percentile (Lower bound)
# model_5th = LGBMRegressor(
#     verbose=-1,
#     objective='quantile',
#     alpha=0.05,
#     **params
# )

# # Model for 95th percentile (Upper bound)
# model_95th = LGBMRegressor(
#     verbose=-1,
#     objective='quantile',
#     alpha=0.95,
#     **params
# )

# # Fit both models
# model_5th.fit(x_train, y_train)
# model_95th.fit(x_train, y_train)

# # Predict on test data (or any new data like x_test)
# pred_5th = model_5th.predict(x_test)
# pred_95th = model_95th.predict(x_test)

# # Example: show first 5 predictions
# print("5th percentile predictions:", pred_5th[:5])
# print("95th percentile predictions:", pred_95th[:5])