# Load data

In [1]:
import pandas as pd

df_dataset = pd.read_csv("preprocessed_dataset.csv",index_col=0).drop(columns=["5d_pct_price_var","10d_pct_price_var"])

# Train - test separation

In [2]:
df_train = df_dataset.iloc[:-900]
df_test = df_dataset.iloc[-900:]

x_train = df_train.drop(columns=["1d_pct_price_var","observation_date"])
y_train = df_train["1d_pct_price_var"]
x_test = df_test.drop(columns=["1d_pct_price_var","observation_date"])
y_test = df_test["1d_pct_price_var"]

# Model Hyperparameters Selection (sul mio pc ci mette 10 minuti)

In [3]:
# !pip install lightgbm
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


estimator = LGBMRegressor(verbose=-1)
param_grid = {
    # "n_estimators": [5, 10, 50,100,500],
    "n_estimators": [5,100,500],
    "max_depth": [5,7,10],
    # "min_child_samples": [5, 10, 20, 50], 
    "min_child_samples": [40, 50, 100], 
    "learning_rate": [0.01, 0.02,0.05,0.2],
    "lambda_l2":[0, 1, 2, 3, 5, 10]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    # scoring='neg_mean_squared_error',
    cv=4,
    verbose=1
)

grid_search.fit(x_train, y_train)

print("LGBM")
print("Best score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)


Fitting 4 folds for each of 648 candidates, totalling 2592 fits
LGBM
Best score: -0.0014535073539879984
Best parameters: {'lambda_l2': 10, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_samples': 50, 'n_estimators': 5}


In [9]:
grid_search.score(x_test,y_test)

-0.00930950219782245

In [10]:
import numpy as np

def sign_accuracy(y_true, y_pred):
    return np.mean(np.sign(y_true) == np.sign(y_pred))

train_sign_accuracy = sign_accuracy(y_train,grid_search.predict(x_train))
test_sign_accuracy = sign_accuracy(y_test,grid_search.predict(x_test))
print("Train dataset performance: ",train_sign_accuracy)
print("Test dataset performance: ",test_sign_accuracy)

Train dataset performance:  0.6345833333333334
Test dataset performance:  0.49666666666666665


In [6]:
def sign_accuracy(y_true, y_pred):
    return np.mean(np.sign(y_true) == np.sign(y_pred))

# 1. Get numeric predictions (already continuous from LightGBM)
train_preds = grid_search.predict(x_train)
test_preds = grid_search.predict(x_test)

# 2. Calculate 90th percentile threshold of abs(preds) on train set
train_abs_preds = np.abs(train_preds)
threshold = np.percentile(train_abs_preds, 90)

# 3. Mask for predictions with abs value above threshold
train_mask = train_abs_preds > threshold
test_mask = np.abs(test_preds) > threshold

# 4. Calculate sign accuracy for only the confident predictions
train_sign_accuracy = sign_accuracy(y_train[train_mask], train_preds[train_mask])
test_sign_accuracy = sign_accuracy(y_test[test_mask], test_preds[test_mask])

print("Train Sign Accuracy (Top 10% most confident preds):", train_sign_accuracy)
print("Test Sign Accuracy (Top 10% most confident preds):", test_sign_accuracy)


Train Sign Accuracy (Top 10% most confident preds): 0.7125
Test Sign Accuracy (Top 10% most confident preds): 0.5643564356435643


In [7]:
from scipy.stats import binom_test

# Example inputs
n_total = 900        # Total number of samples (adjust to your case)
n_correct = int(test_sign_accuracy * n_total)  # Number of correct sign predictions (52% of total)

# Perform one-sided binomial test (greater than 50%)
p_value = binom_test(n_correct, n_total, p=0.5, alternative='greater')

print(f"Sign Accuracy: {n_correct / n_total:.2%}")
print(f"P-value: {p_value:.5f}")
    

Sign Accuracy: 56.33%
P-value: 0.00008


  p_value = binom_test(n_correct, n_total, p=0.5, alternative='greater')


In [8]:
# params = {
#     "n_estimators": 10,
#     "max_depth": -1,
#     "min_child_samples": 10,
#     "learning_rate": 0.02,
#     "lambda_l2": 1
# }

# # Model for 5th percentile (Lower bound)
# model_5th = LGBMRegressor(
#     verbose=-1,
#     objective='quantile',
#     alpha=0.05,
#     **params
# )

# # Model for 95th percentile (Upper bound)
# model_95th = LGBMRegressor(
#     verbose=-1,
#     objective='quantile',
#     alpha=0.95,
#     **params
# )

# # Fit both models
# model_5th.fit(x_train, y_train)
# model_95th.fit(x_train, y_train)

# # Predict on test data (or any new data like x_test)
# pred_5th = model_5th.predict(x_test)
# pred_95th = model_95th.predict(x_test)

# # Example: show first 5 predictions
# print("5th percentile predictions:", pred_5th[:5])
# print("95th percentile predictions:", pred_95th[:5])