# Load data

In [21]:
import pandas as pd

df_dataset = pd.read_csv("preprocessed_dataset.csv",index_col=0).drop(columns=["1d_pct_price_var","5d_pct_price_var"])

# Train - test separation

In [22]:
df_train = df_dataset.iloc[:-900]
df_test = df_dataset.iloc[-900:]

x_train = df_train.drop(columns=["10d_pct_price_var","observation_date"])
y_train = df_train["10d_pct_price_var"]
x_test = df_test.drop(columns=["10d_pct_price_var","observation_date"])
y_test = df_test["10d_pct_price_var"]

# Model Hyperparameters Selection 

In [23]:
# !pip install lightgbm
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


estimator = LGBMRegressor(verbose=-1)
param_grid = {
    # "n_estimators": [5, 10, 50,100,500],
    "n_estimators": [5,100,500],
    "max_depth": [-1,5,10],
    # "min_child_samples": [5, 10, 20, 50], 
    "min_child_samples": [30,40,50,100,500], 
    "learning_rate": [0.01, 0.02,0.05,0.2],
    "lambda_l2":[0, 1, 2, 3]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='r2',
    cv=4,
    verbose=1
)

grid_search.fit(x_train, y_train)

print("LGBM")
print("Best score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)


Fitting 4 folds for each of 720 candidates, totalling 2880 fits
LGBM
Best score: -0.022985099737245274
Best parameters: {'lambda_l2': 0, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_samples': 100, 'n_estimators': 5}


In [24]:
grid_search.score(x_test,y_test)

-0.028405905155195033

In [27]:
import numpy as np

def sign_accuracy(y_true, y_pred):
    return np.mean(np.sign(y_true) == np.sign(y_pred))

train_sign_accuracy = sign_accuracy(y_train,grid_search.predict(x_train))
test_sign_accuracy = sign_accuracy(y_test,grid_search.predict(x_test))
print("Train dataset performance: ",train_sign_accuracy)
print("Test dataset performance: ",test_sign_accuracy)

Train dataset performance:  0.6333333333333333
Test dataset performance:  0.5855555555555556


In [26]:
from scipy.stats import binom_test

# Example inputs
n_total = 900        # Total number of samples (adjust to your case)
n_correct = int(test_sign_accuracy * n_total)  # Number of correct sign predictions (52% of total)

# Perform one-sided binomial test (greater than 50%)
p_value = binom_test(n_correct, n_total, p=0.5, alternative='greater')

print(f"Sign Accuracy: {n_correct / n_total:.2%}")
print(f"P-value: {p_value:.5f}")

Sign Accuracy: 58.56%
P-value: 0.00000


  p_value = binom_test(n_correct, n_total, p=0.5, alternative='greater')
