# Load data

In [1]:
import pandas as pd

df_dataset = pd.read_csv("preprocessed_dataset.csv",index_col=0).drop(columns=["1d_pct_price_var","5d_pct_price_var"])

# Train - test separation and target variable discretization

In [2]:
import plotly.express as px

fig = px.histogram(df_dataset, x="10d_pct_price_var", nbins=50)
fig.update_layout(
    title="Distribution of 10-Days % Price Variation",
    xaxis_title="10d_pct_price_var",
    yaxis_title="Count",
    bargap=0.1
)
fig.show()


In [3]:
def var_class(x, alpha=0.01):
    if x < - alpha:
        return 0
    elif x > alpha:
        return 1
    else:
        return 2
df_dataset["target"] = df_dataset["10d_pct_price_var"].apply(var_class)

df_dataset.target.value_counts()

target
1    1567
0    1340
2     391
Name: count, dtype: int64

In [4]:
df_train = df_dataset.iloc[:-900]
df_test = df_dataset.iloc[-900:]

x_train = df_train.drop(columns=["target","observation_date"])
y_train = df_train["target"]
x_test = df_test.drop(columns=["target","observation_date"])
y_test = df_test["target"]

# Model Hyperparameters Selection

In [5]:
# from lightgbm import LGBMClassifier
# import optuna
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# import numpy as np
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)


# from sklearn.metrics import make_scorer, accuracy_score

# sign_accuracy = make_scorer(accuracy_score)

# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
#         "max_depth": trial.suggest_int("max_depth", 5, 15),
#         "num_leaves": trial.suggest_int("num_leaves", 10, 50, step=5),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
#         "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
#         "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
#         "lambda_l2": trial.suggest_float("lambda_l2", 0, 10),
#         "objective": "multiclass",
#         "num_class": 3,
#         "random_state": 0,
#         "device": "gpu",  
#         "verbose": -1
#     }

#     model = LGBMClassifier(**params)

#     cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
#     scores = cross_val_score(model, x_train, y_train, cv=cv, scoring=sign_accuracy, n_jobs=-1)

#     return np.mean(scores)


# # Run Optuna Study
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=20, show_progress_bar=True)

# print("\nLGBM Optuna Results")
# print("Best Sign Accuracy score (CV):", study.best_value)
# print("Best hyperparameters:", study.best_params)

In [6]:
from lightgbm import LGBMClassifier

params =  {'n_estimators': 300, 
           'max_depth': 11, 
           'num_leaves': 15, 
           'learning_rate': 0.026920262461271487, 
           'feature_fraction': 0.7163381486430572, 
           'bagging_fraction': 0.7761830653355526, 
           'bagging_freq': 5, 
           'min_child_samples': 85, 
           'lambda_l2': 6.215847075871736,
           'device':'gpu',
           'verbose': -1,
           'random_state': 0
           }
# lgbm_reg = LGBMClassifier(**study.best_params, device="gpu", verbose=-1, random_state=0)
lgbm_class = LGBMClassifier(**params)
lgbm_class.fit(x_train,y_train)


The LGBMClassifier or classes from which it inherits use `_get_tags` and `_more_tags`. Please define the `__sklearn_tags__` method, or inherit from `sklearn.base.BaseEstimator` and/or other appropriate mixins such as `sklearn.base.TransformerMixin`, `sklearn.base.ClassifierMixin`, `sklearn.base.RegressorMixin`, and `sklearn.base.OutlierMixin`. From scikit-learn 1.7, not defining `__sklearn_tags__` will raise an error.



In [7]:
import numpy as np

def sign_accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

train_preds = lgbm_class.predict(x_train)
test_preds = lgbm_class.predict(x_test)


train_sign_accuracy = sign_accuracy(y_train,train_preds)
test_sign_accuracy = sign_accuracy(y_test,test_preds)

print("Train dataset performance: ",train_sign_accuracy)
print("Test dataset performance: ",test_sign_accuracy)

Train dataset performance:  0.9995829858215179
Test dataset performance:  0.9977777777777778
