# Creating the Model for our Predictions

In [1]:
#For our Predictions we'll be Using a LightGBM

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Importing necessary libraries

In [3]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split as TTS, GridSearchCV as GCV, TimeSeriesSplit as TSS
from sklearn.preprocessing import StandardScaler as SS
from sklearn.metrics import mean_absolute_error as MAE

from datetime import datetime
import time
import ntplib

import sys
sys.path.append("../scripts/")

import path

# Importing Features Data

In [4]:
FeaturesTargets = pd.read_parquet(path.FEATURES_DATA_DIR / "BTC-USD_FeaturesData_From2021-12-14 00:00:00+00:00_To2024-01-09 23:00:00+00:00.parquet")
FeaturesTargets

Unnamed: 0,Close_336_Hours_Ago,Close_335_Hours_Ago,Close_334_Hours_Ago,Close_333_Hours_Ago,Close_332_Hours_Ago,Close_331_Hours_Ago,Close_330_Hours_Ago,Close_329_Hours_Ago,Close_328_Hours_Ago,Close_327_Hours_Ago,...,Close_8_Hours_Ago,Close_7_Hours_Ago,Close_6_Hours_Ago,Close_5_Hours_Ago,Close_4_Hours_Ago,Close_3_Hours_Ago,Close_2_Hours_Ago,Close_1_Hours_Ago,ActualClose,ActualDate
0,47022.75,46889.47,47052.39,46977.81,47017.01,46709.76,46689.60,47009.70,47478.33,46847.85,...,51492.41,51886.65,51714.12,51218.06,51258.94,50991.45,51039.92,50717.77,50369.39,2021-12-28 00:00:00+00:00
1,46889.47,47052.39,46977.81,47017.01,46709.76,46689.60,47009.70,47478.33,46847.85,47365.26,...,51886.65,51714.12,51218.06,51258.94,50991.45,51039.92,50717.77,50369.39,49838.69,2021-12-28 01:00:00+00:00
2,47052.39,46977.81,47017.01,46709.76,46689.60,47009.70,47478.33,46847.85,47365.26,47628.67,...,51714.12,51218.06,51258.94,50991.45,51039.92,50717.77,50369.39,49838.69,49794.92,2021-12-28 02:00:00+00:00
3,46977.81,47017.01,46709.76,46689.60,47009.70,47478.33,46847.85,47365.26,47628.67,47526.00,...,51218.06,51258.94,50991.45,51039.92,50717.77,50369.39,49838.69,49794.92,49843.51,2021-12-28 03:00:00+00:00
4,47017.01,46709.76,46689.60,47009.70,47478.33,46847.85,47365.26,47628.67,47526.00,47180.14,...,51258.94,50991.45,51039.92,50717.77,50369.39,49838.69,49794.92,49843.51,49077.57,2021-12-28 04:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17827,42099.94,42122.45,42337.53,42471.06,42515.53,42355.70,42459.92,42232.84,42242.60,42297.37,...,46643.78,46193.87,46858.17,46574.36,46876.36,46921.73,46738.77,46657.29,46890.10,2024-01-09 19:00:00+00:00
17828,42122.45,42337.53,42471.06,42515.53,42355.70,42459.92,42232.84,42242.60,42297.37,42454.83,...,46193.87,46858.17,46574.36,46876.36,46921.73,46738.77,46657.29,46890.10,46651.31,2024-01-09 20:00:00+00:00
17829,42337.53,42471.06,42515.53,42355.70,42459.92,42232.84,42242.60,42297.37,42454.83,42425.66,...,46858.17,46574.36,46876.36,46921.73,46738.77,46657.29,46890.10,46651.31,45419.45,2024-01-09 21:00:00+00:00
17830,42471.06,42515.53,42355.70,42459.92,42232.84,42242.60,42297.37,42454.83,42425.66,42467.26,...,46574.36,46876.36,46921.73,46738.77,46657.29,46890.10,46651.31,45419.45,46278.06,2024-01-09 22:00:00+00:00


# Splitting the Data

In [5]:
Features = np.array(FeaturesTargets.drop(["ActualClose", "ActualDate"], axis=1, inplace=False))
Targets = np.array(FeaturesTargets["ActualClose"])

xTrain, xTest, yTrain, yTest = TTS(Features, Targets, test_size=0.2)

print(f"{xTrain.shape = }")
print(f"{yTrain.shape = }")
print(f"{xTest.shape = }")
print(f"{yTest.shape = }")

xTrain.shape = (14265, 336)
yTrain.shape = (14265,)
xTest.shape = (3567, 336)
yTest.shape = (3567,)


# Scaling the Data

In [6]:
Scaler = SS()

ScaledxTrain = Scaler.fit_transform(xTrain)

# Creating the Model and searching for Best Parameters

In [7]:
import optuna

#Given a Set of Hyperparameters it Trains a Model and Computes an Avg Validation Error Based on TimeSeriesSplit

def Objective(T:optuna.trial.Trial) -> float:
    
    Hyperparams = {"metric":"mae",
                   "verbose":-1,
                   "num_leaves":T.suggest_int("num_leaves", 2, 256),
                   "feature_fraction":T.suggest_float("feature_fraction", 0.2, 1.0),
                   "bagging_fraction":T.suggest_float("bagging_fraction", 0.2, 1.0),
                   "min_child_samples":T.suggest_int("min_child_samples", 3, 100),
                  }
    
    tss = TSS(n_splits=2)
    Scores = []
    
    for trainIndex, valIndex in tss.split(ScaledxTrain):
        
        #Split Data for Training and Validation
        xTrain_, xVal_ = ScaledxTrain[trainIndex, :], ScaledxTrain[valIndex, :]
        yTrain_, yVal_ = yTrain[trainIndex], yTrain[valIndex]
        
        #Train the Model
        LGB = lgb.LGBMRegressor(**Hyperparams)
        LGB.fit(xTrain_, yTrain_)
        
        #Evaluate the Model
        yPred = LGB.predict(xVal_)
        mae = MAE(yVal_, yPred)
        
        Scores.append(mae)
        
    #Return Avg Score
    return np.array(Scores).mean()

In [8]:
Study = optuna.create_study(direction="minimize")
Study.optimize(Objective, n_trials=5)

[I 2024-01-16 14:14:37,618] A new study created in memory with name: no-name-2d33cc33-b2f2-4119-ab23-cb6bf19e0ee0
[I 2024-01-16 14:14:59,626] Trial 0 finished with value: 123.04068849142773 and parameters: {'num_leaves': 239, 'feature_fraction': 0.8052265999042525, 'bagging_fraction': 0.33311487892502545, 'min_child_samples': 86}. Best is trial 0 with value: 123.04068849142773.
[I 2024-01-16 14:15:12,849] Trial 1 finished with value: 120.17991878429811 and parameters: {'num_leaves': 153, 'feature_fraction': 0.4550515204765634, 'bagging_fraction': 0.6417867018947867, 'min_child_samples': 56}. Best is trial 1 with value: 120.17991878429811.
[I 2024-01-16 14:15:21,060] Trial 2 finished with value: 123.81168723881595 and parameters: {'num_leaves': 78, 'feature_fraction': 0.49111933548545494, 'bagging_fraction': 0.850438174614248, 'min_child_samples': 90}. Best is trial 1 with value: 120.17991878429811.
[I 2024-01-16 14:15:24,563] Trial 3 finished with value: 138.6885165283109 and parameter

In [9]:
BestParams = Study.best_trial.params
BestParams

{'num_leaves': 153,
 'feature_fraction': 0.4550515204765634,
 'bagging_fraction': 0.6417867018947867,
 'min_child_samples': 56}

In [10]:
BestModel = lgb.LGBMRegressor(**BestParams)
BestModel.fit(ScaledxTrain, yTrain)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85680
[LightGBM] [Info] Number of data points in the train set: 14265, number of used features: 336
[LightGBM] [Info] Start training from score 28828.090965


In [11]:
Preds = BestModel.predict(Scaler.transform(xTest))
testMae = MAE(yTest, Preds)
print(f"{testMae = :.4f}")

testMae = 109.8613


# Dumping Model and Scaler to Disk

In [12]:
import joblib
joblib.dump(BestModel, path.MODEL_DIR / "Model.pkl")
joblib.dump(Scaler, path.MODEL_DIR / "Scaler.pkl")

['/home/Zero/Scrivania/btcpricepredictionvenv/model/Scaler.pkl']

In [13]:
# NOT ENOUGH LOCAL COMPUTING POWER TO SUPPORT THIS

'''
#Initialize LGBMRegressor estimattor
LGB = lgb.LGBMRegressor(objective='regression')


param_grid = {
         'verbose': [-1],
         'num_leaves': [x for x in range(64, 257, 12)],
         'feature_fraction': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         'bagging_fraction': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         'min_child_samples': [x for x in range(3, 101, 4)]
    }

#Initialize Grid Search with 3-fold cross validation 
Model = GCV(estimator=LGB, 
                     param_grid=param_grid,
                     cv=3, 
                     n_jobs=-1, 
                     scoring='neg_mean_absolute_error',
                     verbose=10)

Model.fit(ScaledxTrain, yTrain)
'''

"\n#Initialize LGBMRegressor estimattor\nLGB = lgb.LGBMRegressor(objective='regression')\n\n\nparam_grid = {\n         'verbose': [-1],\n         'num_leaves': [x for x in range(64, 257, 12)],\n         'feature_fraction': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],\n         'bagging_fraction': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],\n         'min_child_samples': [x for x in range(3, 101, 4)]\n    }\n\n#Initialize Grid Search with 3-fold cross validation \nModel = GCV(estimator=LGB, \n                     param_grid=param_grid,\n                     cv=3, \n                     n_jobs=-1, \n                     scoring='neg_mean_absolute_error',\n                     verbose=10)\n\nModel.fit(ScaledxTrain, yTrain)\n"