In [None]:
"""
Random Forest, Gradient Boosting and Histogram-Based Gradient Boosting models using the scikit-learn library
"""

from sklearn.model_selection import train_test_split
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
from random import randint

In [None]:
#Notebook configuration (edit these values, then re-run the cells below)
label_name = "Trend_1"    #Name of the dataset column used as the classification target
model_name = "gb"    #Must be rf, gb or hgb
kline_time = "5m"    #Candlestick interval; selects which Data_<interval>_Ind.csv file is read

In [None]:
#Load the indicator dataset that matches the configured candlestick interval
dataset_path = f"../data/Data_{kline_time}_Ind.csv"
ds = pd.read_csv(dataset_path, sep=',', low_memory=True)

#Summary statistics, shown as the cell output
ds.describe()

In [None]:
#Remove the columns that should not be fed to the model (intended to improve model scoring)
# NOTE: `ds` is overwritten, so re-running this cell without reloading the
# dataset raises KeyError because the columns are already gone.
col_to_drop = [
    'OpenTime',
    'Diff_1',
    'qAssetVol',
    'TbuybAssetVol',
    'TbuyqAssetVol',
    'Ignore',
]
ds = ds.drop(columns=col_to_drop)

#Preview the first rows of the reduced frame
ds.head()

In [None]:
#Split the dataframe into features / label and into train / test subsets

# Target vector: the configured label column as a NumPy array.
Y = ds[label_name].values

# Feature matrix: every remaining column (kept as a DataFrame so the fitted
# model remembers the feature names).
X = ds.drop(columns=[label_name])

# 60 % train / 40 % test. A fixed random_state pins the shuffle so the scores
# (and the accuracy embedded in the saved model's filename) are reproducible
# across "Restart & Run All".
# NOTE(review): the rows look like time-ordered candlesticks; a shuffled split
# can leak future information into training. Consider shuffle=False or a
# time-based split; confirm against how the label is built.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

In [None]:
#Set and train the model
#
# `model_name` selects the estimator:
#   "rf"  -> RandomForestClassifier
#   "gb"  -> GradientBoostingClassifier
#   "hgb" -> HistGradientBoostingClassifier
# Any other value raises ValueError instead of only printing a message, so the
# following cells can never run against an undefined or stale `model`.

if model_name == "rf":
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=8, random_state=42)

elif model_name == "gb":
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=100, verbose=1, random_state=42)

elif model_name == "hgb":
    from sklearn.ensemble import HistGradientBoostingClassifier
    # The histogram-based estimator has no `n_estimators`; the number of
    # boosting iterations is controlled by `max_iter`.
    model = HistGradientBoostingClassifier(max_iter=100, verbose=1, random_state=42)

else:
    raise ValueError("The model must be rf, gb or hgb!")

# Fit once, whichever estimator was selected. random_state is fixed above so
# repeated runs train the same model.
model.fit(X_train, Y_train)

In [None]:
#Make predictions
# Predict a label for every row of the held-out test features; `pred` is
# reused by the scoring, plotting and model-saving cells below.
pred = model.predict(X_test)

In [None]:
#Show the predicted labels as the cell output
pred

In [None]:
#Calculate the scores
from sklearn import metrics

# Each entry pairs the printed label with the metric function; every metric
# compares the true test labels with the predictions.
score_functions = (
    ("Accuracy: ", metrics.accuracy_score),
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
)

for score_label, score_fn in score_functions:
    print(score_label, score_fn(Y_test, pred))

In [None]:
#Plot the ROC curve

# A ROC curve needs a continuous score per sample. Hard class predictions give
# roc_curve only one usable threshold, so the curve collapses to a single
# point joined to the corners. Use the predicted probability of the positive
# class (label 1) instead, looked up through `model.classes_` so the column is
# correct whatever the label ordering is.
positive_col = list(model.classes_).index(1)
scores = model.predict_proba(X_test)[:, positive_col]

fpr, tpr, thresholds = metrics.roc_curve(Y_test, scores, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)

# Label the curve with the estimator that was actually trained, not a fixed
# "Random Forest".
estimator_names = {
    "rf": "Random Forest",
    "gb": "Gradient Boosting",
    "hgb": "Histogram-Based Gradient Boosting",
}

# Named `roc_display` so IPython's built-in `display` function is not shadowed.
roc_display = metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc,
    estimator_name=estimator_names.get(model_name, model_name),
)
roc_display.plot()

In [None]:
#Display the confusion matrix
# Count true-vs-predicted label combinations on the test set, then plot them.
cm = metrics.confusion_matrix(Y_test, pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

In [None]:
#Print the feature importances
from sklearn.inspection import permutation_importance

# Permutation importance of every feature on the test set, using 10 shuffles
# per feature.
fimp = permutation_importance(estimator=model, X=X_test, y=Y_test, n_repeats=10)

#Make a dataframe of importances: one row per feature column with its mean importance
data = {
    'Indicators': X.columns.values,
    'Importances': fimp.importances_mean,
}
importances = pd.DataFrame(data)

#Least important first (ascending sort), shown as the cell output
importances.sort_values(by='Importances')

In [None]:
#Make some predictions on pseudo-random rows from the dataframe
# NOTE: rows are drawn from the whole of `ds`, so some of them may have been
# part of the training split.

# random.randint includes BOTH endpoints, so the upper bound must be
# len(ds) - 1; with len(ds) the loop below could hit an out-of-range index.
list_of_idx = [randint(0, len(ds) - 1) for _ in range(10)]

for i in list_of_idx:

    print("The index of data:", i)

    # Select the row as a one-row DataFrame (double brackets) so the column
    # names and dtypes the model was fitted with are preserved.
    row = ds.iloc[[i]]
    print("Original value: ", row[label_name].iloc[0])

    # Drop the label to leave only the feature columns, in training order.
    features = row.drop(columns=[label_name])
    one_pred = model.predict(features)
    print("Prediction: ", one_pred)

In [None]:
#Save the model using pickle
import pickle
from pathlib import Path

# open(..., "wb") does not create missing directories, so make sure the
# target folder exists (a fresh checkout may not contain it).
save_dir = Path("saved-models")
save_dir.mkdir(parents=True, exist_ok=True)

# The filename records the model type, candlestick interval, label column and
# the test accuracy rounded to two decimals.
accuracy = round(metrics.accuracy_score(Y_test, pred), 2)
model_path = save_dir / f"{model_name}_{kline_time}_{label_name}_acc{accuracy}.pkl"

# NOTE: only unpickle files from trusted sources; pickle.load can execute
# arbitrary code.
with open(model_path, "wb") as file:
    pickle.dump(model, file)
