# `Importing libraries and the dataset`

In [1]:
!pip install scikit-plot 
#data manipulation and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

#libraries for modelling and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import KFold
import scikitplot as skplt 
import warnings
warnings.filterwarnings('ignore')

Collecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


  import pandas.util.testing as tm


In [0]:
 #importing the data and making a dataframe
data = pd.read_csv('https://raw.githubusercontent.com/IrenOkminyan/BUS288/master/ETH_USDT.csv')
data.dropna(inplace=True)
data=data.drop(['close_10.0_le_5_c'],axis=1)
data=data.drop(['close_10.0_ge_5_fc', 'cci_20', 'wr_6', 'rsi_12', 'trix_9_sma', 'open_2_sma', 'macds', 'boll', 'boll_lb', 'kdjj', 'kdjd', 'rsi_6', 'wr_10', 'cci', 'macd', 'boll_ub', 'adx', 'adxr', 'tema'],axis=1)
data=data.drop(['atr', 'vr', 'vr_6_sma','Open', 'Close', 'High', 'Low', 'volume_-3~1_min', 'kdjk', 'trix', 'cr', 'cr-ma1', 'cr-ma2', 'cr-ma3'], axis = 1)
data=data.drop(['Volume', 'Number_of_trades', 'mdi', 'tr', 'volume_-3,2,-1_max', 'open_2_d'], axis = 1)

data.set_index(keys='Close_time', inplace=True)
data.target=data.target.map({-1:0, 1:1})

data_lr = data
data_gb = data
data_dt = data
data_rf = data

# `Logistic Regression`

In [0]:
#making dummes,train test split
data_lr = pd.get_dummies(data_lr, drop_first=True)

In [0]:
#separating X and Y
Y = data_lr["target"]
X = data_lr.drop("target",axis=1)

In [0]:
#let's split X and Y data into test and train datasets, test data is 25% of overall data
#splitting the data into train and test sets
X0, X1, Y0, Y1 = train_test_split(X, Y, test_size=0.25, shuffle = False)

In [6]:
#building and fitting the model
logit = LogisticRegression(random_state=42)
logit.fit(X0,Y0)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
#let's see the order of classes
logit.classes_

Y0_lr = logit.predict_proba(X0)[:,1]
Y1_lr = logit.predict_proba(X1)[:,1]

In [8]:
#calculating roc auc score using 
Logit_Train = roc_auc_score(Y0,Y0_lr).round(2)
Logit_Test = roc_auc_score(Y1,Y1_lr).round(2)
Logit_Mean5 = np.mean(cross_val_score(estimator=logit,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train:", Logit_Train)
print("ROC_AUC Test:", Logit_Train)
print("Mean 5-fold ROC AUC score for Logit", Logit_Mean5)

ROC_AUC Train: 0.52
ROC_AUC Test: 0.52
Mean 5-fold ROC AUC score for Logit 0.52


In [9]:
# get importance
pd.DataFrame(data=logit.coef_[0],index=X0.columns)

Unnamed: 0,0
volume_delta,-1e-06
open_-2_r,-0.026674
macdh,-0.001517
cr-ma2_xu_cr-ma1_20_c,0.000153
dma,0.001092
pdi,-0.001712
dx,0.001212


# `Tuned Logistic Regression`

In [10]:
#building GridSearch with Logistic Regression
gs_logit = LogisticRegression(random_state=42)

param_logit = {"class_weight":["balanced",None],
            "C":np.linspace(0.0001, 1, 100)}


logit_gs = GridSearchCV(estimator = gs_logit,
                      param_grid = param_logit,
                      scoring = "roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), verbose=1, n_jobs=-1).fit(X0,Y0)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.2min finished


In [11]:
#finding out the best parameters
logit_gs.best_params_

{'C': 0.1011, 'class_weight': 'balanced'}

In [0]:
#building logit with best params and seeing the results
pm = logit_gs.best_params_
logit_grid = LogisticRegression(C=pm['C'],class_weight=pm['class_weight'],random_state=42).fit(X0,Y0)

Y0_lr_gs = logit_grid.predict_proba(X0)[:,1]
Y1_lr_gs = logit_grid.predict_proba(X1)[:,1]


In [13]:
#calculating roc auc score using
Logit_GS_Train = roc_auc_score(Y0,Y0_lr_gs).round(2)
Logit_GS_Test = roc_auc_score(Y1,Y1_lr_gs).round(2)
Logit_GS_Mean5 = np.mean(cross_val_score(estimator=logit_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train for tuned Logit:",Logit_GS_Train)
print("ROC_AUC Test for tuned Logit:",Logit_GS_Test)
print("Mean 5-fold ROC AUC score for Tuned Logit", Logit_GS_Mean5)

ROC_AUC Train for tuned Logit: 0.52
ROC_AUC Test for tuned Logit: 0.54
Mean 5-fold ROC AUC score for Tuned Logit 0.52


In [14]:
# get importance
pd.DataFrame(data=logit_grid.coef_[0],index=X0.columns)

Unnamed: 0,0
volume_delta,-9.412183e-07
open_-2_r,-0.01457162
macdh,-0.01027339
cr-ma2_xu_cr-ma1_20_c,4.132407e-05
dma,0.00115502
pdi,-0.00197511
dx,0.001045904


# `Gradient Boosting`

In [0]:
#making dummes,train test split
data_gb = pd.get_dummies(data_gb, drop_first=True)

In [0]:
#separating X and Y
Y = data_gb["target"]
X = data_gb.drop("target",axis=1)

In [0]:
#let's split X and Y data into test and train datasets, test data is 25% of overall data
#splitting the data into train and test sets
X0, X1, Y0, Y1 = train_test_split(X, Y, test_size=0.25, shuffle = False)

In [18]:
#building and fitting the model
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X0,Y0)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
#let's see the order of classes
gb.classes_

Y0_gb = gb.predict_proba(X0)[:,1]
Y1_gb = gb.predict_proba(X1)[:,1]

In [20]:
#calculating roc auc score using 
GB_Train = roc_auc_score(Y0,Y0_gb).round(2)
GB_Test = roc_auc_score(Y1,Y1_gb).round(2)
GB_Mean5 = np.mean(cross_val_score(estimator=gb,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train:", GB_Train)
print("ROC_AUC Test:", GB_Train)
print("Mean 5-fold ROC AUC score for GB", GB_Mean5)

ROC_AUC Train: 0.65
ROC_AUC Test: 0.65
Mean 5-fold ROC AUC score for GB 0.52


In [21]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=gb.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.151597
open_-2_r,0.182173
macdh,0.169471
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.132374
pdi,0.217874
dx,0.146511


# `Tuned Gradient Boosting Classifier`

In [22]:
#building GridSearch with Logistic Regression
gs_gb = GradientBoostingClassifier(random_state=42)

param_gb = { "max_depth":[3,5,8] }


gb_gs = GridSearchCV(estimator = gs_gb,
                      param_grid = param_gb,
                      scoring = "roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), verbose=1, n_jobs=-1).fit(X0,Y0)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   45.2s finished


In [23]:
#finding out the best parameters
gb_gs.best_params_

{'max_depth': 5}

In [0]:
#building logit with best params and seeing the results
pm = gb_gs.best_params_
gb_grid = GradientBoostingClassifier( max_depth = pm['max_depth'],random_state=42).fit(X0,Y0)

Y0_gb_gs = gb_grid.predict_proba(X0)[:,1]
Y1_gb_gs = gb_grid.predict_proba(X1)[:,1]


In [25]:
#calculating roc auc score using
GB_GS_Train = roc_auc_score(Y0,Y0_gb_gs).round(2)
GB_GS_Test = roc_auc_score(Y1,Y1_gb_gs).round(2)
GB_GS_Mean5 = np.mean(cross_val_score(estimator=gb_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train for tuned GB:", GB_GS_Train)
print("ROC_AUC Test for tuned GB:", GB_GS_Test)
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)

ROC_AUC Train for tuned GB: 0.8
ROC_AUC Test for tuned GB: 0.53
Mean 5-fold ROC AUC score for Tuned GB 0.52


In [26]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=gb_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.173608
open_-2_r,0.158597
macdh,0.161819
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.150363
pdi,0.186871
dx,0.168742


# `Decision Tree`

In [0]:
data_dt = pd.get_dummies(data_dt,drop_first=True)

Y = data_dt["target"]
X = data_dt.drop("target",axis=1)

X0, X1, Y0, Y1 = train_test_split(X,Y,test_size =0.25, shuffle = False)

In [28]:
#building DT with default hyperparameters
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X0,Y0)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [0]:
#calculating probabilities
Y0_dt=dt.predict_proba(X0)[:,1]
Y1_dt=dt.predict_proba(X1)[:,1]

In [30]:
#printing the ROC AUC scores for training and testing sets
DT_Train = roc_auc_score(Y0,Y0_dt).round(2)
DT_Test = roc_auc_score(Y1,Y1_dt).round(2)
DT_Mean5 = np.mean(cross_val_score(estimator = dt,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1, scoring="roc_auc")).round(2)


print("ROC_AUC Train for fully grown DT:",DT_Train)
print("ROC_AUC Test for fully grown DT:", DT_Test)
print("Mean 5-fold ROC AUC score for Decision Tree", DT_Mean5)

ROC_AUC Train for fully grown DT: 1.0
ROC_AUC Test for fully grown DT: 0.51
Mean 5-fold ROC AUC score for Decision Tree 0.5


In [31]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=dt.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.176328
open_-2_r,0.165603
macdh,0.152637
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.15833
pdi,0.189439
dx,0.157663


# `Tuned Decision Tree`

In [0]:
#setting up parameters for DT's GridSearch
param_dt={"max_depth":range(1,9),
          "min_samples_leaf":range(10,150,10),
          "class_weight":["balanced",None]        
            }

In [33]:
#fitting GridSearch with above specified parameters
dt_gs = GridSearchCV(estimator = DecisionTreeClassifier(random_state=42), param_grid = param_dt,
                  scoring="roc_auc",cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1)
dt_gs.fit(X0,Y0)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_

In [34]:
 dt_gs.best_params_

{'class_weight': None, 'max_depth': 5, 'min_samples_leaf': 50}

In [0]:
#fitting the model with best parameters and calculating probabilites
pm = dt_gs.best_params_
dt_grid = DecisionTreeClassifier(class_weight = pm['class_weight'], max_depth = pm['max_depth'], min_samples_leaf = pm['min_samples_leaf'], random_state=42).fit(X0,Y0)

Y0_dt_grid=dt_grid.predict_proba(X0)[:,1]
Y1_dt_grid=dt_grid.predict_proba(X1)[:,1]

In [36]:
#printing the ROC AUC scores for training and testing sets
DT_GS_Train = roc_auc_score(Y0,Y0_dt_grid).round(2)
DT_GS_Test = roc_auc_score(Y1,Y1_dt_grid).round(2)
DT_GS_Mean5 = np.mean(cross_val_score(estimator = dt_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)


print("ROC_AUC Train for tuned DT:",DT_GS_Train)
print("ROC_AUC Test for tuned DT:", DT_GS_Test)
print("Mean 5-fold ROC AUC score for Tuned Decision Tree", DT_GS_Mean5)

ROC_AUC Train for tuned DT: 0.56
ROC_AUC Test for tuned DT: 0.53
Mean 5-fold ROC AUC score for Tuned Decision Tree 0.52


In [37]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=dt_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.129583
open_-2_r,0.306292
macdh,0.042347
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.240666
pdi,0.214474
dx,0.066637


# `Random Forest`

In [0]:
data_rf = pd.get_dummies(data_rf,drop_first=True)

Y = data_rf["target"]
X = data_rf.drop("target",axis=1)

X0, X1, Y0, Y1 = train_test_split(X,Y,test_size =0.25, shuffle = False)

In [39]:
#building DT with default hyperparameters
rf = RandomForestClassifier(random_state=42)
rf.fit(X0,Y0)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [0]:
#calculating probabilities
Y0_rf = rf.predict_proba(X0)[:,1]
Y1_rf = rf.predict_proba(X1)[:,1]

In [41]:
#printing the ROC AUC scores for training and testing sets
RF_Train = roc_auc_score(Y0,Y0_rf).round(2)
RF_Test = roc_auc_score(Y1,Y1_rf).round(2)
RF_Mean5 = np.mean(cross_val_score(estimator = rf,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train for RF:",RF_Train)
print("ROC_AUC Test for RF:", RF_Test)
print("Mean 5-fold ROC AUC score for RF", RF_Mean5)

ROC_AUC Train for RF: 1.0
ROC_AUC Test for RF: 0.52
Mean 5-fold ROC AUC score for RF 0.51


In [42]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=rf.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.167046
open_-2_r,0.16772
macdh,0.165336
cr-ma2_xu_cr-ma1_20_c,8e-06
dma,0.165027
pdi,0.167834
dx,0.16703


# `Tuned Random Forest`

In [43]:
print(param_dt)

{'max_depth': range(1, 9), 'min_samples_leaf': range(10, 150, 10), 'class_weight': ['balanced', None]}


In [0]:
#building GridSearch for RF using DT's hyperparameters ranges
gs_rf = GridSearchCV(estimator = RandomForestClassifier(random_state=42),
                  param_grid = param_dt, scoring="roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1).fit(X0,Y0)

In [45]:
 #outlining best parameters
gs_rf.best_params_

{'class_weight': None, 'max_depth': 5, 'min_samples_leaf': 140}

In [0]:
#fitting the model and calculating probabilities
pm = gs_rf.best_params_
rf_grid = RandomForestClassifier(class_weight = pm['class_weight'], max_depth = pm['max_depth'], min_samples_leaf = pm['min_samples_leaf'], random_state=42).fit(X0,Y0)

Y0_rf_grid=rf_grid.predict_proba(X0)[:,1]
Y1_rf_grid=rf_grid.predict_proba(X1)[:,1]

In [47]:
#printing the ROC AUC scores for training and testing sets
RF_GS_Train = roc_auc_score(Y0,Y0_rf_grid).round(2)
RF_GS_Test = roc_auc_score(Y1,Y1_rf_grid).round(2)
RF_GS_Mean5 = np.mean(cross_val_score(estimator = rf_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)


print("ROC_AUC Train for tuned RF:",RF_GS_Train)
print("ROC_AUC Test for tuned RF:", RF_GS_Test)
print("Mean 5-fold ROC AUC score for tuned RF", RF_GS_Mean5)

ROC_AUC Train for tuned RF: 0.59
ROC_AUC Test for tuned RF: 0.55
Mean 5-fold ROC AUC score for tuned RF 0.53


In [48]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=rf_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.145147
open_-2_r,0.197188
macdh,0.143481
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.158925
pdi,0.202395
dx,0.152864


# `Summary`

In [49]:
#Picking the best model based on mean 5-fold cross validation score.

print("Mean 5-fold ROC AUC score for LR",Logit_Mean5)
print("Mean 5-fold ROC AUC score for Tuned LR",Logit_GS_Mean5)

print("Mean 5-fold ROC AUC score for GB", GB_Mean5)
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)

print("Mean 5-fold ROC AUC score for DT", DT_Mean5)
print("Mean 5-fold ROC AUC score for Tuned DT",DT_GS_Mean5)

print("Mean 5-fold ROC AUC score for RF", RF_Mean5)
print("Mean 5-fold ROC AUC score for Tuned RF", RF_GS_Mean5)



Mean 5-fold ROC AUC score for LR 0.52
Mean 5-fold ROC AUC score for Tuned LR 0.52
Mean 5-fold ROC AUC score for GB 0.52
Mean 5-fold ROC AUC score for Tuned GB 0.52
Mean 5-fold ROC AUC score for DT 0.5
Mean 5-fold ROC AUC score for Tuned DT 0.52
Mean 5-fold ROC AUC score for RF 0.51
Mean 5-fold ROC AUC score for Tuned RF 0.53
