# `Unfiltered dataset: Importing libraries and the dataset`

In [1]:
!pip install scikit-plot 
#data manipulation and visualization libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

#libraries for modelling and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import KFold
import scikitplot as skplt 
import warnings
warnings.filterwarnings('ignore')

Collecting scikit-plot
  Downloading https://files.pythonhosted.org/packages/7c/47/32520e259340c140a4ad27c1b97050dd3254fdc517b1d59974d47037510e/scikit_plot-0.3.7-py3-none-any.whl
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


  import pandas.util.testing as tm


In [0]:
#importing the data and making a dataframe
data = pd.read_csv('https://raw.githubusercontent.com/IrenOkminyan/BUS288/master/ETH_USDT.csv')
data.dropna(inplace=True)
data=data.drop(['close_10.0_le_5_c', 'open_2_d'],axis=1)
data.set_index(keys='Close_time', inplace=True)
data.target=data.target.map({-1:0, 1:1})

data_lr = data
data_gb = data
data_dt = data
data_rf = data

# `Gradient Boosting`

In [0]:
#making dummes,train test split
data_gb = pd.get_dummies(data_gb, drop_first=True)

In [0]:
#separating X and Y
Y = data_gb["target"]
X = data_gb.drop("target",axis=1)

In [0]:
#let's split X and Y data into test and train datasets, test data is 25% of overall data
#splitting the data into train and test sets
X0, X1, Y0, Y1 = train_test_split(X, Y, test_size=0.25, shuffle = False)

In [6]:
#building and fitting the model
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X0,Y0)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
#let's see the order of classes
gb.classes_

Y0_gb = gb.predict_proba(X0)[:,1]
Y1_gb = gb.predict_proba(X1)[:,1]

In [8]:
#calculating roc auc score using 
GB_Train = roc_auc_score(Y0,Y0_gb).round(2)
GB_Test = roc_auc_score(Y1,Y1_gb).round(2)
GB_Mean5 = np.mean(cross_val_score(estimator=gb,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train:", GB_Train)
print("ROC_AUC Test:", GB_Train)
print("Mean 5-fold ROC AUC score for GB", GB_Mean5)

ROC_AUC Train: 0.69
ROC_AUC Test: 0.69
Mean 5-fold ROC AUC score for GB 0.54


In [9]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=gb.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.014033
open_-2_r,0.047418
cr,0.023158
cr-ma1,0.02148
cr-ma2,0.023855
cr-ma3,0.021027
"volume_-3,2,-1_max",0.060844
volume_-3~1_min,0.018804
kdjk,0.016437
kdjd,0.030164


# `Tuned Gradient Boosting Classifier`

In [10]:
#building GridSearch with Logistic Regression
gs_gb = GradientBoostingClassifier(random_state=42)

param_gb = { "max_depth":[3,5,8] }


gb_gs = GridSearchCV(estimator = gs_gb,
                      param_grid = param_gb,
                      scoring = "roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), verbose=1, n_jobs=-1).fit(X0,Y0)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.3min finished


In [11]:
#finding out the best parameters
gb_gs.best_params_

{'max_depth': 5}

In [0]:
#building logit with best params and seeing the results
pm = gb_gs.best_params_
gb_grid = GradientBoostingClassifier(max_depth = pm['max_depth'],random_state=42).fit(X0,Y0)

Y0_gb_gs = gb_grid.predict_proba(X0)[:,1]
Y1_gb_gs = gb_grid.predict_proba(X1)[:,1]


In [13]:
#calculating roc auc score using
GB_GS_Train = roc_auc_score(Y0,Y0_gb_gs).round(2)
GB_GS_Test = roc_auc_score(Y1,Y1_gb_gs).round(2)
GB_GS_Mean5 = np.mean(cross_val_score(estimator=gb_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train for tuned GB:", GB_GS_Train)
print("ROC_AUC Test for tuned GB:", GB_GS_Test)
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)

ROC_AUC Train for tuned GB: 0.86
ROC_AUC Test for tuned GB: 0.55
Mean 5-fold ROC AUC score for Tuned GB 0.54


In [14]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=gb_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.029312
open_-2_r,0.047414
cr,0.03235
cr-ma1,0.018906
cr-ma2,0.01943
cr-ma3,0.028279
"volume_-3,2,-1_max",0.046157
volume_-3~1_min,0.023581
kdjk,0.021316
kdjd,0.026431


# `Decision Tree`

In [0]:
data_dt = pd.get_dummies(data_dt,drop_first=True)

Y = data_dt["target"]
X = data_dt.drop("target",axis=1)

X0, X1, Y0, Y1 = train_test_split(X,Y,test_size =0.25, shuffle = False)

In [16]:
#building DT with default hyperparameters
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X0,Y0)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [0]:
#calculating probabilities
Y0_dt=dt.predict_proba(X0)[:,1]
Y1_dt=dt.predict_proba(X1)[:,1]

In [18]:
#printing the ROC AUC scores for training and testing sets
DT_Train = roc_auc_score(Y0,Y0_dt).round(2)
DT_Test = roc_auc_score(Y1,Y1_dt).round(2)
DT_Mean5 = np.mean(cross_val_score(estimator = dt,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1, scoring="roc_auc")).round(2)


print("ROC_AUC Train for fully grown DT:",DT_Train)
print("ROC_AUC Test for fully grown DT:", DT_Test)
print("Mean 5-fold ROC AUC score for Decision Tree", DT_Mean5)

ROC_AUC Train for fully grown DT: 1.0
ROC_AUC Test for fully grown DT: 0.53
Mean 5-fold ROC AUC score for Decision Tree 0.51


In [19]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=dt.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.026888
open_-2_r,0.047374
cr,0.033913
cr-ma1,0.017059
cr-ma2,0.021518
cr-ma3,0.032351
"volume_-3,2,-1_max",0.040232
volume_-3~1_min,0.025963
kdjk,0.020194
kdjd,0.027843


# `Filtered dataset: Importing libraries and the dataset`

In [0]:
 #importing the data and making a dataframe
data = pd.read_csv('https://raw.githubusercontent.com/IrenOkminyan/BUS288/master/ETH_USDT.csv')
data.dropna(inplace=True)
data=data.drop(['close_10.0_le_5_c'],axis=1)
data=data.drop(['close_10.0_ge_5_fc', 'cci_20', 'wr_6', 'rsi_12', 'trix_9_sma', 'open_2_sma', 'macds', 'boll', 'boll_lb', 'kdjj', 'kdjd', 'rsi_6', 'wr_10', 'cci', 'macd', 'boll_ub', 'adx', 'adxr', 'tema'],axis=1)
data=data.drop(['atr', 'vr', 'vr_6_sma','Open', 'Close', 'High', 'Low', 'volume_-3~1_min', 'kdjk', 'trix', 'cr', 'cr-ma1', 'cr-ma2', 'cr-ma3'], axis = 1)
data=data.drop(['Volume', 'Number_of_trades', 'mdi', 'tr', 'volume_-3,2,-1_max', 'open_2_d'], axis = 1)

data.set_index(keys='Close_time', inplace=True)
data.target=data.target.map({-1:0, 1:1})

data_lr = data
data_gb = data
data_dt = data
data_rf = data

# `Tuned Gradient Boosting Classifier`

In [35]:
#building GridSearch with Logistic Regression
gs_gb = GradientBoostingClassifier(random_state=42)

param_gb = { "max_depth":[3,5,8] }


gb_gs = GridSearchCV(estimator = gs_gb,
                      param_grid = param_gb,
                      scoring = "roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), verbose=1, n_jobs=-1).fit(X0,Y0)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   48.6s finished


In [36]:
#finding out the best parameters
gb_gs.best_params_

{'max_depth': 5}

In [0]:
#building logit with best params and seeing the results
pm = gb_gs.best_params_
gb_grid = GradientBoostingClassifier( max_depth = pm['max_depth'],random_state=42).fit(X0,Y0)

Y0_gb_gs = gb_grid.predict_proba(X0)[:,1]
Y1_gb_gs = gb_grid.predict_proba(X1)[:,1]


In [38]:
#calculating roc auc score using
GB_GS_Train = roc_auc_score(Y0,Y0_gb_gs).round(2)
GB_GS_Test = roc_auc_score(Y1,Y1_gb_gs).round(2)
GB_GS_Mean5 = np.mean(cross_val_score(estimator=gb_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)

print("ROC_AUC Train for tuned GB:", GB_GS_Train)
print("ROC_AUC Test for tuned GB:", GB_GS_Test)
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)

ROC_AUC Train for tuned GB: 0.8
ROC_AUC Test for tuned GB: 0.53
Mean 5-fold ROC AUC score for Tuned GB 0.52


In [39]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=gb_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.173608
open_-2_r,0.158597
macdh,0.161819
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.150363
pdi,0.186871
dx,0.168742


# `Tuned Random Forest`

In [0]:
data_rf = pd.get_dummies(data_rf,drop_first=True)

Y = data_rf["target"]
X = data_rf.drop("target",axis=1)

X0, X1, Y0, Y1 = train_test_split(X,Y,test_size =0.25, shuffle = False)

In [0]:
#setting up parameters for DT's GridSearch
param_dt={"max_depth":range(1,9),
          "min_samples_leaf":range(10,150,10),
          "class_weight":["balanced",None]        
            }

In [0]:
#building GridSearch for RF using DT's hyperparameters ranges
gs_rf = GridSearchCV(estimator = RandomForestClassifier(random_state=42),
                  param_grid = param_dt, scoring="roc_auc", cv=KFold(n_splits=5, shuffle=False, random_state=None), n_jobs=-1).fit(X0,Y0)

In [25]:
 #outlining best parameters
gs_rf.best_params_

{'class_weight': None, 'max_depth': 5, 'min_samples_leaf': 140}

In [0]:
#fitting the model and calculating probabilities
pm = gs_rf.best_params_
rf_grid = RandomForestClassifier(class_weight = pm['class_weight'], max_depth = pm['max_depth'], min_samples_leaf = pm['min_samples_leaf'], random_state=42).fit(X0,Y0)

Y0_rf_grid=rf_grid.predict_proba(X0)[:,1]
Y1_rf_grid=rf_grid.predict_proba(X1)[:,1]

In [27]:
#printing the ROC AUC scores for training and testing sets
RF_GS_Train = roc_auc_score(Y0,Y0_rf_grid).round(2)
RF_GS_Test = roc_auc_score(Y1,Y1_rf_grid).round(2)
RF_GS_Mean5 = np.mean(cross_val_score(estimator = rf_grid,X=X,y=Y,cv=KFold(n_splits=5, shuffle=False, random_state=None),n_jobs=-1, scoring="roc_auc")).round(2)


print("ROC_AUC Train for tuned RF:",RF_GS_Train)
print("ROC_AUC Test for tuned RF:", RF_GS_Test)
print("Mean 5-fold ROC AUC score for tuned RF", RF_GS_Mean5)

ROC_AUC Train for tuned RF: 0.59
ROC_AUC Test for tuned RF: 0.55
Mean 5-fold ROC AUC score for tuned RF 0.53


In [28]:
#retrieving the importances of features(independent variables)
pd.DataFrame(data=rf_grid.feature_importances_,index=X0.columns)

Unnamed: 0,0
volume_delta,0.145147
open_-2_r,0.197188
macdh,0.143481
cr-ma2_xu_cr-ma1_20_c,0.0
dma,0.158925
pdi,0.202395
dx,0.152864


# `Summary`

In [40]:
#Picking the best model based on mean 5-fold cross validation score.

print('Unfiltered dataset:')
print("Mean 5-fold ROC AUC score for GB", GB_Mean5)
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)
print("Mean 5-fold ROC AUC score for DT", DT_Mean5)

print('Filtered dataset:')
print("Mean 5-fold ROC AUC score for Tuned GB", GB_GS_Mean5)
print("Mean 5-fold ROC AUC score for Tuned RF", RF_GS_Mean5)



Unfiltered dataset:
Mean 5-fold ROC AUC score for GB 0.54
Mean 5-fold ROC AUC score for Tuned GB 0.52
Mean 5-fold ROC AUC score for DT 0.51
Filtered dataset:
Mean 5-fold ROC AUC score for Tuned GB 0.52
Mean 5-fold ROC AUC score for Tuned RF 0.53
