# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the features columns and the target data column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV method of the scikit-learn library and check which model is giving the best score for training and validation

In [1]:
import pandas as pd
import datetime as dt 

### First read the data which has been prepared in the previous notebook 

In [2]:
df_data = pd.read_csv('Resources/Training_data.csv', index_col=0, infer_datetime_format=True)
df_data.index.set_names('Date', inplace=True)

df_data.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-06-01 06:00:00+01:00,14209.395508,14220.896484,14174.740234,14174.740234,14174.740234,0.0,0.99765,0.998526,31.704468,-0.814398,58.070115,0.0,0.857303,1.00189,1.000752,BTC/AUD,-0.002051
2020-06-01 07:00:00+01:00,14172.333984,14180.751953,14137.945312,14153.839844,14153.839844,149676032.0,0.997687,0.860287,-112.594868,-0.940319,58.808406,0.0,0.842866,1.003379,0.999655,BTC/AUD,-0.001474


### Our target value needs to be the returns of the next timeperiod, so transforming the data accordingly

In [3]:
df_data['Target_returns'] = df_data.Returns.shift(-1)
df_data.dropna(inplace=True)
df_data['Buy_or_sell'] = df_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns,Target_returns,Buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-06-01 06:00:00+01:00,14209.395508,14220.896484,14174.740234,14174.740234,14174.740234,0.0,0.99765,0.998526,31.704468,-0.814398,58.070115,0.0,0.857303,1.00189,1.000752,BTC/AUD,-0.002051,-0.001474,-1
2020-06-01 07:00:00+01:00,14172.333984,14180.751953,14137.945312,14153.839844,14153.839844,149676032.0,0.997687,0.860287,-112.594868,-0.940319,58.808406,0.0,0.842866,1.003379,0.999655,BTC/AUD,-0.001474,-0.002345,-1
2020-06-01 08:00:00+01:00,14151.363281,14168.582031,14120.649414,14120.649414,14120.649414,0.0,0.997819,0.685488,-112.539621,-1.100908,60.462916,0.0,0.845724,1.00591,0.99865,BTC/AUD,-0.002345,0.003653,1
2020-06-01 09:00:00+01:00,14122.041016,14214.550781,14122.041016,14172.235352,14172.235352,0.0,0.997725,1.224217,-9.034743,-0.919961,51.224342,0.0,0.956996,1.001426,1.002444,BTC/AUD,0.003653,0.00144,1
2020-06-01 10:00:00+01:00,14172.740234,14199.336914,14154.634766,14192.637695,14192.637695,0.0,0.997834,1.322032,61.996872,-0.712305,43.833484,0.0,0.928089,0.999619,1.003835,BTC/AUD,0.00144,0.000503,1


In [4]:
# df_data.columns.to_list()

# Setting parameters for training and testing  

### Variables:

* curr_list: The currency / currencies for which we want to create the ML model
* indicators_list: The indicators which we will be using as Features
* model_for_testing: One of 'svc' /  'dec_tree' / 'logreg' / 'forest' / 'grad_boost' / 'ada_boost'

In [5]:
curr_list = ['ADA/AUD', 'XLM/AUD']         # 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/AUD'
indicators_list = ['ATR_ratio', 'BBands_high', 'BBands_low', 'CCI', 'MACD_ratio', 'SMA_agg', 'RSI_ratio']
model_for_testing = 'dec_tree'

# Preparing the data for training the classifier models

In [6]:
df_filtered = df_data.loc[ df_data.Currency.isin(curr_list) ]
df_filtered.shape

(18860, 19)

In [7]:
X = df_filtered.loc[:,['ATR_ratio', 'BBands_high', 'BBands_low', 'CCI', 'MACD_ratio', 'SMA_agg', 'RSI_ratio']].reset_index(drop=True)        # , 'CCI', 'Returns', 'SMA_agg', 'RSI_ratio', 'ADX_dirn', 
y = df_filtered.Buy_or_sell
X.shape

(18860, 7)

In [8]:
y.value_counts()


-1    9455
 1    9405
Name: Buy_or_sell, dtype: int64

### The data is imbalanced, usine SMOTE resampler

In [9]:
from imblearn.over_sampling import SMOTE
resampler = SMOTE(random_state= 1)
X , y = resampler.fit_resample(X , y)
y.value_counts()

 1    9455
-1    9455
Name: Buy_or_sell, dtype: int64

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
# import xgboost as xgb 

svc = SVC()
dec_tree = DecisionTreeClassifier()
logreg = LogisticRegression( solver='lbfgs')
forest = RandomForestClassifier( criterion='gini')
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()
# xgboost = 

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA

## Attempt 1 - Using GridSearchCV for 1 ML algo at a time

In [12]:
col_transform = make_column_transformer(
    (StandardScaler(), X.columns.to_list())
)
col_transform.fit_transform(X);

In [13]:
pca = PCA(n_components=3)

In [14]:
# chain sequential steps together
from sklearn.pipeline import make_pipeline, Pipeline

if model_for_testing == 'svc': model = ('svc', svc)
elif model_for_testing == 'logreg': model = ('logreg', logreg)
elif model_for_testing == 'dec_tree': model = ('dec_tree', dec_tree)
elif model_for_testing == 'forest': model = ('forest', forest)
elif model_for_testing == 'grad_boost': model = ('grad_boost', grad_boost)
elif model_for_testing == 'ada_boost': model = ('ada_boost', ada_boost)

pipe = Pipeline(steps= [('col_transform', col_transform), 
                    # ('pca', pca),
                    model
                    ])

# (col_transform, dec_tree)

In [15]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_roc_auc = cross_val_score(pipe, X, y, cv=10, scoring='roc_auc', n_jobs=20).mean()
cross_val_roc_auc


0.5070861997606183

In [16]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_accuracy = cross_val_score(pipe, X, y, cv=10, scoring='accuracy', n_jobs=20).mean()
cross_val_accuracy

0.5087255420412481

# Attempt 1: GridSearch using 1 classifier at a time

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
params = {}

if model_for_testing == 'logreg':
    params['logisticregression__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['logisticregression__penalty'] = ['l1', 'l2']


elif model_for_testing == 'svc': 
    params['svc__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['dec_tree__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

elif model_for_testing == 'dec_tree': 
    params['dec_tree__criterion'] = ['gini']
    params['dec_tree__max_depth'] = list(range(3,8,1))


elif model_for_testing == 'forest':
    params['forest__n_estimators'] = list(range(100,150,10))
    params['forest__max_depth'] = list(range(3,8,1))
    params['forest__max_features'] = ['auto', 'sqrt', 'log2']


elif model_for_testing == 'grad_boost': 
    params['grad_boost__learning_rate'] = [0.1, 0.3, 0.5]
    params['grad_boost__n_estimators'] = list(range(100,200,10))
    params['grad_boost__max_features'] = ['auto', 'sqrt', 'log2']
    params['grad_boost__max_depth'] = list(range(3,8,1))
    params['grad_boost__loss'] = ['deviance', 'exponential']


elif model_for_testing == 'ada_boost': 
    params['ada_boost__n_estimators'] = list(range(100,200,10))
    params['ada_boost__learning_rate'] = [0.1, 0.5, 1, 2]
    params['ada_boost__algorithm'] = ['SAMME.R']







In [19]:
grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy', n_jobs=20)
grid.fit(X,y);

In [44]:
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')
estimator = grid.best_estimator_[model_for_testing]

grid_best_params = str(grid.best_params_)
grid_best_params
gridcv_best_score = grid.best_score_

Score: 0.5251189846641988
Best params: {'dec_tree__criterion': 'gini', 'dec_tree__max_depth': 4}


### Fitting the pipeline, with the tuned model

In [21]:
pipeline = make_pipeline(col_transform, estimator)
pipeline.fit(X, y)


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['ATR_ratio', 'BBands_high',
                                                   'BBands_low', 'CCI',
                                                   'MACD_ratio', 'SMA_agg',
                                                   'RSI_ratio'])])),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=4))])

# Testing

### Get the testing data first

In [22]:
df_testing_data = pd.read_csv('Resources/Testing_data.csv', index_col=0, infer_datetime_format=True)
df_testing_data.index.set_names('Date', inplace=True)

df_testing_data.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-07-28 02:00:00,3136.77,3190.84,3131.56,3145.08,7.991239,1.009907,1.234023,158.656349,-1.145074,37.210854,1.0,1.174444,0.999321,1.012981,ETH/AUD,0.013097
2021-07-28 03:00:00,3117.26,3143.68,3103.8,3143.68,0.678257,1.010234,1.2223,29.689609,-68.521925,34.568352,1.0,1.143174,1.000394,1.012479,ETH/AUD,-0.000445


### Setting target values

In [23]:
df_testing_data['Target_returns'] = df_testing_data.Returns.shift(-1)
df_testing_data.dropna(inplace=True)
df_testing_data['Buy_or_sell'] = df_testing_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_testing_data.shape

(4224, 18)

In [24]:
df_testing_subset = df_testing_data.loc[ df_testing_data.Currency.isin(curr_list) ] 
X_test = df_testing_subset.loc[: , indicators_list].reset_index(drop=True)   
y_test = df_testing_subset.loc[:, ['Target_returns', 'Buy_or_sell']].copy()

print(f'{X_test.shape}; {y_test.shape}')

(1410, 7); (1410, 2)


In [25]:
df_pred = y_test
df_pred['Pred_buy_or_sell'] = pipeline.predict(X_test)
df_pred.head()

Unnamed: 0_level_0,Target_returns,Buy_or_sell,Pred_buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-28 02:00:00,0.0,-1,1
2021-07-28 03:00:00,0.0,-1,1
2021-07-28 04:00:00,0.0,-1,1
2021-07-28 05:00:00,-0.002065,-1,1
2021-07-28 06:00:00,0.019316,1,-1


In [30]:
from sklearn.metrics import classification_report
from imblearn.metrics import classification_report_imbalanced

y_pred = pipeline.predict(X_test)
df_predictions = pd.DataFrame(y_pred, columns=['Buy'])

print(classification_report(y_test.Buy_or_sell, y_pred))

              precision    recall  f1-score   support

          -1       0.72      0.63      0.67      1014
           1       0.28      0.36      0.31       396

    accuracy                           0.55      1410
   macro avg       0.50      0.50      0.49      1410
weighted avg       0.59      0.55      0.57      1410



### Getting the total returns when the strategy gave a buy signal 

In [36]:
total_returns_pred_buy = df_pred.loc[df_pred.Pred_buy_or_sell == 1].copy()
total_pnl = total_returns_pred_buy.Target_returns.sum()

# Writing the outcomes to a CSV

In [50]:
currency = ' '.join(curr_list)
indicators = ','.join(indicators_list)
model_tested = model[0]
cross_val_roc_auc
cross_val_accuracy
total_pnl
grid_best_params
gridcv_best_score

0.5251189846641988

In [61]:
df_outcomes = pd.read_csv('Resources/GridSearch_test_outcomes.csv', index_col=0)

df_outcomes
# df_outcomes = df_outcomes.append()
df_outcomes = df_outcomes.append(pd.Series([ 
    currency, 
    indicators, 
    model_tested, 
    cross_val_roc_auc, 
    cross_val_accuracy, 
    gridcv_best_score, 
    grid_best_params,
    total_pnl
], index= df_outcomes.columns), ignore_index= True)
df_outcomes.to_csv('Resources/GridSearch_test_outcomes.csv')