# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the features columns and the target data column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV method of the scikit-learn library and check which model is giving the best score for training and validation

In [34]:
import pandas as pd
import datetime as dt 

### First read the data which has been prepared in the previous notebook 

In [35]:
df_data = pd.read_csv('Resources/Training_data.csv', index_col=0, infer_datetime_format=True)
df_data.index.set_names('Date', inplace=True)

df_data.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,SMA_vol_agg,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-09-18 15:00:00+01:00,14900.219727,14925.80957,14880.480469,14903.679688,14903.679688,122473.0,0.998842,0.743948,-68.87572,-10.741767,51.022842,0.0,1.049068,1.001707,1.000276,0.89846,BTC/AUD,0.000755
2019-09-18 16:00:00+01:00,14897.629883,14919.230469,14879.599609,14879.599609,14879.599609,105416.0,0.998215,0.594412,-136.69359,-2.693687,55.359342,0.0,0.995113,1.002712,0.999494,0.913184,BTC/AUD,-0.001616


### Our target value needs to be the returns of the next timeperiod, so transforming the data accordingly

In [36]:
df_data['Target_returns'] = df_data.Returns.shift(-1)
df_data.dropna(inplace=True)
df_data['Buy_or_sell'] = df_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,SMA_vol_agg,Currency,Returns,Target_returns,Buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-09-18 15:00:00+01:00,14900.219727,14925.80957,14880.480469,14903.679688,14903.679688,122473.0,0.998842,0.743948,-68.87572,-10.741767,51.022842,0.0,1.049068,1.001707,1.000276,0.89846,BTC/AUD,0.000755,-0.001616,-1
2019-09-18 16:00:00+01:00,14897.629883,14919.230469,14879.599609,14879.599609,14879.599609,105416.0,0.998215,0.594412,-136.69359,-2.693687,55.359342,0.0,0.995113,1.002712,0.999494,0.913184,BTC/AUD,-0.001616,0.000981,1
2019-09-18 17:00:00+01:00,14897.839844,14945.879883,14849.950195,14894.200195,14894.200195,86641.0,0.998267,0.776254,-59.095027,-1.715412,60.84467,0.0,1.08026,1.00135,1.000698,0.859538,BTC/AUD,0.000981,-0.000461,-1
2019-09-18 18:00:00+01:00,14902.620117,14922.509766,14874.450195,14887.330078,14887.330078,70603.0,0.998071,0.724499,-55.375089,-1.418463,65.232933,0.0,1.037463,1.00081,1.000257,0.710317,BTC/AUD,-0.000461,0.002185,1
2019-09-18 19:00:00+01:00,14913.389648,14961.69043,14910.490234,14919.860352,14919.860352,53569.0,0.998493,1.017354,166.666667,-0.845198,52.927061,0.0,1.06201,0.999398,1.002477,0.661374,BTC/AUD,0.002185,0.001123,1


In [37]:
df_data.Target_returns.describe()

count    107682.000000
mean          0.000168
std           0.012098
min          -0.331738
25%          -0.003940
50%           0.000056
75%           0.004199
max           0.289606
Name: Target_returns, dtype: float64

In [38]:
# df_data.columns.to_list()

# Setting parameters for training and testing  

Variables:

* curr_list: The currency / currencies for which we want to create the ML model
* indicators_list: The indicators which we will be using as Features
* model_for_testing: One of 'svc' /  'dec_tree' / 'logreg' / 'forest' / 'grad_boost' / 'ada_boost'

In [39]:
curr_list = [ 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/AUD' ]         # 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/AUD'
indicators_list = ['BBands_high', 'BBands_low', 'RSI_ratio', 'CCI','ADX', 'ADX_dirn', 'SMA_vol_agg', 'MACD_ratio']
model_for_testing = 'ada_boost'

# Preparing the data for training the classifier models

In [40]:
df_filtered = df_data.loc[ df_data.Currency.isin(curr_list) ]
df_filtered.shape

(92246, 20)

In [41]:
X = df_filtered.loc[:,indicators_list].reset_index(drop=True)        # , 'CCI', 'Returns', 'SMA_agg', 'RSI_ratio', 'ADX_dirn', 
y = df_filtered.Buy_or_sell
y.shape

(92246,)

In [42]:
y.value_counts()

 1    46412
-1    45834
Name: Buy_or_sell, dtype: int64

### The data is imbalanced, usine SMOTE resampler

In [43]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# resampler = SMOTE(random_state= 1)
combi_sampler = SMOTEENN(random_state=42)
X , y = combi_sampler.fit_resample(X , y)
y.value_counts()


-1    6311
 1    6174
Name: Buy_or_sell, dtype: int64

# Start of ML training

## Importing libraries

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
# import xgboost as xgb 

svc = SVC()
dec_tree = DecisionTreeClassifier()
logreg = LogisticRegression( )
forest = RandomForestClassifier( )
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()
# xgboost = 

In [45]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline


## Define pipelines

In [46]:
col_transform = make_column_transformer(
    (StandardScaler(), X.columns.to_list())
)
col_transform.fit_transform(X);

In [47]:
pca = PCA(n_components=3)

In [48]:
if model_for_testing == 'svc': model = ('svc', svc)
elif model_for_testing == 'logreg': model = ('logreg', logreg)
elif model_for_testing == 'dec_tree': model = ('dec_tree', dec_tree)
elif model_for_testing == 'forest': model = ('forest', forest)
elif model_for_testing == 'grad_boost': model = ('grad_boost', grad_boost)
elif model_for_testing == 'ada_boost': model = ('ada_boost', ada_boost)

pipe = Pipeline(steps= [('col_transform', col_transform), 
                    ('pca', pca),
                    model
                    ])

# (col_transform, dec_tree)

# Step 1 - Get Cross Validation score by running the model with the default parameters

In [49]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_roc_auc = cross_val_score(pipe, X, y, cv=10, scoring='roc_auc', n_jobs=20).mean()
cross_val_roc_auc


0.6256669318906523

In [50]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_accuracy = cross_val_score(pipe, X, y, cv=10, scoring='accuracy', n_jobs=20).mean()
cross_val_accuracy

0.6048089753854365

# Step 2: Call GridSearchCV with a range of parameters

In [51]:
from sklearn.model_selection import GridSearchCV

In [55]:
params = {}

if model_for_testing == 'logreg':
    params['logreg__solver'] = ['liblinear', 'lbfgs']
    params['logreg__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['logreg__penalty'] = ['l1', 'l2']


elif model_for_testing == 'svc': 
    params['svc__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['svc__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

elif model_for_testing == 'dec_tree': 
    params['dec_tree__criterion'] = ['gini', 'entropy']
    params['dec_tree__max_depth'] = list(range(3,8,1))


elif model_for_testing == 'forest':
    params['forest__n_estimators'] = list(range(100,150,10))
    params['forest__max_depth'] = list(range(3,8,1))
    params['forest__max_features'] = ['auto', 'sqrt', 'log2']


elif model_for_testing == 'grad_boost': 
    params['grad_boost__learning_rate'] = [0.075, 0.1, 0.25, 0.5]
    params['grad_boost__n_estimators'] = list(range(100,200,10))
    params['grad_boost__max_features'] = ['auto', 'sqrt', 'log2']
    params['grad_boost__max_depth'] = list(range(3,8,1))
    params['grad_boost__loss'] = ['deviance', 'exponential']


elif model_for_testing == 'ada_boost': 
    params['ada_boost__n_estimators'] = list(range(100,200,10))
    params['ada_boost__learning_rate'] = [0.03, 0.05, 0.075 ]
    params['ada_boost__algorithm'] = ['SAMME.R']



### Calling GridSearch

In [56]:
grid = GridSearchCV(pipe, params, cv=10, scoring='roc_auc', n_jobs=20)
grid.fit(X,y);

### Displaying results

In [57]:
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')
estimator = grid.best_estimator_[model_for_testing]

grid_best_params = str(grid.best_params_)
grid_best_params
gridcv_best_score = grid.best_score_

Score: 0.627912968832462
Best params: {'ada_boost__algorithm': 'SAMME.R', 'ada_boost__learning_rate': 0.05, 'ada_boost__n_estimators': 170}


### Fitting the pipeline, with the tuned model

In [58]:
pipeline = make_pipeline(col_transform, pca, estimator)
pipeline.fit(X, y)


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['BBands_high', 'BBands_low',
                                                   'RSI_ratio', 'CCI', 'ADX',
                                                   'ADX_dirn', 'SMA_vol_agg',
                                                   'MACD_ratio'])])),
                ('pca', PCA(n_components=3)),
                ('adaboostclassifier',
                 AdaBoostClassifier(learning_rate=0.05, n_estimators=170))])

# Save the fitted pipeline to a joblib file

In [59]:
from joblib import dump, load
from pathlib import Path

filename = Path('Joblibs/' + dt.date.today().isoformat() + '_' + model_for_testing + '.joblib')
dump(pipeline, filename)


['Joblibs\\2021-08-28_ada_boost.joblib']

# Testing

### Get the testing data first

In [60]:
df_testing_data = pd.read_csv('Resources/Testing_data.csv', index_col=0, infer_datetime_format=True)
df_testing_data.index.set_names('Date', inplace=True)

df_testing_data.head(2)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,SMA_vol_agg,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-07-30 02:00:00,3311.76,3311.76,3286.35,3286.35,1.252975,1.023521,1.022101,87.14359,1.623762,65.634756,1.0,1.44862,1.002131,1.029719,1.471938,ETH/AUD,-0.005643
2021-07-30 03:00:00,3284.09,3284.09,3281.31,3281.31,0.613744,1.027185,0.996889,41.18537,1.337463,68.393673,1.0,1.269047,1.005115,1.014386,1.594175,ETH/AUD,-0.001534


### Setting target values

In [61]:
df_testing_data['Target_returns'] = df_testing_data.Returns.shift(-1)
df_testing_data.dropna(inplace=True)
df_testing_data['Buy_or_sell'] = df_testing_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_testing_data.shape

(4215, 19)

### Setting a separate currency list for testing

In [62]:

curr_list_test = curr_list

In [63]:
df_testing_subset = df_testing_data.loc[ df_testing_data.Currency.isin(curr_list_test) ] 
X_test = df_testing_subset.loc[: , indicators_list].reset_index(drop=True)   
y_test = df_testing_subset.loc[:, ['Target_returns', 'Buy_or_sell']].copy()

print(f'{X_test.shape}; {y_test.shape}')

(4215, 8); (4215, 2)


## Load the joblib file

In [64]:
pipeline = load(filename)

In [65]:
df_pred = y_test
df_pred['Pred_buy_or_sell'] = pipeline.predict(X_test)
df_pred.head()

Unnamed: 0_level_0,Target_returns,Buy_or_sell,Pred_buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-30 02:00:00,-0.001534,-1,-1
2021-07-30 03:00:00,-0.002517,-1,-1
2021-07-30 04:00:00,-0.003006,-1,-1
2021-07-30 05:00:00,-0.004557,-1,1
2021-07-30 06:00:00,0.0,-1,1


In [66]:
from sklearn.metrics import classification_report
from imblearn.metrics import classification_report_imbalanced

y_pred = pipeline.predict(X_test)
df_predictions = pd.DataFrame(y_pred, columns=['Buy'])

print(classification_report(y_test.Buy_or_sell, y_pred))

              precision    recall  f1-score   support

          -1       0.70      0.51      0.59      2867
           1       0.34      0.54      0.42      1348

    accuracy                           0.52      4215
   macro avg       0.52      0.53      0.50      4215
weighted avg       0.59      0.52      0.53      4215



### Getting the total returns when the strategy gave a buy signal 

In [67]:
total_returns_pred_buy = df_pred.loc[df_pred.Pred_buy_or_sell == 1].copy()
total_pnl = total_returns_pred_buy.Target_returns.sum()
total_pnl

2.4755899228508875

# Writing the outcomes to a CSV

In [68]:
currency = ' '.join(curr_list)
indicators = ','.join(indicators_list)
model_tested = model[0]
cross_val_roc_auc
cross_val_accuracy
total_pnl
grid_best_params
gridcv_best_score

0.627912968832462

In [69]:
df_outcomes = pd.read_csv('Resources/GridSearch_test_outcomes.csv', index_col=0)

df_outcomes
# df_outcomes = df_outcomes.append()
df_outcomes = df_outcomes.append(pd.Series([ 
    currency, 
    indicators, 
    model_tested, 
    cross_val_roc_auc, 
    cross_val_accuracy, 
    gridcv_best_score, 
    grid_best_params,
    total_pnl
], index= df_outcomes.columns), ignore_index= True)
df_outcomes.to_csv('Resources/GridSearch_test_outcomes.csv')