# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the features columns and the target data column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV method of the scikit-learn library and check which model is giving the best score for training and validation

In [1]:
import pandas as pd
import datetime as dt 

### First read the data which has been prepared in the previous notebook 

In [2]:
df_data = pd.read_csv('Resources/Training_data.csv', index_col=0, infer_datetime_format=True)
df_data.index.set_names('Date', inplace=True)

df_data.tail(2)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,SMA_vol_agg,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-06-30 22:00:00+01:00,693.063843,698.176758,693.063843,694.500916,694.500916,3809792.0,1.012661,1.269279,69.589922,-6.895928,40.093125,1.0,0.845223,1.001346,1.00814,1.787579,BCH/AUD,0.001308
2021-06-30 23:00:00+01:00,695.193176,699.72876,692.296326,695.171204,695.171204,0.0,1.014853,1.282551,81.106419,8.261619,43.752081,1.0,0.840898,0.999544,1.002765,0.709602,BCH/AUD,0.000965


### Our target value needs to be the returns of the next timeperiod, so transforming the data accordingly

In [3]:
df_data['Target_returns'] = df_data.Returns.shift(-1)
df_data.dropna(inplace=True)
df_data['Buy_or_sell'] = df_data.Target_returns.apply(lambda x: 1 if x > 0 else 0)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_agg,RSI_ratio,CCI,MACD_ratio,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,SMA_vol_agg,Currency,Returns,Target_returns,Buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-09-18 21:00:00+01:00,14914.089844,14963.049805,14895.769531,14957.429688,14957.429688,61484.0,0.999928,1.15914,72.833673,-0.737363,39.780911,0.0,1.035198,0.999182,1.004327,0.537254,BTC/AUD,0.001393,-0.00116,0
2019-09-18 22:00:00+01:00,14958.599609,14966.950195,14925.269531,14940.080078,14940.080078,103132.0,1.000571,1.003061,79.504221,-1.52145,34.729405,0.0,0.986765,1.000795,1.002382,0.578897,BTC/AUD,-0.00116,-0.002885,0
2019-09-18 23:00:00+01:00,14940.080078,14948.669922,14896.980469,14896.980469,14896.980469,62138.0,1.00095,0.736963,-154.742871,-0.023342,36.938127,0.0,0.972607,1.003602,0.999144,0.596191,BTC/AUD,-0.002885,0.001635,1
2019-09-19 00:00:00+01:00,14895.379883,14944.849609,14895.379883,14921.330078,14921.330078,0.0,1.001008,0.915055,-62.564344,-0.062695,38.97846,0.0,0.956984,1.001975,1.000748,0.564452,BTC/AUD,0.001635,0.00237,1
2019-09-19 01:00:00+01:00,14920.610352,14963.330078,14910.509766,14956.700195,14956.700195,125696.0,1.001143,1.062606,63.367163,-8.345107,32.761174,0.0,0.953543,1.00005,1.003027,0.730576,BTC/AUD,0.00237,-0.002971,0


In [4]:
df_data.Currency.unique()

array(['BTC/AUD', 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD',
       'BCH/AUD'], dtype=object)

# Setting parameters for training and testing  

Variables:

* curr_list: The currency / currencies for which we want to create the ML model
* indicators_list: The indicators which we will be using as Features
* model_for_testing: One of 'svc' /  'dec_tree' / 'logreg' / 'forest' / 'grad_boost' / 'ada_boost'

In [5]:
curr_list = [ 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/AUD' ]         # 'ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/AUD'
# indicators_list = ['BBands_high', 'BBands_low', 'RSI_ratio', 'CCI','ADX', 'ADX_dirn', 'SMA_vol_agg', 'MACD_ratio']

# indicators_list = ['CCI', 'MACD_ratio', 'ADX', 'ADX_dirn','SMA_vol_agg']

all_inds = ['SMA_agg', 'RSI_ratio', 'CCI', 'MACD_ratio', 'ADX', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low', 'SMA_vol_agg', 'Returns']

model_for_testing = 'forest'

# Preparing the data for training the classifier models

In [6]:
df_filtered = df_data.loc[ df_data.Currency.isin(curr_list) ]
df_filtered.shape

(92264, 20)

In [7]:
# X = df_filtered.loc[:,indicators_list].reset_index(drop=True)        # , 'CCI', 'Returns', 'SMA_agg', 'RSI_ratio', 'ADX_dirn', 
X = df_filtered.loc[:,all_inds].reset_index(drop=True)        
y = df_filtered.Buy_or_sell

In [8]:
y.value_counts()

1    46429
0    45835
Name: Buy_or_sell, dtype: int64

## Feature Selection Techniques

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, VarianceThreshold, chi2

Using VarianceThreshold with a value of 0.8

In [10]:
# var_t = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
# var_t.fit_transform(X)

# var_t_inds = var_t.get_support()
# best_inds = []
# for i in range(len(var_t_inds)):
#     if var_t_inds[i]:
#         best_inds += [all_inds[i]]

# best_inds

Using SelectKBest

In [11]:
k_best = SelectKBest(f_classif, k=4)

k_best.fit(X, y)
X_feat_sel = k_best.transform(X).shape

scores = {}
scores['ind'] = all_inds
scores['scores'] = k_best.scores_
k_b_df = pd.DataFrame(scores)
k_b_df.sort_values(by=['scores'], ascending=False, inplace=True)

best_inds = []

for i in range(0,8):
    ind = k_b_df.ind[i]
    best_inds.append( ind )

best_inds

['SMA_agg',
 'RSI_ratio',
 'CCI',
 'MACD_ratio',
 'ADX',
 'ADX_dirn',
 'ATR_ratio',
 'BBands_high']

## Resampling Data

In [12]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

# resampler = SMOTE(random_state= 1)
combi_sampler = SMOTEENN(random_state=42)
X , y = combi_sampler.fit_resample(X , y)
y.value_counts()

0    6403
1    6131
Name: Buy_or_sell, dtype: int64

# Start of ML training

## Importing libraries

In [13]:
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
# import xgboost as xgb 

svc = SVC()
dec_tree = DecisionTreeClassifier()
logreg = LogisticRegression( )
forest = RandomForestClassifier( )
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()
# xgboost = 

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline


## Define pipelines

In [15]:
col_transform = make_column_transformer(
    # (StandardScaler(), X.columns.to_list())
    (StandardScaler(), best_inds ),
    remainder='drop'
)
col_transform.fit_transform(X);

In [16]:
pca = PCA(n_components=3)

In [17]:
if model_for_testing == 'svc': model = ('svc', svc)
elif model_for_testing == 'logreg': model = ('logreg', logreg)
elif model_for_testing == 'dec_tree': model = ('dec_tree', dec_tree)
elif model_for_testing == 'forest': model = ('forest', forest)
elif model_for_testing == 'grad_boost': model = ('grad_boost', grad_boost)
elif model_for_testing == 'ada_boost': model = ('ada_boost', ada_boost)

pipe = Pipeline(steps= [('col_transform', col_transform), 
                    # ('pca', pca),
                    model
                    ])

# pipe 

# Step 1 - Get Cross Validation score by running the model with the default parameters

In [18]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_roc_auc = cross_val_score(pipe, X, y, cv=10, scoring='roc_auc', n_jobs=20).mean()
cross_val_roc_auc


0.6811211583388342

In [19]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_accuracy = cross_val_score(pipe, X, y, cv=10, scoring='accuracy', n_jobs=20).mean()
cross_val_accuracy

0.6316414449022506

# Step 2: Call GridSearchCV with a range of parameters

In [20]:
from sklearn.model_selection import GridSearchCV

In [24]:
params = {}

if model_for_testing == 'logreg':
    params['logreg__solver'] = ['liblinear', 'lbfgs']
    params['logreg__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['logreg__penalty'] = ['l2']


elif model_for_testing == 'svc': 
    params['svc__C'] = [0.5, 0.75, 1, 1.25, 1.5]
    params['svc__kernel'] = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']

elif model_for_testing == 'dec_tree': 
    params['dec_tree__criterion'] = ['gini', 'entropy']
    params['dec_tree__max_depth'] = list(range(3,8,1))


elif model_for_testing == 'forest':
    params['forest__n_estimators'] = list(range(100,200,10))
    params['forest__max_depth'] = list(range(3,8,1))
    params['forest__max_features'] = ['auto', 'sqrt', 'log2']


elif model_for_testing == 'grad_boost': 
    params['grad_boost__learning_rate'] = [0.075, 0.1, 0.25, 0.5]
    params['grad_boost__n_estimators'] = list(range(100,200,10))
    params['grad_boost__max_features'] = ['sqrt', 'log2']
    params['grad_boost__max_depth'] = list(range(3,8,1))
    params['grad_boost__loss'] = ['deviance', 'exponential']


elif model_for_testing == 'ada_boost': 
    params['ada_boost__base_estimator'] = [DecisionTreeClassifier(), ExtraTreeClassifier(), LogisticRegression()]
    params['ada_boost__n_estimators'] = [ 200, 250, 500, 750, 1000, 1200]
    params['ada_boost__learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1]
    params['ada_boost__algorithm'] = ['SAMME.R']



### Calling GridSearch

In [25]:
grid = GridSearchCV(pipe, params, cv=10, scoring='roc_auc', n_jobs=20)
grid.fit(X,y);

### Displaying results

In [26]:
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')
estimator = grid.best_estimator_[model_for_testing]

grid_best_params = str(grid.best_params_)
grid_best_params
gridcv_best_score = grid.best_score_

Score: 0.6818214990911347
Best params: {}


### Fitting the pipeline, with the tuned model

In [None]:
pipeline = make_pipeline(col_transform, 
            # pca, 
            estimator)
pipeline.fit(X, y)


# Save the fitted pipeline to a joblib file

In [None]:
from joblib import dump, load
from pathlib import Path

filename = Path('Joblibs/' + dt.date.today().isoformat() + '_' + model_for_testing + '_Feat_sel.joblib')
dump(pipeline, filename)


# Testing

### Get the testing data first

In [None]:
df_testing_data = pd.read_csv('Resources/Testing_data.csv', index_col=0, infer_datetime_format=True)
df_testing_data.index.set_names('Date', inplace=True)

df_testing_data.head(2)

### Setting target values

In [None]:
df_testing_data['Target_returns'] = df_testing_data.Returns.shift(-1)
df_testing_data.dropna(inplace=True)
df_testing_data['Buy_or_sell'] = df_testing_data.Target_returns.apply(lambda x: 1 if x > 0 else 0)
df_testing_data.shape

### Setting a separate currency list for testing

In [None]:
curr_list_test = curr_list

In [None]:
df_testing_subset = df_testing_data.loc[ df_testing_data.Currency.isin(curr_list_test) ] 
X_test = df_testing_subset.loc[: , all_inds].reset_index(drop=True)   
y_test = df_testing_subset.loc[:, ['Target_returns', 'Buy_or_sell']].copy()

print(f'{X_test.shape}; {y_test.shape}')

## Load the joblib file

In [None]:
pipeline = load(filename)

In [None]:
df_pred = y_test
df_pred['Pred_buy_or_sell'] = pipeline.predict(X_test)
df_pred.head()

In [None]:
from sklearn.metrics import classification_report
from imblearn.metrics import classification_report_imbalanced

y_pred = pipeline.predict(X_test)
df_predictions = pd.DataFrame(y_pred, columns=['Buy'])

print(classification_report(y_test.Buy_or_sell, y_pred))

### Getting the total returns when the strategy gave a buy signal 

In [None]:
total_returns_pred_buy = df_pred.loc[df_pred.Pred_buy_or_sell == 1].copy()
total_pnl = total_returns_pred_buy.Target_returns.sum()
total_pnl

# Writing the outcomes to a CSV

In [None]:
currency = ' '.join(curr_list)
indicators = ','.join(all_inds)
model_tested = model[0]
cross_val_roc_auc
cross_val_accuracy
total_pnl
grid_best_params
gridcv_best_score

In [None]:
df_outcomes = pd.read_csv('Resources/GridSearch_test_outcomes.csv', index_col=0)

df_outcomes
# df_outcomes = df_outcomes.append()
df_outcomes = df_outcomes.append(pd.Series([ 
    currency, 
    indicators, 
    model_tested, 
    cross_val_roc_auc, 
    cross_val_accuracy, 
    gridcv_best_score, 
    grid_best_params,
    total_pnl
], index= df_outcomes.columns), ignore_index= True)
df_outcomes.to_csv('Resources/GridSearch_test_outcomes.csv')