# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the features columns and the target data column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV method of the scikit-learn library and check which model is giving the best score for training and validation

In [24]:
import pandas as pd

### First read the data which has been prepared in the previous notebook 

In [25]:
df_data = pd.read_csv('Resources/Data_plus_indicators.csv', index_col='Date', infer_datetime_format=True)
df_data.rename(columns={'Daily_returns': 'Returns'}, inplace= True)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-07-27 00:00:00,3025.84,3039.75,3000.0,3033.37,15.40624,0.966833,0.829258,-50.812077,45.004645,0.0,1.223889,1.006075,1.005453,ETH/AUD,0.002949
2021-07-27 01:00:00,3004.4,3004.4,2942.2,2948.3,15.514904,0.964578,0.56238,-166.666667,47.33699,0.0,1.322141,1.03173,0.989691,ETH/AUD,-0.028045
2021-07-27 02:00:00,2953.44,2953.45,2943.87,2943.87,3.495522,0.964405,0.552283,-100.810913,49.202866,0.0,1.162226,1.030266,0.996393,ETH/AUD,-0.001503
2021-07-27 03:00:00,2928.65,2965.45,2928.65,2958.96,9.660272,0.965454,0.858802,-61.607768,51.625298,0.0,1.116467,1.020858,1.005456,ETH/AUD,0.005126
2021-07-27 04:00:00,2939.79,2964.14,2939.79,2962.08,0.165321,0.965746,0.918802,-39.457237,53.563243,0.0,1.04036,1.013488,1.008677,ETH/AUD,0.001054


### Our target value needs to be the returns of the next timeperiod, so transforming the data accordingly

In [26]:
df_data['Target_returns'] = df_data.Returns.shift(-1)
df_data.dropna(inplace=True)
df_data['Buy_or_sell'] = df_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns,Target_returns,Buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-07-27 00:00:00,3025.84,3039.75,3000.0,3033.37,15.40624,0.966833,0.829258,-50.812077,45.004645,0.0,1.223889,1.006075,1.005453,ETH/AUD,0.002949,-0.028045,-1
2021-07-27 01:00:00,3004.4,3004.4,2942.2,2948.3,15.514904,0.964578,0.56238,-166.666667,47.33699,0.0,1.322141,1.03173,0.989691,ETH/AUD,-0.028045,-0.001503,-1
2021-07-27 02:00:00,2953.44,2953.45,2943.87,2943.87,3.495522,0.964405,0.552283,-100.810913,49.202866,0.0,1.162226,1.030266,0.996393,ETH/AUD,-0.001503,0.005126,1
2021-07-27 03:00:00,2928.65,2965.45,2928.65,2958.96,9.660272,0.965454,0.858802,-61.607768,51.625298,0.0,1.116467,1.020858,1.005456,ETH/AUD,0.005126,0.001054,1
2021-07-27 04:00:00,2939.79,2964.14,2939.79,2962.08,0.165321,0.965746,0.918802,-39.457237,53.563243,0.0,1.04036,1.013488,1.008677,ETH/AUD,0.001054,0.003494,1


In [27]:
df_data.Currency.unique()

array(['ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/USD'],
      dtype=object)

# Preparing the data for training the classifier models

In [28]:
curr_list = ['LTC/AUD', 'XLM/AUD'] #, 'XRP/AUD', 'LTC/AUD'
df_filtered = df_data.loc[ df_data.Currency.isin(curr_list) ]
df_filtered.shape

(1387, 17)

In [29]:
X = df_filtered.loc[:,['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']].reset_index(drop=True)        # , 'CCI', 'Returns'
y = df_filtered.Buy_or_sell

In [30]:
y.value_counts()


-1    1112
 1     275
Name: Buy_or_sell, dtype: int64

In [31]:
from imblearn.over_sampling import SMOTE
resampler = SMOTE(random_state= 1)
X , y = resampler.fit_resample(X , y)
X.shape


(2224, 6)

In [32]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

dec_tree = DecisionTreeClassifier()
logreg = LogisticRegression( solver='liblinear')
forest = RandomForestClassifier( criterion='gini')
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()

In [33]:
from sklearn.model_selection import cross_val_score

## Cross-validate a Pipeline with 1 feature

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA

In [35]:
col_transform = make_column_transformer(
    (StandardScaler(), X.columns.to_list())
)
col_transform.fit_transform(X);

pca = PCA(n_components=3)

In [36]:
# chain sequential steps together
from sklearn.pipeline import make_pipeline, Pipeline
# pipe = make_pipeline(col_transform, dec_tree)
pipe = Pipeline(steps= [('col_transform', col_transform), 
                    ('pca', pca),
                    ('logreg', logreg)
                    # ('dec_tree', dec_tree)
                    # ('forest', forest)
                    # ('grad_boost', grad_boost)
                    # ('ada_boost', ada_boost)
                    ])

# (col_transform, dec_tree)

In [37]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()


0.5328162242960449

# Attempt 1: GridSearch using 1 classifier at a time

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
params = {}
params['logreg__C'] = [0.05, 0.06, 0.07]
params['logreg__penalty'] = ['l1']

# params['dec_tree__criterion'] = ['gini']

# params['forest__n_estimators'] = list(range(100,150,10))
# params['forest__max_depth'] = list(range(3,6,1))
# params['forest__max_features'] = list(range(2,4,1))

# params['grad_boost__learning_rate'] = [0.1, 0.3, 0.5]
# params['grad_boost__n_estimators'] = list(range(100,200,10))
# params['grad_boost__max_features'] = ['auto', 'sqrt', 'log2']
# params['grad_boost__max_depth'] = list(range(3,8,1))
# params['grad_boost__loss'] = ['deviance', 'exponential']

# params['ada_boost__n_estimators'] = list(range(100,200,10))
# params['ada_boost__learning_rate'] = [0.1, 0.5, 1, 2]
# params['ada_boost__algorithm'] = ['SAMME', 'SAMME.R']

In [40]:
grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy')
grid.fit(X,y);

In [41]:
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')
estimator = grid.best_estimator_['logreg']
estimator

Score: 0.5422575041409122
Best params: {'logreg__C': 0.07, 'logreg__penalty': 'l1'}


LogisticRegression(C=0.07, penalty='l1', solver='liblinear')

In [42]:
# estimator.feature_importances_

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [47]:
# Testing
df_testing = df_data.loc[ df_data.Currency == 'ETH/AUD' ]
X_test = df_testing.loc[: , ['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']].reset_index(drop=True)   # , 'CCI', 'Returns'
y_test = df_testing.Buy_or_sell

print(f'{X_test.shape}; {y_test.shape}')

(705, 6); (705,)


In [49]:
pipeline = make_pipeline(col_transform, pca, estimator)
pipeline.fit(X, y)
# pipeline.predict(X_test)
df_pred = pd.DataFrame(pipeline.predict(X_test))
df_pred.value_counts()

-1    377
 1    328
dtype: int64

In [50]:
# pipeline.predict(X_test)

In [52]:
from sklearn.metrics import classification_report, confusion_matrix

df_predictions = pd.DataFrame(pipeline.predict(X_test), columns=['Buy'])
report = classification_report(y_test, pipeline.predict(X_test), output_dict=True)
report

{'-1': {'precision': 0.5039787798408488,
  'recall': 0.5523255813953488,
  'f1-score': 0.5270457697642162,
  'support': 344},
 '1': {'precision': 0.5304878048780488,
  'recall': 0.481994459833795,
  'f1-score': 0.5050798258345428,
  'support': 361},
 'accuracy': 0.5163120567375886,
 'macro avg': {'precision': 0.5172332923594488,
  'recall': 0.5171600206145719,
  'f1-score': 0.5160627977993795,
  'support': 705},
 'weighted avg': {'precision': 0.5175529047180534,
  'recall': 0.5163120567375886,
  'f1-score': 0.5157979601775324,
  'support': 705}}

# Attempt 2 - Multiple Classifiers in a single GridSearch Call