# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the features columns and the target data column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV method of the scikit-learn library and check which model is giving the best score for training and validation

In [95]:
import pandas as pd

### First read the data which has been prepared in the previous notebook 

In [96]:
df_data = pd.read_csv('Resources/Data_plus_indicators.csv', index_col='Date', infer_datetime_format=True)
df_data.rename(columns={'Daily_returns': 'Returns'}, inplace= True)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-07-27 00:00:00,3025.84,3039.75,3000.0,3033.37,15.40624,0.966833,0.829258,-50.812077,45.004645,0.0,1.223889,1.006075,1.005453,ETH/AUD,0.002949
2021-07-27 01:00:00,3004.4,3004.4,2942.2,2948.3,15.514904,0.964578,0.56238,-166.666667,47.33699,0.0,1.322141,1.03173,0.989691,ETH/AUD,-0.028045
2021-07-27 02:00:00,2953.44,2953.45,2943.87,2943.87,3.495522,0.964405,0.552283,-100.810913,49.202866,0.0,1.162226,1.030266,0.996393,ETH/AUD,-0.001503
2021-07-27 03:00:00,2928.65,2965.45,2928.65,2958.96,9.660272,0.965454,0.858802,-61.607768,51.625298,0.0,1.116467,1.020858,1.005456,ETH/AUD,0.005126
2021-07-27 04:00:00,2939.79,2964.14,2939.79,2962.08,0.165321,0.965746,0.918802,-39.457237,53.563243,0.0,1.04036,1.013488,1.008677,ETH/AUD,0.001054


### Our target value needs to be the returns of the next timeperiod, so transforming the data accordingly

In [97]:
df_data['Target_returns'] = df_data.Returns.shift(-1)
df_data.dropna(inplace=True)
df_data['Buy_or_sell'] = df_data.Target_returns.apply(lambda x: 1 if x > 0 else -1)
df_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_agg,RSI_ratio,CCI,ADX,ADX_dirn,ATR_ratio,BBands_high,BBands_low,Currency,Returns,Target_returns,Buy_or_sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-07-27 00:00:00,3025.84,3039.75,3000.0,3033.37,15.40624,0.966833,0.829258,-50.812077,45.004645,0.0,1.223889,1.006075,1.005453,ETH/AUD,0.002949,-0.028045,-1
2021-07-27 01:00:00,3004.4,3004.4,2942.2,2948.3,15.514904,0.964578,0.56238,-166.666667,47.33699,0.0,1.322141,1.03173,0.989691,ETH/AUD,-0.028045,-0.001503,-1
2021-07-27 02:00:00,2953.44,2953.45,2943.87,2943.87,3.495522,0.964405,0.552283,-100.810913,49.202866,0.0,1.162226,1.030266,0.996393,ETH/AUD,-0.001503,0.005126,1
2021-07-27 03:00:00,2928.65,2965.45,2928.65,2958.96,9.660272,0.965454,0.858802,-61.607768,51.625298,0.0,1.116467,1.020858,1.005456,ETH/AUD,0.005126,0.001054,1
2021-07-27 04:00:00,2939.79,2964.14,2939.79,2962.08,0.165321,0.965746,0.918802,-39.457237,53.563243,0.0,1.04036,1.013488,1.008677,ETH/AUD,0.001054,0.003494,1


In [98]:
df_data.Currency.unique()

array(['ETH/AUD', 'XRP/AUD', 'LTC/AUD', 'ADA/AUD', 'XLM/AUD', 'BCH/USD'],
      dtype=object)

# Preparing the data for training the classifier models

In [99]:
curr_list = ['XLM/AUD', 'LTC/AUD', 'XRP/AUD', 'ETH/AUD', 'BCH/USD'] #, 'LTC/AUD'
df_filtered = df_data.loc[ df_data.Currency.isin(curr_list) ]
df_filtered.shape

(3501, 17)

In [100]:
X = df_filtered.loc[:,['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']].reset_index(drop=True)        # , 'CCI', 'Returns'
y = df_filtered.Buy_or_sell

In [101]:
y.value_counts()


-1    2203
 1    1298
Name: Buy_or_sell, dtype: int64

In [102]:
from imblearn.over_sampling import SMOTE
resampler = SMOTE(random_state= 1)
X , y = resampler.fit_resample(X , y)
X.shape


(4406, 6)

In [103]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

dec_tree = DecisionTreeClassifier()
logreg = LogisticRegression( solver='liblinear')
forest = RandomForestClassifier( criterion='gini')
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()

In [104]:
from sklearn.model_selection import cross_val_score

## Cross-validate a Pipeline with 1 feature

In [105]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA

In [106]:
col_transform = make_column_transformer(
    (StandardScaler(), X.columns.to_list())
)
col_transform.fit_transform(X);

pca = PCA(n_components=3)

In [107]:
# chain sequential steps together
from sklearn.pipeline import make_pipeline, Pipeline
# pipe = make_pipeline(col_transform, dec_tree)
pipe = Pipeline(steps= [('col_transform', col_transform), 
                        ('pca', pca),
                        # ('dec_tree', dec_tree)
                        # ('forest', forest)
                        # ('grad_boost', grad_boost)
                        ('ada_boost', ada_boost)
                    ])

# (col_transform, dec_tree)

In [108]:
# cross-validate the entire process
# thus, preprocessing occurs within each fold of cross-validation
cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()


0.5678617810760669

# Attempt 1: GridSearch using 1 classifier at a time

In [109]:
from sklearn.model_selection import GridSearchCV

In [110]:
params = {}
# params['logisticregression__C'] = [0.05, 0.06, 0.07]
# params['logisticregression__penalty'] = ['l1']

# params['dec_tree__criterion'] = ['gini']

# params['forest__n_estimators'] = list(range(100,150,10))
# params['forest__max_depth'] = list(range(3,6,1))
# params['forest__max_features'] = list(range(2,4,1))

# params['grad_boost__learning_rate'] = [0.1, 0.3, 0.5]
# params['grad_boost__n_estimators'] = list(range(100,200,10))
# params['grad_boost__max_features'] = ['auto', 'sqrt', 'log2']
# params['grad_boost__max_depth'] = list(range(3,8,1))
# params['grad_boost__loss'] = ['deviance', 'exponential']

params['ada_boost__n_estimators'] = list(range(120,160,10))
params['ada_boost__learning_rate'] = [ 0.25, 0.5, 0.75]
# params['ada_boost__algorithm'] = ['SAMME', 'SAMME.R']

In [111]:
grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy')
grid.fit(X,y);

In [112]:
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')
estimator = grid.best_estimator_['ada_boost']
estimator

Score: 0.577166563595135
Best params: {'ada_boost__learning_rate': 0.5, 'ada_boost__n_estimators': 130}


AdaBoostClassifier(learning_rate=0.5, n_estimators=130)

In [113]:
estimator.feature_importances_

array([0.4       , 0.30769231, 0.29230769])

In [114]:
# Testing
df_testing = df_data.loc[ df_data.Currency == 'ADA/AUD' ]
X_test = df_testing.loc[: , ['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']].reset_index(drop=True)   # , 'CCI', 'Returns'
y_test = df_testing.Buy_or_sell

print(f'{X_test.shape}; {y_test.shape}')

(705, 6); (705,)


In [115]:
pipeline = make_pipeline(col_transform, pca, estimator)
pipeline.fit(X, y)
df_pred = pd.DataFrame(pipeline.predict(X_test))
df_pred.value_counts()

 1    369
-1    336
dtype: int64

In [116]:
# pipeline.predict(X_test)

In [117]:
from sklearn.metrics import classification_report
df_predictions = pd.DataFrame(pipeline.predict(X_test), columns=['Buy'])
report = classification_report(y_test, pipeline.predict(X_test), output_dict=True)
report

{'-1': {'precision': 0.6636904761904762,
  'recall': 0.5022522522522522,
  'f1-score': 0.5717948717948718,
  'support': 444},
 '1': {'precision': 0.4010840108401084,
  'recall': 0.5670498084291188,
  'f1-score': 0.46984126984126984,
  'support': 261},
 'accuracy': 0.526241134751773,
 'macro avg': {'precision': 0.5323872435152923,
  'recall': 0.5346510303406855,
  'f1-score': 0.5208180708180707,
  'support': 705},
 'weighted avg': {'precision': 0.5664702102948082,
  'recall': 0.526241134751773,
  'f1-score': 0.5340503468163043,
  'support': 705}}

# Attempt 2 - Multiple Classifiers in a single GridSearch Call

In [1]:
from CronJobs import predictions as pr

result = pr.getData()

print(result)

RuntimeError: This event loop is already running