# Data preparation for training Machine Learning Models 

* In this notebook we will be using the price + indicator data which has been prepared in the previous notebook. We will set the feature columns and the target column. Then we will set aside a part of the data for testing.

* We will use the GridSearchCV class of the scikit-learn library to check which model gives the best score for training and validation

In [None]:
import pandas as pd

### First read the data which has been prepared in the previous notebook 

In [None]:
# Load the price + indicator table produced by the previous notebook.
# parse_dates=True is what actually converts the 'Date' index to datetimes;
# infer_datetime_format on its own parses nothing and is deprecated in pandas 2.x.
df_data = pd.read_csv(
    'Resources/Data_plus_indicators.csv',
    index_col='Date',
    parse_dates=True,
).rename(columns={'Daily_returns': 'Returns'})  # shorter name used by the later cells
df_data.head()

### Our target value needs to be the returns of the next time period, so we transform the data accordingly

In [None]:
# The label for a row is the NEXT period's return of the SAME currency.
# The frame holds several currencies, so the shift is done per Currency group;
# a plain shift(-1) would hand a row the return of whatever currency follows it.
# assumes rows are in chronological order within each currency — TODO confirm
df_data['Target_returns'] = df_data.groupby('Currency')['Returns'].shift(-1)

# The last row of each currency has no following period (NaN target) and is
# dropped here, together with any other row that contains a NaN.
df_data = df_data.dropna()

# Direction label: 1 when the next return is strictly positive, otherwise -1
# (a zero return is labelled -1).
df_data['Buy_or_sell'] = df_data['Target_returns'].gt(0).map({True: 1, False: -1})
df_data.head()

In [None]:
# Distinct currency pairs present in the prepared data.
df_data['Currency'].unique()

# Preparing the data for training the classifier models

In [None]:
# Currency pairs whose rows make up the training set.
curr_list = ['XLM/AUD', 'LTC/AUD', 'XRP/AUD', 'ETH/AUD', 'BCH/USD']

# Boolean row mask: True where the row belongs to one of the pairs above.
in_training_set = df_data['Currency'].isin(curr_list)
df_filtered = df_data[in_training_set]
df_filtered.shape

In [None]:
# Indicator columns used as model inputs ('CCI' and 'Returns' are not included).
feature_cols = ['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']
X = df_filtered.loc[:, feature_cols].reset_index(drop=True)

# Give the labels the same fresh 0..n-1 index as X. Leaving y on the original
# 'Date' index (which repeats across currencies) would misalign the two in any
# index-based pandas operation.
y = df_filtered['Buy_or_sell'].reset_index(drop=True)

In [None]:
# Count of rows per label (1 / -1) in the training target.
y.value_counts()


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE (fixed seed); X and y are overwritten with the resampled data.
# NOTE(review): resampling before cross-validation means synthetic rows built
# from a validation fold can end up in the training folds, which tends to
# inflate CV scores — consider an imblearn Pipeline with SMOTE as a step so it
# is applied to the training part of each fold only.
resampler = SMOTE(random_state=1)
X, y = resampler.fit_resample(X, y)
X.shape


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Fixed seed so cross-validation and grid-search scores are repeatable between
# runs; same value as the one passed to SMOTE in this notebook.
RANDOM_STATE = 1

# Candidate classifiers; one of them is plugged in as the last pipeline step.
dec_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
logreg = LogisticRegression(solver='liblinear', random_state=RANDOM_STATE)
forest = RandomForestClassifier(criterion='gini', random_state=RANDOM_STATE)
grad_boost = GradientBoostingClassifier(random_state=RANDOM_STATE)
ada_boost = AdaBoostClassifier(random_state=RANDOM_STATE)

In [None]:
from sklearn.model_selection import cross_val_score

## Cross-validate a Pipeline (scaling → PCA → classifier)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA

In [None]:
# Standardise every feature column. The transformer is deliberately NOT fitted
# here: it is fitted inside the pipeline (per fold during cross-validation), so
# a stand-alone fit on the full dataset would only be discarded work.
col_transform = make_column_transformer(
    (StandardScaler(), X.columns.to_list())
)

# Reduce the scaled features to 3 principal components.
pca = PCA(n_components=3)

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline

# Chain preprocessing and the classifier into a single estimator.
# The step names double as prefixes for grid-search parameter keys
# (e.g. 'ada_boost__n_estimators'). To try another classifier, swap the last
# step, e.g. ('dec_tree', dec_tree), ('forest', forest) or ('grad_boost', grad_boost).
pipeline_steps = [
    ('col_transform', col_transform),
    ('pca', pca),
    ('ada_boost', ada_boost),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Score the whole pipeline with 10-fold cross-validation, so the preprocessing
# steps are refitted inside every fold; display the mean accuracy.
fold_accuracies = cross_val_score(pipe, X, y, cv=10, scoring='accuracy')
fold_accuracies.mean()


# Attempt 1: GridSearch using 1 classifier at a time

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the search. Every key is '<pipeline step name>__<parameter>',
# so the prefix must match the classifier step currently in the pipeline.
params = {
    'ada_boost__n_estimators': list(range(120, 160, 10)),
    'ada_boost__learning_rate': [0.25, 0.5, 0.75],
}

In [None]:
# Search every combination in `params`, scoring each by 10-fold cross-validated accuracy.
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=10, scoring='accuracy')
grid.fit(X, y);

In [None]:
# Report the best cross-validated score and the parameter combination behind it.
print(f'Score: {grid.best_score_}')
print(f'Best params: {grid.best_params_}')

# Pull the tuned classifier step out of the best pipeline and display it.
best_pipeline = grid.best_estimator_
estimator = best_pipeline.named_steps['ada_boost']
estimator

In [None]:
# Importances reported by the tuned classifier. The classifier sits after the
# PCA(n_components=3) step in the pipeline, so these values refer to the 3
# principal components, not to the original indicator columns.
estimator.feature_importances_

In [None]:
# Testing: hold out one currency pair that is not part of the training list.
df_testing = df_data.loc[df_data.Currency == 'ADA/AUD']

# Same indicator columns as the training features ('CCI' and 'Returns' are not included).
test_feature_cols = ['SMA_agg', 'RSI_ratio', 'ADX_dirn', 'ATR_ratio', 'BBands_high', 'BBands_low']
X_test = df_testing.loc[:, test_feature_cols].reset_index(drop=True)

# Reset the label index too, so X_test and y_test share the same 0..n-1 index
# and stay aligned in any index-based pandas operation.
y_test = df_testing['Buy_or_sell'].reset_index(drop=True)

print(f'{X_test.shape}; {y_test.shape}')

In [None]:
# Rebuild the scaling -> PCA -> classifier chain around the tuned classifier
# and fit it on the training data.
pipeline = make_pipeline(col_transform, pca, estimator)
pipeline.fit(X, y)

# Tally how often each label is predicted for the held-out currency.
test_predictions = pipeline.predict(X_test)
df_pred = pd.DataFrame(test_predictions)
df_pred.value_counts()

In [None]:
# pipeline.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Run predict() once and feed the same array to both the DataFrame and the
# report, instead of predicting the same input twice.
y_pred = pipeline.predict(X_test)
df_predictions = pd.DataFrame(y_pred, columns=['Buy'])

# Per-class precision / recall / f1 as a nested dict (output_dict=True).
report = classification_report(y_test, y_pred, output_dict=True)
report

# Attempt 2 - Multiple Classifiers in a single GridSearch Call

In [2]:
# Project-local module, imported under the alias `pr`.
from CronJobs import predictions as pr
# import nest_asyncio
# nest_asyncio.apply()

# presumably reports the current working directory — confirm in CronJobs/predictions.py
print(pr.printcwd())

# print(df)
# df.head(5)
# print(len(result))

/Users/hemanglunagaria/Documents/Monash_FinTech_repos/project_2_ml_trading/Data_ML_models_training


In [None]:
# NOTE(review): `result` is not assigned in any visible cell of this notebook,
# so this lookup raises NameError on a fresh kernel — define `result` first or
# remove the cell.
result[1]

In [None]:
# Same project-local import as in the earlier cell, repeated here.
from CronJobs import predictions as pr

# presumably fetches OHLC price data synchronously — verify in CronJobs/predictions.py
pr.getOHLCData_sync()

In [None]:
# Same project-local import as in the earlier cells, repeated here.
from CronJobs import predictions as pr

# Calls predictions() from the project's CronJobs.predictions module; what it
# returns or writes is not visible in this notebook.
pr.predictions()

*/2 * * * * /Users/hemanglunagaria/Documents/Monash_FinTech_repos/project_2_ml_trading/cron_job_script > /tmp/stdout.log 2> /tmp/stderr.log