## Modelling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

In [5]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=False)

In [6]:
pd.set_option('display.max_columns', None)

In [7]:
!ls

Double_Tap_Feature_Engineering.ipynb gj_30base.csv
Forex_Data_Analysis.ipynb            gj_30minsupres.csv
Forex_Models.ipynb                   gj_4base.csv
README.md                            gj_4hr.csv
forex.html                           gj_cleandraft.csv
gj_30_RSI_exp.csv                    gpbjpy_dataframe_gen.ipynb


In [38]:
#read in the latest data set
df = pd.read_csv("gj_cleandraft.csv")

In [39]:
df.tail()

Unnamed: 0,date,open,high,low,close,vwma,sr_low,sr_high,volume,sent_30,RSI,direction,body_size,top_wick,bottom_wick,trade_class,trade_class_two,day_of_week,day,month,year,time_24h,session,vfive,rel_vol,out_mag,signal,break_level,sent_4h,x_vwma,pos_can,neg_can,prof_loss,det_trade,short_rsi,long_rsi,sr_dist
18740,2020-07-09 09:30:00,135.658,135.92,135.634,135.783,135.487533,132.186,135.505,6420,0.015488,70.570252,long,0.125,0.262,0.024,no_trade,no_trade,Thursday,9,7,2020,09:30:00,morning,7397.8,0.867826,0.0,no_trade,no_trade,0.097192,0,0.0,0.295467,0.0,0,0.0,0.0,1.008955
18741,2020-07-09 10:00:00,135.783,135.808,135.72,135.756,135.510547,132.186,135.505,4877,0.0173,68.244824,short,0.027,0.025,0.063,win,win,Thursday,9,7,2020,10:00:00,morning,7056.2,0.691165,0.13,trade,in_body,0.097192,0,0.245453,0.0,0.13,win,3.24,0.0,0.251
18742,2020-07-09 10:30:00,135.756,135.784,135.661,135.677,135.530777,132.186,135.505,7407,0.016016,61.825399,short,0.079,0.028,0.095,no_trade,no_trade,Thursday,9,7,2020,10:30:00,morning,6260.2,1.183189,0.0,no_trade,no_trade,0.097192,0,0.146223,0.0,0.0,0,0.0,0.0,0.172
18743,2020-07-09 11:00:00,135.677,135.752,135.626,135.662,135.551976,132.186,135.505,6730,0.014076,60.658675,short,0.015,0.075,0.051,no_trade,no_trade,Thursday,9,7,2020,11:00:00,morning,6175.4,1.089808,0.0,no_trade,no_trade,0.097192,0,0.110024,0.0,0.0,0,0.0,0.0,0.157
18744,2020-07-09 11:30:00,135.662,135.761,135.643,135.737,135.573992,132.186,135.505,5069,0.014088,64.287577,long,0.075,0.099,0.019,win,loss,Thursday,9,7,2020,11:30:00,morning,6218.0,0.815214,0.027,trade,in_wick,0.097192,0,0.0,0.163008,-0.027,loss,0.0,0.0,1.008955


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18745 entries, 0 to 18744
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             18745 non-null  object 
 1   open             18745 non-null  float64
 2   high             18745 non-null  float64
 3   low              18745 non-null  float64
 4   close            18745 non-null  float64
 5   vwma             18745 non-null  float64
 6   sr_low           18745 non-null  float64
 7   sr_high          18745 non-null  float64
 8   volume           18745 non-null  int64  
 9   sent_30          18745 non-null  float64
 10  RSI              18745 non-null  float64
 11  direction        18745 non-null  object 
 12  body_size        18745 non-null  float64
 13  top_wick         18745 non-null  float64
 14  bottom_wick      18745 non-null  float64
 15  trade_class      18745 non-null  object 
 16  trade_class_two  18745 non-null  object 
 17  day_of_week 

### We need to decide our training and test features and target 


features =  vwma, sr_low, sr_high, volume, sent_30, body_size, top_wick, bottom_wick, day_of_week(c), day(ss), month(c),
session(c), vfive(ss), rel_vol(Ss), break_level(c), sent_4h(ss), x_vwma(b), pos_can(ss), neg_can(ss), short_rsi(ss)
long_rsi(ss), sr_dist(ss) (22)

target = det_trade

As we have chosen det_trade to be our target variable we'll need to drop all the 0's from this feature. Remember we are trying to predict the outcome of trades signalled by our hypothesis, not whether we should trade or not. 

#### Let's try with a binary classification before getting into the multiclass situation, 

#### We'll make strong win our 1 class and strong loss our 0 class. 

In [41]:
df.det_trade.value_counts()

0              9987
loss           3896
strong win     2742
win            1807
strong loss     313
Name: det_trade, dtype: int64

In [42]:
df1 = df.loc[df["det_trade"]!='0']#as this was a mixed feature type 0 was a string so: "0"

In [43]:
df1 = df1.loc[df["det_trade"]!='loss']

In [44]:
df1 = df1.loc[df["det_trade"]!='win']

In [45]:
#Let's check we now have just two class outcomes
df1.det_trade.value_counts()

strong win     2742
strong loss     313
Name: det_trade, dtype: int64

In [46]:
#Split our independent and dependent features. 
X = df1[["vwma", "volume", "vfive", "rel_vol", "x_vwma", "sr_low", "sr_high", "sr_dist", "sent_30", "sent_4h",
       "body_size", "top_wick", "bottom_wick", "pos_can", "neg_can", "break_level", "day_of_week", "day",
       "month", "session", "short_rsi", "long_rsi"]]

In [47]:
y = df1[["det_trade"]]

In [48]:
#get dummies for the categorical variables
X = pd.get_dummies(X, columns= ["break_level", "day_of_week", "day", "month", "session"], drop_first=True)

In [49]:
X.head(3)

Unnamed: 0,vwma,volume,vfive,rel_vol,x_vwma,sr_low,sr_high,sr_dist,sent_30,sent_4h,body_size,top_wick,bottom_wick,pos_can,neg_can,short_rsi,long_rsi,break_level_in_wick,break_level_out_wick,day_of_week_Monday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,session_evening,session_morning,session_night
2,138.033155,8658,9090.6,0.952412,0,137.268,138.012,0.075,0.028058,0.004876,0.131,0.064,0.194,0.053845,0.0,0.0,0.0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,138.083179,4794,5863.0,0.81767,0,137.268,138.012,0.663,-0.031096,0.037737,0.106,0.001,0.184,0.0,0.152179,0.0,0.0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12,138.064216,3150,5786.8,0.544342,0,137.268,138.012,0.664,-0.037686,0.065515,0.016,0.055,0.1,0.0,0.132216,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [50]:
X.shape

(3055, 68)

In [71]:
lb = LabelBinarizer()
y =lb.fit_transform(y)
y = pd.DataFrame(y, columns=['det_trade'])
y

Unnamed: 0,det_trade
0,1
1,1
2,1
3,0
4,1
...,...
3050,1
3051,1
3052,1
3053,1


In [21]:
#convert the target feature into a binary class variable
# y.det_trade[y.det_trade=='strong win'] = 1
# y.det_trade[y.det_trade=='strong loss'] = 0

In [72]:
y.head()

Unnamed: 0,det_trade
0,1
1,1
2,1
3,0
4,1


In [73]:
y.shape

(3055, 1)

In [74]:
y.det_trade.value_counts()

1    2742
0     313
Name: det_trade, dtype: int64

In [75]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3055 entries, 0 to 3054
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   det_trade  3055 non-null   int64
dtypes: int64(1)
memory usage: 24.0 KB


## Train, Validation, Test Split

In [76]:
#create training, validation and test data sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.2)

## Train a Basic Model

In [81]:
lr = LogisticRegression()

grid = {'C':10.0**np.arange(-2, 3), 'penalty':['l1', 'l2'], 'solver':['lbfgs', 'liblinear']}
cv = skf

In [82]:
clf = GridSearchCV(lr, param_grid=grid, cv=skf, n_jobs=-1, scoring='f1_macro')

In [88]:
clf.fit(X_train_val, y_train_val)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'liblinear']},
             scoring='f1_macro')

In [90]:
y_pred_val = clf.predict(X_test_val)
print(confusion_matrix(y_pred_val, y_test_val))
print(accuracy_score(y_test_val,y_pred_val))

print(classification_report(y_test_val,y_pred_val))

[[  0   0]
 [ 57 432]]
0.8834355828220859
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.88      1.00      0.94       432

    accuracy                           0.88       489
   macro avg       0.44      0.50      0.47       489
weighted avg       0.78      0.88      0.83       489



so at this stage the model is essentially predicting all wins :)

true positives - we predicted 0 strong losses that were actually strong losses \
false positives - we predicted 0 strong losses that were actually strong wins \
false negatives - we predicted 57 strong wins that were actually strong losses \
true negatives - we predicted 432 strong wins that were actually strong wins

## Let's try it with a basic Random Forest

In [91]:
rf = RandomForestClassifier()
rf.fit(X_train_val, y_train_val)

RandomForestClassifier()

In [92]:
y_pred_val = rf.predict(X_test_val)
print(confusion_matrix(y_pred_val, y_test_val))
print(accuracy_score(y_test_val,y_pred_val))

print(classification_report(y_test_val,y_pred_val))

[[  0   0]
 [ 57 432]]
0.8834355828220859
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.88      1.00      0.94       432

    accuracy                           0.88       489
   macro avg       0.44      0.50      0.47       489
weighted avg       0.78      0.88      0.83       489



In [94]:
##### It's obvious now that the imbalanced data is causing our results to sway heavily to the majority class. 

In [96]:
y_train_val.det_trade.value_counts()

1    1773
0     182
Name: det_trade, dtype: int64

We currently have an approx 1:10 ratio of strong wins to strong losses - we'll need to address this imbalance in the data. We'll try two different methods of oversampling, 1) RandomOverSampling and 2) SmoteTOMEK. 

## SMOTE

In [102]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [105]:
#create an instance of the SMOTE class, including the new ratio of class outcomes
smote = SMOTETomek(0.75) 
X_train_sm, y_train_sm = smote.fit_sample(X_train_val, y_train_val)

print('The number of classes before fit {}'.format(Counter(y_train_val.det_trade)))
print('The number of classes after fit {}'.format(Counter(y_train_sm.det_trade)))


The number of classes before fit Counter({1: 1773, 0: 182})
The number of classes after fit Counter({1: 1577, 0: 1133})


In [106]:
rf_sm = RandomForestClassifier()
rf_sm.fit(X_train_sm, y_train_sm)

RandomForestClassifier()

In [108]:
y_pred_sm = rf_sm.predict(X_test_val)
print(confusion_matrix(y_pred_sm, y_test_val))
print(accuracy_score(y_test_val,y_pred_sm))

print(classification_report(y_test_val,y_pred_val))

[[  1  15]
 [ 56 417]]
0.8548057259713702
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.88      1.00      0.94       432

    accuracy                           0.88       489
   macro avg       0.44      0.50      0.47       489
weighted avg       0.78      0.88      0.83       489



so at this stage the model is essentially predicting all wins :)

true positives - we predicted 1 strong losses that were actually strong losses \
false positives - we predicted 15 strong losses that were actually strong wins \
false negatives - we predicted 56 strong wins that were actually strong losses \
true negatives - we predicted 417 strong wins that were actually strong wins

#### So this is a terrible model to present. Let's see if we can gain any further insight through hyper parameter tuning

## Random Search CV

In [112]:
# Let's specify some parameter ranges
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

#Number of features to consider
max_features = ['auto', 'sqrt', 'log2']

#Max number of levels in each tree
max_depth = [int(x) for x in np.linspace(10,1000,10)]

#Min samples to split a node
min_samples_split = [2, 5, 10, 14]

#Min samples required at each node 
min_samples_leaf = [1, 2, 4, 6, 8]

#Create a grid to store these paramters
random_grid = {'n_estimators': n_estimators, 
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion':['entropy', 'gini']
              }

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [113]:
## Train a random forest classifier and then pass this trained model into a randomsearchCV
rf1 = RandomForestClassifier()
rf1_rand = RandomizedSearchCV(estimator=rf1, param_distributions=random_grid, n_iter=100, cv=3, verbose=5,
                              random_state=42, n_jobs=-1)

In [115]:
rf1_rand.fit(X_train_sm, y_train_sm)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   50.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.1min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=5)

In [116]:
rf1_rand.best_params_

{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 890,
 'criterion': 'gini'}

In [117]:
rf1_rand

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=5)

In [118]:
rf1_rand.best_estimator_

RandomForestClassifier(max_depth=890, n_estimators=1200)

In [119]:
best_random_grid = rf1_rand.best_estimator_

In [120]:
y_pred = rf1_rand.predict(X_test_val)
print(confusion_matrix(y_pred, y_test_val))
print(accuracy_score(y_test_val,y_pred))

print(classification_report(y_test_val,y_pred))

[[  1  11]
 [ 56 421]]
0.8629856850715747
              precision    recall  f1-score   support

           0       0.08      0.02      0.03        57
           1       0.88      0.97      0.93       432

    accuracy                           0.86       489
   macro avg       0.48      0.50      0.48       489
weighted avg       0.79      0.86      0.82       489

