In [3]:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 13:25:40 2022

@author: Malhar
Logistic regression of variables affecting Buy and Sell outcomes. Rather than make each outcome a categorical, I chose to run
each regression (buy/sell) separately to achieve better fits

"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
#we use the imbalanced learn library to correct imbalanced data
from imblearn.over_sampling import RandomOverSampler


In [4]:

# Load the dataset that was cleaned in the previous notebook
df = pd.read_csv("EURUSD1H_Cleaned.csv")

# Quick class-balance check: True/False counts for each signal, then the frame's shape
for signal in ('Buy', 'Sell'):
    print(df[signal].value_counts())
print(df.shape)
# Buy is True 15.72% of the time and Sell 15.54% of the time, so the data will need balancing

False    99088
True     18482
Name: Buy, dtype: int64
False    99299
True     18271
Name: Sell, dtype: int64
(117570, 52)


In [5]:
# Turn the True/False Buy/Sell targets into 0/1 numbers (these columns are not scaled later)
target_cols = ['Buy', 'Sell']
df[target_cols] = df[target_cols] * 1

# There are many feature columns, so build the list once and reuse it to slice df.
# The order is: base columns, moving averages, ranges, then HH/LL pairs for 2..12.
base_cols = ['Volume', 'Month', 'Day', 'Hour', 'Change', 'Range']
windows = [5, 7, 9, 12, 14, 21, 24, 50, 100, 120, 200]
ma_cols = [f'MA{w}' for w in windows]
range_cols = [f'Range{w}' for w in windows]
hh_ll_cols = [f'{prefix}{k}' for k in range(2, 13) for prefix in ('HH', 'LL')]
col_x = base_cols + ma_cols + range_cols + hh_ll_cols

# x holds the independent variables, y the dependent variable;
# the b suffix is for the Buy outcome and the s suffix for the Sell outcome
xb, yb = df[col_x], df['Buy']
xs, ys = df[col_x], df['Sell']

In [6]:
# Hold-out split for Buy and Sell: 25% of the rows go to the test set, and both
# calls use the same seed.
# NOTE(review): train_test_split shuffles by default; if the rows are time-ordered bars
# with rolling-window features, a shuffled split may leak information between train
# and test -- confirm whether a chronological split is intended.
holdout = {'test_size': 0.25, 'random_state': 0}
xbtrain, xbtest, ybtrain, ybtest = train_test_split(xb, yb, **holdout)
xstrain, xstest, ystrain, ystest = train_test_split(xs, ys, **holdout)

In [7]:
# Balance the training classes with imbalanced-learn's RandomOverSampler
# (less computationally intensive than SMOTE).
# Only the training split is resampled; the test split is left untouched.
# random_state is fixed so the randomly duplicated rows -- and therefore the fitted
# models and every downstream metric -- are reproducible across runs, matching the
# seed already used for the split and the classifiers.
xbtrain, ybtrain = RandomOverSampler(random_state = 0).fit_resample(xbtrain, ybtrain)
xstrain, ystrain = RandomOverSampler(random_state = 0).fit_resample(xstrain, ystrain)

In [8]:
# Standardise the features: each scaler is fitted on its (resampled) training split
# only, and that same fitted scaler is then applied to the matching test split
sc_xb = StandardScaler().fit(xbtrain)
xbtrain, xbtest = sc_xb.transform(xbtrain), sc_xb.transform(xbtest)

sc_xs = StandardScaler().fit(xstrain)
xstrain, xstest = sc_xs.transform(xstrain), sc_xs.transform(xstest)

In [9]:
# Fit one logistic regression model per signal, with identical settings and seed
classifier_buy = LogisticRegression(max_iter = 1000000, random_state = 0).fit(xbtrain, ybtrain)
classifier_sell = LogisticRegression(max_iter = 1000000, random_state = 0).fit(xstrain, ystrain)

# Class predictions for the held-out test features of each model
yb_pred = classifier_buy.predict(xbtest)
ys_pred = classifier_sell.predict(xstest)

# Confusion matrices of actual vs. predicted labels on the test split
cmb = confusion_matrix(ybtest, yb_pred)
cms = confusion_matrix(ystest, ys_pred)

# Print the Buy results first, then the Sell results: matrix followed by precision
for matrix, actual, predicted in ((cmb, ybtest, yb_pred), (cms, ystest, ys_pred)):
    print("Confusion Matrix : \n", matrix)
    print('Precision is ' + str(precision_score(actual, predicted)))

Confusion Matrix : 
 [[17386  7349]
 [ 1995  2663]]
Precision is 0.2659808230123851
Confusion Matrix : 
 [[17469  7308]
 [ 1985  2631]]
Precision is 0.26471476003622096


In [12]:
# balanced_accuracy_score is the average of the per-class recalls, so it is NOT the
# plain accuracy shown in the classification report below; the label says so explicitly.
print('Buy balanced accuracy is ' + str(balanced_accuracy_score(ybtest, yb_pred)))
print('Buy report:')
print(classification_report(ybtest, yb_pred))

# Interpretation (written as comments rather than a bare string literal, which Jupyter
# would echo back as this cell's output):
#
# The precision is about 26-27% in both cases, and with K-fold cross validation rather
# than hold-out cross validation (HOCV) this is likely to get worse. It is still
# certainly better than the base rate [the 15-16% share of positives noted above], but
# we definitely want this to be much better. We could try to scatter-chart the dependent
# variables and use simple visualisation to manually flatten the data by keeping only
# those with appropriate distributions, or we could go back and do further feature
# engineering, or optimise the regressor based on the balancing method.
# The best option, by conventional wisdom, is to use a better algorithm. In the next
# notebook, we'll try a simple decision tree.

Buy accuracy is 0.6372976175194285
Buy report:
              precision    recall  f1-score   support

           0       0.90      0.70      0.79     24735
           1       0.27      0.57      0.36      4658

    accuracy                           0.68     29393
   macro avg       0.58      0.64      0.58     29393
weighted avg       0.80      0.68      0.72     29393



"\nSo the precision is about 26-27% in both cases, and with cross validation with Kfolds rather than\nHOCV, this is likely to get worse. It still certainly better than 50-50, but\nwe definately want this to be much better. We could try to scatterchart the dependent variables\nand use simple visualisation to manuallly flatten the data, or go back and do furtherr feature engineering\nA better option, is to use a better algorithim. In the next notebook, we'll try a simple decision tree'\n"