In [176]:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 11 13:25:40 2022

@author: Malhar
Logisitic regression of variables affecting Buy and Sell Outcomes. Rather than make each outcome a catagorical, I chose to run
each regression buy/sell seperately to acheive better fits

"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
#we use the imbalanced learn library to correct imbalanced data
from imblearn.over_sampling import RandomOverSampler


In [177]:

#import the data we cleaned in the previous notebook
df = pd.read_csv("D:\\Malhar\\Anaconda3\\Malhar\\Data\\EUR USD 1H Data Preparation\\EURUSD1H_Cleaned.csv")
#Some Simple metrics on how many True/False values occur for Buy/Sell
print(df['Buy'].value_counts())
print(df['Sell'].value_counts())
print(df.shape)
#Shows Buy Signal 15.72% of the time, Sell Signal 15.54% of the time. We will need to balance the data

False    99088
True     18482
Name: Buy, dtype: int64
False    99299
True     18271
Name: Sell, dtype: int64
(117570, 52)


In [178]:
#Convert the Buy/Sell columns to numbers, since these are not scaled
df['Buy'] = df['Buy'] * 2
df['Sell'] = df['Sell'] * 1
df['y'] = df['Sell'] + df['Buy']
df.drop(columns=['Buy', 'Sell'], inplace = True)
#There are a lot of columns, so making a list to slice the df in the future is quicker and clearer than using [:,-1]
col_x = ['Volume', 'Month', 'Day', 'Hour', 'Change', 'Range', 'MA5', 'MA7', 'MA9', 'MA12', 'MA14', 'MA21', 'MA24', 'MA50', 'MA100', 'MA120', 'MA200', 'Range5', 'Range7', 'Range9', 'Range12', 'Range14', 'Range21', 'Range24', 'Range50', 'Range100', 'Range120', 'Range200', 'HH2', 'LL2', 'HH3', 'LL3', 'HH4', 'LL4', 'HH5', 'LL5', 'HH6', 'LL6', 'HH7', 'LL7', 'HH8', 'LL8', 'HH9', 'LL9', 'HH10', 'LL10', 'HH11', 'LL11', 'HH12', 'LL12']

#x are the independant variables, y is the dependant variable
#b is for a buy outcome, and s for selll outcomes

x = df[col_x]
y = df['y']

In [179]:
#Split test and train data for Buy and Sell using Hold Out method
xtrain, xtest, ytrain, ytest = train_test_split(
		x, y, test_size = 0.25, random_state = 0)

In [180]:
'''
#Scale data and make non numeric data numeric
sc = StandardScaler()
xtrain = sc.fit_transform(xtrain)
xtest = sc.transform(xtest)
xtrain= xtrain.reshape(-1, 1)
ytrain= ytrain.reshape(-1, 1)
xtest = xtest.reshape(-1, 1)
#xtrain =pd.DataFrame(data = xtrain, columns = col_x)
#xtest =pd.DataFrame(data = xtest, columns = col_x)
#print(ytrain['y'])
'''

"\n#Scale data and make non numeric data numeric\nsc = StandardScaler()\nxtrain = sc.fit_transform(xtrain)\nxtest = sc.transform(xtest)\nxtrain= xtrain.reshape(-1, 1)\nytrain= ytrain.reshape(-1, 1)\nxtest = xtest.reshape(-1, 1)\n#xtrain =pd.DataFrame(data = xtrain, columns = col_x)\n#xtest =pd.DataFrame(data = xtest, columns = col_x)\n#print(ytrain['y'])\n"

In [181]:
'''
#Resample to balance, using unbalanced learn, Random Oversampler algorithim [less computationally intensive than SMOTE]
xtrain, ytrain = RandomOverSampler().fit_resample(xtrain, ytrain)
'''

'\n#Resample to balance, using unbalanced learn, Random Oversampler algorithim [less computationally intensive than SMOTE]\nxtrain, ytrain = RandomOverSampler().fit_resample(xtrain, ytrain)\n'

In [182]:
#tn, fp, fn, tp = confusion_matrix(y_actual, y_predict, labels=[0,1]).ravel()
clf = LogisticRegression(max_iter = 1000000, random_state = 0)
for value in col_x:
    clf = LogisticRegression(max_iter = 1000000, random_state = 0)
    clf.fit(xtrain.loc[:,[value]], ytrain)
    y_pred = clf.predict(xtest.loc[:,[value]])
    a,b,c,d,e,f,g,h,i = confusion_matrix(ytest,y_pred, labels=[0,1,2]).ravel()
    try:
        p1 = e/(e+b+h)
    except ZeroDivisionError:
        p1 = 0
    try:
        p2 = f/(f+c+i)
    except ZeroDivisionError:
        p2 = 0
    p = (0.5*p1)+(0.5*p2)
    print(value)
    print(p)
'''
clf = LogisticRegression(max_iter = 1000000, random_state = 0)
Precision_of_variables = {}
Precision_of_variables2 = {}
Precision_of_variables3 = {}
for value in col_x:
    clf = LogisticRegression(max_iter = 1000000, random_state = 0)
    clf.fit(xtrain.loc[:,[value]], ytrain)
    y_pred = clf.predict(xtest.loc[:,[value]])
    Precision_of_variables[value] = float(precision_score(ytest, y_pred, average='macro', zero_division = 1))
    Precision_of_variables2[value] = float(precision_score(ytest, y_pred, average='micro', zero_division = 1))
    Precision_of_variables3[value] = float(precision_score(ytest, y_pred, average='weighted', zero_division = 1))
Sorted_list = pd.DataFrame.from_dict(Precision_of_variables, orient='index', columns =['Precision']).sort_values(by = 'Precision', ascending = 0)
Sorted_list2 = pd.DataFrame.from_dict(Precision_of_variables2, orient='index', columns =['Precision']).sort_values(by = 'Precision', ascending = 0)
Sorted_list3 = pd.DataFrame.from_dict(Precision_of_variables3, orient='index', columns =['Precision']).sort_values(by = 'Precision', ascending = 0)
print(Sorted_list.head(10))
print(Sorted_list2.head(10))
print(Sorted_list3.head(10))
'''

  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


Volume
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


Month
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


Day
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


Hour
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


Change
nan
Range
0.299652118912081


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA5
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA7
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA9
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA12
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA14
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA21
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA24
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA50
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA100
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA120
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


MA200
nan


  p2 = f/(f+c+i)


Range5
nan


  p2 = f/(f+c+i)


Range7
nan


  p2 = f/(f+c+i)


Range9
nan


  p2 = f/(f+c+i)


Range12
nan


  p2 = f/(f+c+i)


Range14
nan


  p2 = f/(f+c+i)


Range21
nan
Range24
0.33933393339333934
Range50
0.4985315712187959
Range100
0.3533089457931493


  p2 = f/(f+c+i)


Range120
nan


  p2 = f/(f+c+i)


Range200
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH2
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL2
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH3
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL3
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH4
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL4
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH5
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL5
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH6
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL6
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH7
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL7
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH8
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL8
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH9
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL9
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH10
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL10
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH11
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


LL11
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


HH12
nan
LL12
nan


  p1 = e/(e+b+h)
  p2 = f/(f+c+i)


"\nclf = LogisticRegression(max_iter = 1000000, random_state = 0)\nPrecision_of_variables = {}\nPrecision_of_variables2 = {}\nPrecision_of_variables3 = {}\nfor value in col_x:\n    clf = LogisticRegression(max_iter = 1000000, random_state = 0)\n    clf.fit(xtrain.loc[:,[value]], ytrain)\n    y_pred = clf.predict(xtest.loc[:,[value]])\n    Precision_of_variables[value] = float(precision_score(ytest, y_pred, average='macro', zero_division = 1))\n    Precision_of_variables2[value] = float(precision_score(ytest, y_pred, average='micro', zero_division = 1))\n    Precision_of_variables3[value] = float(precision_score(ytest, y_pred, average='weighted', zero_division = 1))\nSorted_list = pd.DataFrame.from_dict(Precision_of_variables, orient='index', columns =['Precision']).sort_values(by = 'Precision', ascending = 0)\nSorted_list2 = pd.DataFrame.from_dict(Precision_of_variables2, orient='index', columns =['Precision']).sort_values(by = 'Precision', ascending = 0)\nSorted_list3 = pd.DataFrame.f

In [183]:
print(int(0/0))

ZeroDivisionError: division by zero

In [124]:
print('Buy accuracy is ' + str(balanced_accuracy_score(ybtest, yb_pred)))
print('Buy report:')
print(classification_report(ybtest, yb_pred))

'''
So the precision is about 26-27% in both cases, and with cross validation with Kfolds rather than
HOCV, this is likely to get worse. It still certainly better than 50-50 [the 15-16% threshold noted above], but
we definately want this to be much better. We could try to scatterchart the dependent variables
and use simple visualisation to manuallly flatten the data by keeping only those with appropriate distributions, 
or we could go back and do further feature engineering, or optimise the regressor based on balancing method.
The best option, by conventional wisdom, is to use a better algorithim. In the next notebook, we'll try a simple decision tree'
'''

NameError: name 'ybtest' is not defined