# Linear Discriminant Analysis

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.95,3.95,3.98,3.84,3.85,20548400
2014-01-03,4.0,4.0,4.0,3.88,3.98,22887200
2014-01-06,4.13,4.13,4.18,3.99,4.01,42398300
2014-01-07,4.18,4.18,4.25,4.11,4.19,42932100
2014-01-08,4.18,4.18,4.26,4.14,4.23,30678700


In [3]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [4]:
# Define X
X = np.array(dataset[['Open', 'High', 'Low', 'Adj Close', 'Volume']])
X[0:5]

array([[3.98000002e+00, 4.00000000e+00, 3.88000011e+00, 4.00000000e+00,
        2.28872000e+07],
       [4.01000023e+00, 4.17999983e+00, 3.99000001e+00, 4.13000011e+00,
        4.23983000e+07],
       [4.19000006e+00, 4.25000000e+00, 4.11000013e+00, 4.17999983e+00,
        4.29321000e+07],
       [4.23000002e+00, 4.26000023e+00, 4.13999987e+00, 4.17999983e+00,
        3.06787000e+07],
       [4.19999981e+00, 4.23000002e+00, 4.05000019e+00, 4.09000015e+00,
        3.06676000e+07]])

In [5]:
# Define y
y = np.array(dataset['Buy_Sell'])
y[0:5]

array([1, 1, 0, 0, 1])

In [6]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-0.62250216, -0.63550706, -0.62613809, -0.61792054, -0.43967264],
       [-0.61632107, -0.59915373, -0.60299004, -0.59116896,  0.14205802],
       [-0.5792348 , -0.58501628, -0.57773758, -0.58087996,  0.15797346],
       [-0.57099341, -0.58299661, -0.57142453, -0.58087996, -0.20736619],
       [-0.5771745 , -0.58905554, -0.59036379, -0.5994002 , -0.20769714]])

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (877, 5) (877,)
Test set: (293, 5) (293,)


In [8]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [9]:
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [10]:
LDA.coef_

array([[ 3.91181596, -0.81842302, -1.67178807, -1.27620095, -0.06718894]])

In [11]:
y_pred = LDA.predict(X_test)

In [12]:
from sklearn.metrics import mean_squared_error
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

The rmse of prediction is: 0.6961620775546855


In [13]:
LDA.score(X_test, y_test)

0.515358361774744