# 4.6.3 Linear Discriminant Analysis

Load modules and data

In [42]:
# %load dss_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors

from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
%matplotlib inline
# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in newer
# matplotlib releases and the old names were later removed; use whichever
# name this installation actually provides.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


In [43]:
# Load the Smarket data, skipping the first CSV column (position 0) and
# keeping the columns at positions 1 through 9.
Smarket = pd.read_csv(
    'DataSets/Smarket.csv',
    usecols=range(1, 10),
    parse_dates=True,
)

In this lab we will perform LDA on the Smarket data. In Python, we can fit an LDA model using the LinearDiscriminantAnalysis class, which is part of the sklearn library (module `sklearn.discriminant_analysis`). First we fit the model using only the observations before 2005.

In [44]:
# Select the training period (observations before 2005) with a boolean mask,
# so the split does not rely on the rows being sorted by Year.
is_train = Smarket.Year < 2005
x_train = Smarket.loc[is_train, ['Lag1', 'Lag2']]
y_train = Smarket.loc[is_train, 'Direction']

# Fit LDA on the training observations; the trailing semicolon suppresses
# the estimator repr in the cell output.
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(x_train, y_train);


## Prior probabilities of groups:

In [45]:
# Entries 0 and 1 of lda.priors_ are reported as "Down" and "Up"
# (assumes the fitted class order is Down, Up -- verify against lda.classes_).
prior_down, prior_up = lda.priors_[0], lda.priors_[1]
print("Down: %f" % prior_down)
print("Up: %f" % prior_up)

Down: 0.491984
Up: 0.508016


The LDA output indicates prior probabilities of ${\hat{\pi}}_1 = 0.492$ and ${\hat{\pi}}_2 = 0.508$; in other words, 49.2% of the training observations correspond to days during which the market went down.

## Group means:

In [46]:
# Tabulate the fitted class means: one row per class, one column per predictor.
pd.DataFrame(
    data=lda.means_,
    index=['Down', 'Up'],
    columns=['Lag1', 'Lag2'],
)

Unnamed: 0,Lag1,Lag2
Down,0.04279,0.033894
Up,-0.039546,-0.031325


The group means provide the average of each predictor within each class, and are used by LDA as estimates of $\mu_k$. These suggest that there is a tendency for the previous two days’ returns to be negative on days when the market increases, and a tendency for the previous days’ returns to be positive on days when the market declines.

## Coefficients of linear discriminants:

In [47]:


# Tabulate the discriminant scalings: one row per predictor, single column 'LD'.
pd.DataFrame(
    data=lda.scalings_,
    index=['Lag1', 'Lag2'],
    columns=['LD'],
)

Unnamed: 0,LD
Lag1,-0.642019
Lag2,-0.513529


The coefficients of linear discriminants output provides the linear combination of Lag1 and Lag2 that is used to form the LDA decision rule. If $-0.642 \cdot Lag1 - 0.514 \cdot Lag2$ is large, then the LDA classifier will predict a market increase, and if it is small, then the LDA classifier will predict a market decline. Note: the values shown above come from `lda.scalings_`; scikit-learn's `lda.coef_` holds a differently scaled set of weights (approximately $-0.0554$ and $-0.0443$), and those differ from the coefficients produced by R.

The predict() function returns an array of LDA’s predictions about the movement of the market on the training data:

In [48]:
# Predict a class label for every training observation with the fitted model.
y_pred = lda.predict(x_train)

In [49]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
pd.DataFrame(
    confusion_matrix(y_train, y_pred).transpose(),
    index=['Down', 'Up'],
    columns=['Down', 'Up'],
)

Unnamed: 0,Down,Up
Down,168,160
Up,323,347


In [51]:
from collections import defaultdict

def report2dict(cr):
    """Convert the text of a classification report into a nested dictionary.

    Parameters
    ----------
    cr : str
        Report text whose first non-empty line lists the measure names and
        whose remaining non-empty lines each hold a row label followed by
        numeric values. Cells must be separated by at least two spaces.

    Returns
    -------
    collections.defaultdict
        Maps each row label to a ``{measure: float}`` dictionary. Empty when
        ``cr`` contains no non-empty lines.

    Raises
    ------
    ValueError
        If a value cell cannot be converted to ``float``.
    """
    # Parse rows: cells are separated by runs of two or more spaces, so
    # labels containing single spaces (e.g. "avg / total") stay intact.
    # Every cell is stripped so that an odd number of padding spaces does
    # not leak into labels or measure names.
    rows = []
    for line in cr.split("\n"):
        cells = [cell.strip() for cell in line.split("  ")]
        cells = [cell for cell in cells if cell]
        if cells:
            rows.append(cells)

    # Store in dictionary
    D_class_data = defaultdict(dict)
    if not rows:
        return D_class_data

    measures = rows[0]
    for row in rows[1:]:
        class_label, values = row[0], row[1:]
        if not values:
            # A label with no numbers carries no data to record.
            continue
        # Rows with fewer values than measures (e.g. an "accuracy" summary
        # row) are right-aligned in the report, so pair the values with the
        # trailing measure names instead of indexing past the end of the row.
        aligned_measures = measures[-len(values):]
        for m, v in zip(aligned_measures, values):
            D_class_data[class_label][m] = float(v)
    return D_class_data



# Text report of the per-class metrics on the training data, 3 decimals.
classificationReport = classification_report(y_train, y_pred, digits=3)

# Parse the text report and display it with one row per report line.
pd.DataFrame(report2dict(classificationReport)).transpose()

Unnamed: 0,f1-score,precision,recall,support
Down,0.41,0.512,0.342,491.0
Up,0.59,0.518,0.684,507.0
avg / total,0.501,0.515,0.516,998.0


The predict() function returns a list of LDA’s predictions about the movement of the market on the test data:

In [52]:
# Select the held-out period with a boolean mask, so the split does not
# rely on the rows being sorted by Year.
is_test = Smarket.Year >= 2005
x_test = Smarket.loc[is_test, ['Lag1', 'Lag2']]  # Data from 2005
y_test = Smarket.loc[is_test, 'Direction']  # Data from 2005
# Predict a class label for every held-out observation.
y_pred = lda.predict(x_test)


The model assigned 70 observations to the “Down” class, and 182 observations to the “Up” class. Let’s check out the confusion matrix to see how this model is doing. We’ll want to compare the predicted class to the true class.

In [53]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
pd.DataFrame(
    confusion_matrix(y_test, y_pred).transpose(),
    index=['Down', 'Up'],
    columns=['Down', 'Up'],
)

Unnamed: 0,Down,Up
Down,35,35
Up,76,106


Comparing with Section 4.6.2, the LDA and logistic regression predictions are almost identical.

The classification report is shown in the following table:

In [54]:
# Text report of the per-class metrics on the 2005 test data, 3 decimals.
classificationReport = classification_report(
    y_test,
    y_pred,
    digits=3,
)

The F1 score is the harmonic mean of precision and recall; it reaches its best value at 1 and its worst value at 0. Precision and recall contribute equally to the F1 score. The formula for the F1 score is:
$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

In [10]:
# Parse the text report and display it with one row per report line.
pd.DataFrame(report2dict(classificationReport)).transpose()

Unnamed: 0,f1-score,precision,recall,support
Down,0.387,0.5,0.315,111.0
Up,0.656,0.582,0.752,141.0
avg / total,0.538,0.546,0.56,252.0


Applying a 50% threshold to the posterior probabilities allows us to recreate the predictions

In [11]:
# Posterior class probabilities for every test observation; column 0 is
# used below as the probability of a market decrease.
pred_p = lda.predict_proba(x_test)


In [12]:
# Count the test days on each side of the 50% threshold on column 0.
print((pred_p[:, 0] >= 0.5).sum())
print((pred_p[:, 0] < 0.5).sum())

70
182


Notice that the posterior probability output by the model corresponds to the probability that the market will $\underline{decrease}$

In [13]:
# Show the column-0 posterior for observations 10-19, labelled by the
# predicted class, laid out as a single wide row.
pd.DataFrame(pred_p[10:20, 0], index=y_pred[10:20]).transpose()

Unnamed: 0,Up,Down,Up.1,Up.2,Up.3,Up.4,Up.5,Down.1,Up.6,Up.7
0,0.490696,0.511999,0.489515,0.470676,0.474459,0.479958,0.493578,0.503089,0.497881,0.488633


If we wanted to use a posterior probability threshold other than 50% in order to make predictions, then we could easily do so. For instance, suppose that we wish to predict a market decrease only if we are very certain that the market will indeed decrease on that day—say, if the posterior probability is at least 90%:

In [14]:
# Count the test days whose column-0 posterior exceeds 90%.
print((pred_p[:, 0] > 0.9).sum())

0


No days in 2005 meet that threshold! In fact, the greatest posterior probability of decrease in all of 2005 was 52.02%:

In [15]:
# Largest column-0 posterior over the test period.
pred_p[:, 0].max()

0.52023495053561553

# Prediction with lag1, lag2 and Volume


In [16]:
# Select the training period (observations before 2005) with a boolean mask,
# so the split does not rely on the rows being sorted by Year.
is_train = Smarket.Year < 2005
x_train = Smarket.loc[is_train, ['Lag1', 'Lag2', 'Volume']]
y_train = Smarket.loc[is_train, 'Direction']

# Refit LDA with Volume added as a third predictor; the trailing semicolon
# suppresses the estimator repr in the cell output.
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(x_train, y_train);

## Group means:

In [17]:
# Tabulate the fitted class means: one row per class, one column per predictor.
pd.DataFrame(
    data=lda.means_,
    index=['Down', 'Up'],
    columns=['Lag1', 'Lag2', 'Volume'],
)

Unnamed: 0,Lag1,Lag2,Volume
Down,0.04279,0.033894,1.371843
Up,-0.039546,-0.031325,1.36321


## Coefficients of linear discriminants:

In [18]:
# Tabulate the discriminant scalings: one row per predictor, single column 'LD'.
pd.DataFrame(
    data=lda.scalings_,
    index=['Lag1', 'Lag2', 'Volume'],
    columns=['LD'],
)

Unnamed: 0,LD
Lag1,-0.586882
Lag2,-0.498738
Volume,-1.301667


The predict() function returns a list of LDA’s predictions about the movement of the market on the test data:

In [19]:
# Select the held-out period with a boolean mask, so the split does not
# rely on the rows being sorted by Year.
is_test = Smarket.Year >= 2005
x_test = Smarket.loc[is_test, ['Lag1', 'Lag2', 'Volume']]  # Data from 2005
y_test = Smarket.loc[is_test, 'Direction']  # Data from 2005
# Predict a class label for every held-out observation.
predict = lda.predict(x_test)


In [20]:
# Transposed confusion matrix: rows are predicted classes, columns are true classes.
pd.DataFrame(
    confusion_matrix(y_test, predict).transpose(),
    index=['Down', 'Up'],
    columns=['Down', 'Up'],
)

Unnamed: 0,Down,Up
Down,79,100
Up,32,41


In [21]:
# Text report of the per-class metrics on the 2005 test data, 3 decimals.
classificationReport = classification_report(y_test, predict, digits=3)
# Parse the text report and display it with one row per report line.
pd.DataFrame(report2dict(classificationReport)).transpose()

Unnamed: 0,f1-score,precision,recall,support
Down,0.545,0.441,0.712,111.0
Up,0.383,0.562,0.291,141.0
avg / total,0.454,0.509,0.476,252.0


# Prediction with lag1, lag2 and lag3

In [22]:
# Select the training period (observations before 2005) with a boolean mask,
# so the split does not rely on the rows being sorted by Year.
is_train = Smarket.Year < 2005
x_train = Smarket.loc[is_train, ['Lag1', 'Lag2', 'Lag3']]
y_train = Smarket.loc[is_train, 'Direction']

# Refit LDA with Lag3 as the third predictor; the trailing semicolon
# suppresses the estimator repr in the cell output.
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(x_train, y_train);

## Group means:

In [23]:
# Tabulate the fitted class means. The model in this section was fitted on
# Lag1, Lag2 and Lag3, so the third column is labelled 'Lag3' (not 'Volume').
pd.DataFrame(lda.means_, index=['Down', 'Up'], columns=['Lag1', 'Lag2', 'Lag3'])

Unnamed: 0,Lag1,Lag2,Volume
Down,0.04279,0.033894,-0.009807
Up,-0.039546,-0.031325,0.005834


## Coefficients of linear discriminants:

In [24]:
# Tabulate lda.coef_ as a single row 'LD' with one column per predictor.
pd.DataFrame(
    data=lda.coef_,
    index=['LD'],
    columns=['Lag1', 'Lag2', 'Lag3'],
)

Unnamed: 0,Lag1,Lag2,Lag3
LD,-0.055345,-0.044161,0.0088


In [25]:
# Select the held-out period with a boolean mask, so the split does not
# rely on the rows being sorted by Year.
is_test = Smarket.Year >= 2005
x_test = Smarket.loc[is_test, ['Lag1', 'Lag2', 'Lag3']]  # Data from 2005
y_test = Smarket.loc[is_test, 'Direction']  # Data from 2005
# Predict a class label for every held-out observation.
predict = lda.predict(x_test)

In [26]:
# Tabulate the discriminant scalings: one row per predictor, single column 'LD'.
pd.DataFrame(
    data=lda.scalings_,
    index=['Lag1', 'Lag2', 'Lag3'],
    columns=['LD'],
)

Unnamed: 0,LD
Lag1,-0.635907
Lag2,-0.507403
Lag3,0.101117


In [27]:
# Text report of the per-class metrics on the 2005 test data, 3 decimals.
classificationReport = classification_report(y_test, predict, digits=3)
# Parse the text report and display it with one row per report line.
pd.DataFrame(report2dict(classificationReport)).transpose()

Unnamed: 0,f1-score,precision,recall,support
Down,0.422,0.551,0.342,111.0
Up,0.679,0.601,0.78,141.0
avg / total,0.566,0.579,0.587,252.0
