In [1]:
import numpy as np
import pandas as pd
import os
import sys
import scipy
from scipy.linalg import logm
import math
import time
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
os.chdir("C:\\Users\\MoeAn\\Documents\\GitHub\\LinearDiscriminantAnalysis")

In [3]:
forest = pd.read_csv("train.csv")
forest=forest.dropna()
X = forest.drop(["Cover_Type"],axis=1)
Y= forest["Cover_Type"]

In [4]:
class ldaClassifier:
    import numpy as np
    import pandas as pd
    import scipy
    import math
    import time
            
    def __init__(self,x,y):      
        self.X=x
        self.Y=y
        self.x = self.X.reset_index(drop = True)
        self.y = self.Y.reset_index(drop = True)
        self.universalCovMatrixInverse = None
        self.muVectors = []
        self.muTransposeVectors = []
        self.piVector =[]
   

        print("Checking for dependencies....")
        try:
            import numpy as np
            import pandas as pd
            import scipy
            import math
            import time
            print("Dependencies Loaded")
        
        except:
            print("Missing Dependencies!")
            print("All of the following packages must be installed:")
            print("pandas as pd ,numpy as np , scipy, math , time")
    
    def setX(self,x):
        self.X=x
        
    def setY(self,Y):
        self.Y=Y
        
    def getCovMatrix(self,x):
        sigma = x.cov()
        return sigma
    
    def getPiK(self,k,classColumn):
        matchingRowsCount = sum(classColumn==k)
        return matchingRowsCount/len(classColumn)
      
    def getVectorMu(self,x,y,k):
        x = x.reset_index(drop = True)
        y = y.reset_index(drop = True)
        
        classColumnName = y.name
        
        fullData = pd.concat([x,y],axis=1)
   
        vectorOfMeans = []
        for column in x :
            rowsThatMatchK = fullData.loc[fullData[classColumnName]==k]
            ourColumnInRowsThatMatchK = rowsThatMatchK[column]
            vectorOfMeans.append(ourColumnInRowsThatMatchK.mean())
        return np.transpose(np.asarray(vectorOfMeans))

    def getDeltaK(self,row,k,X,Y,sigmaInverse,mu,muTranspose,pi):
           
        Delta = np.transpose(row).dot(sigmaInverse).dot(mu)-\
        0.5*muTranspose.dot(sigmaInverse).dot(muTranspose)+math.log(pi)
        return Delta
    
    def trainLDA(self):
        print("Attempting to load dependencies..")
        try:
            import numpy as np
            import pandas as pd
            import scipy
            import math
            import time
        
        except:
            print("Missing Dependencies!")
            print("All of the following packages must be installed:")
            print("pandas as pd ,numpy as np , scipy, math , time")
        
        print("Dependencies Loaded")
        print("Initializing Classifier...")
        x = self.x
        y = self.y
        
        classes = y.unique()
        print("Generating Inverted Covariance Matrix..")
        self.universalCovMatrixInverse = np.linalg.pinv(self.getCovMatrix(x))
        print("Sigma Generated")
#         predictions = []
#         classes = y.unique()
        
        print("Calculating Class-Specific Mean Vectors..")
        self.muVectors = []
        self.muTransposeVectors = []
        for currentClass in classes:
            thismu = self.getVectorMu(x,y,currentClass)
            thisTransposeMu = np.transpose(thismu)
            print("Mu for class "+str(currentClass)+" is "+str(thismu))
            self.muVectors.append(thismu)
            self.muTransposeVectors.append(thisTransposeMu)
        
        
        print("Calculating pi proportions....")
        self.piVector =[]
        for currentClass in classes:
            thispi = self.getPiK(currentClass,self.Y)
            print("Pi for class "+str(currentClass)+" is "+str(thispi))
            self.piVector.append(thispi)    
    
    
    def predictLDA(self):
        
        x = self.x
        y = self.y 
        print("Classifying...")
        
        predictions = []
        classes = y.unique()
        rowCounter = 0
        
        for row in range(len(x)):
            notificationInterval =int(round(len(x)/10)) 
            if row % notificationInterval == 0:
                print(str(row)+" Out Of "+str(len(x))+" Observations Classified" )
            thisx = np.asarray(x.iloc[row])
            deltasForThisX = []
            deltaCounter = 0
            for delta in classes:                
                deltasForThisX.append(self.getDeltaK(x.iloc[row],delta,x,y,
                                                     self.universalCovMatrixInverse,
                                                     self.muVectors[deltaCounter],
                                                     self.muTransposeVectors[deltaCounter],
                                                     self.piVector[deltaCounter]
                                                    ))
                deltaCounter=deltaCounter+1
                
            predictions.append(classes[np.argmax(deltasForThisX)])
            rowCounter=rowCounter+1
            
        return predictions
    
    def getUniqueClasses(self):
        uniqclasses = [str(i) for i in self.Y.unique]
        return uniqclasses

In [5]:
testlda = ldaClassifier(X,Y)

Checking for dependencies....
Dependencies Loaded


In [6]:
testlda.trainLDA()

Attempting to load dependencies..
Dependencies Loaded
Initializing Classifier...
Generating Inverted Covariance Matrix..
Sigma Generated
Calculating Class-Specific Mean Vectors..
Mu for class 5 is [  6.48680046e+03   2.78680139e+03   1.37992130e+02   1.67245370e+01
   2.08873148e+02   5.08712963e+01   1.32931852e+03   2.23368981e+02
   2.18317130e+02   1.21392593e+02   1.53038889e+03   3.96296296e-01
   0.00000000e+00   6.03703704e-01   0.00000000e+00   0.00000000e+00
   2.82407407e-02   0.00000000e+00   5.97222222e-02   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   2.96296296e-02   7.12962963e-02   0.00000000e+00   1.41203704e-01
   0.00000000e+00   0.00000000e+00   4.16666667e-03   6.06481481e-02
   2.03703704e-02   8.33333333e-03   2.77777778e-03   0.00000000e+00
   0.00000000e+00   7.26851852e-02   5.09259259e-03   0.00000000e+00
   1.29629630e-02   0.00000000e+00   1.38888889e-03   1.17592593e-01
   2.22222222e-01   4.02777778e-02   4.62962

In [7]:
pred3 = testlda.predictLDA()

Classifying...
0 Out Of 15120 Observations Classified
1512 Out Of 15120 Observations Classified
3024 Out Of 15120 Observations Classified
4536 Out Of 15120 Observations Classified
6048 Out Of 15120 Observations Classified
7560 Out Of 15120 Observations Classified
9072 Out Of 15120 Observations Classified
10584 Out Of 15120 Observations Classified
12096 Out Of 15120 Observations Classified
13608 Out Of 15120 Observations Classified


In [11]:
sum(pred3==Y)/len(Y)

0.62936507936507935

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X,Y)
pred9 = clf.predict(X)
sum(pred9==Y)/len(Y)



0.65105820105820111

In [13]:
def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)



In [15]:
# sample usage
save_object(testlda, 'LDATrained.pkl')