In [13]:
# Import necessary libraries
from encodings import search_function
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Import necessary tools from the sklearn library
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from scipy.stats import randint as sp_randint
import warnings
warnings.filterwarnings('ignore')

# LDA

In [14]:
# load datasets function
def load_data(data_file_name, data_dir=None):
    """Load a classification CSV and return scaled features and integer labels.

    Every column except the last is treated as a feature and the last column
    is treated as the class label.

    Args:
        data_file_name (str): file name of the CSV inside ``data_dir``.
        data_dir (str, optional): directory that holds the CSV. Defaults to
            ``../../../data/data_classification`` relative to the current
            working directory.

    Returns:
        data_X (matrix, num_samples*num_features): features standardized with
            ``StandardScaler``.
        data_y (vector, num_samples): integer category codes produced by
            ``pd.Categorical`` for the label column.
    """
    if data_dir is None:
        # Build the default from components rather than a "..\..\.." literal:
        # the backslash form contains invalid escape sequences and only
        # resolves on Windows.
        data_dir = os.path.join(
            os.pardir, os.pardir, os.pardir, "data", "data_classification"
        )
    data_path = os.path.join(data_dir, data_file_name)
    df = pd.read_csv(data_path)

    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]

    # NOTE(review): the scaler is fit on every row of the file, so any split
    # made afterwards shares scaling statistics between train and test.
    scaler_X = StandardScaler()
    data_X = scaler_X.fit_transform(data_X)

    data_y = pd.Categorical(data_y).codes.reshape(-1)
    return data_X, data_y

In [15]:
def main(data_name="abalone_classification", test_size=0.33, random_state=2200):
    """Load a dataset and split it into training and test sets.

    Args:
        data_name (str): dataset name without the ``.csv`` extension; the file
            ``<data_name>.csv`` is read through ``load_data``.
        test_size (float): value forwarded to ``train_test_split`` as
            ``test_size``.
        random_state (int): seed forwarded to ``train_test_split`` so the
            split is reproducible.

    Returns:
        train_X, test_X, train_y, test_y: the four arrays returned by
        ``train_test_split``, in that order.
    """
    # read dataset from csv file
    data_X, data_y = load_data("{}.csv".format(data_name))

    # Randomly assigning a train and test set
    train_X, test_X, train_y, test_y = train_test_split(
        data_X, data_y, test_size=test_size, random_state=random_state
    )
    return train_X, test_X, train_y, test_y


In [16]:
class myLDA(object):
    r'''
    This class is for linear discriminant analysis classification.

    The class contains the parameters of LDA, including the number of classes and the prior probability p(i) of
    each class $i$, where $i=0,1,\ldots,num\_classes-1$. Moreover, the class contains the mean vectors $\mu_i$
    and the shared (pooled) covariance matrix $\Sigma$ of the probability distributions $p(x|i)$.

    It also contains the functions for initializing the class, fitting the LDA classifier model, and using
    the fitted model to calculate the linear discriminant functions $\delta_i(x)$ and decision function $h^*(x)$.

    Labels are expected to be non-negative integers; the predicted label is the index of the
    largest discriminant value.

    Attributes:
        mu (matrix, num_classes*num_features)    : mean vectors of distributions $p(x|i)$. The $i$-th row represents $\mu_i$.
        Sigma (matrix, num_features*num_features): covariance matrix
        num_classes (positive integer)           : the number of classes (largest training label + 1)
        priorProbs (vector, num_classes)         : the prior probability vector and its $i$-th element is $p(i)$

    '''
    def __init__(self):
        '''
        Initialize the class by just assigning zero to all attributes.
        '''
        self.mu = 0
        self.Sigma = 0
        self.num_classes = 0
        self.priorProbs = 0

    def fit(self, X, y):
        r'''
        estimate the prior probabilities, the mean vector of each class and the pooled
        covariance matrix of the LDA model

        Args:
            X (matrix, num_train*num_features): features of training samples
            y (vector, num_train): non-negative integer labels of training samples

        Returns:
            mu (matrix, num_classes*num_features)    : mean vectors of distributions $p(x|i)$. The $i$-th row represents $\mu_i$.
            Sigma (matrix, num_features*num_features): covariance matrix

        Raises:
            ValueError: if X is not 2-D, X and y disagree in length, there are no samples,
                a label is negative, or there are not more samples than observed classes.
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).reshape(-1).astype(int)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (num_samples, num_features).")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("Cannot fit LDA on an empty training set.")
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples.")
        if y.min() < 0:
            raise ValueError("Labels must be non-negative integers.")

        # Label values index rows of mu / entries of priorProbs, so the number of
        # class slots is the largest label + 1 even if some labels never occur.
        num_classes = int(y.max()) + 1
        counts = np.bincount(y, minlength=num_classes)
        observed = np.flatnonzero(counts)

        ### calculate the prior probability $p(i)$
        self.num_classes = num_classes
        self.priorProbs = counts / float(num_samples)

        ### calculate the mean vector $\mu_i$ and accumulate the within-class scatter
        self.mu = np.zeros((num_classes, num_features))
        scatter = np.zeros((num_features, num_features))
        # Iterate over the label values that actually occur. Iterating over
        # range(len(unique labels)) would skip high-valued labels whenever a
        # lower label is missing from the training data.
        for label in observed:
            members = X[y == label]
            self.mu[label] = members.mean(axis=0)
            centered = members - self.mu[label]
            # Equals np.cov(members.T) * (n_i - 1), but stays 2-D for a single
            # feature and is simply zero for a single-sample class.
            scatter += np.dot(centered.T, centered)

        ### calculate the covariance matrix $\Sigma$ (pooled, N - K degrees of freedom)
        dof = num_samples - observed.size
        if dof <= 0:
            raise ValueError(
                "Need more training samples than observed classes to estimate the covariance."
            )
        self.Sigma = scatter / dof

        return self.mu, self.Sigma

    def linear_discriminant_func(self, X):
        r'''
        calculate the linear discriminant functions $\delta_i(X)$

        $\delta_i(x) = x^T \Sigma^{-1} \mu_i - \frac{1}{2} \mu_i^T \Sigma^{-1} \mu_i + \log p(i)$

        Args:
            X (matrix, num_samples*num_features): features of samples

        Returns:
            value (matrix, num_samples*num_classes): the linear discriminant function values.
            The $(j,i)$-th entry of value represents $\delta_i(X[j,:])$, which is the linear discriminant function value for the class $i$ of the sample at row $j$.
            Classes with zero prior probability get $-\infty$.

        Raises:
            RuntimeError: if the model has not been fitted yet.
        '''
        if np.ndim(self.Sigma) != 2:
            raise RuntimeError("myLDA must be fitted before computing discriminant values.")
        X = np.asarray(X, dtype=float)

        ### calculate the (pseudo-)inverse of the covariance matrix $\Sigma$
        # pinv is SVD-based like the explicit inverse, but drops zero singular
        # values instead of failing when Sigma is singular.
        Sigma_inv = np.linalg.pinv(self.Sigma)

        ### calculate the linear discriminant function values of X
        weights = np.dot(Sigma_inv, self.mu.T)                 # num_features * num_classes
        quadratic = 0.5 * np.sum(self.mu.T * weights, axis=0)  # mu_i^T Sigma^{-1} mu_i / 2
        with np.errstate(divide='ignore'):
            # log(0) = -inf is intended: a class never seen in training is never predicted.
            log_prior = np.log(np.asarray(self.priorProbs, dtype=float))
        value = np.dot(X, weights) - quadratic.reshape(1, -1) + log_prior.reshape(1, -1)
        return value

    def predict(self, X):
        '''
        predict labels by taking the class with the largest linear discriminant value

        Args:
            X (matrix, num_samples*num_features): features of samples

        Returns:
            pred_label (vector, num_samples): the predicted labels of samples. The $j$-th entry represents the predicted label of the sample at row $j$.
        '''
        pred_value = self.linear_discriminant_func(X)
        pred_label = np.argmax(pred_value, axis = 1)
        return pred_label

In [17]:
### load and split the data once, so the file is not re-read and re-split for every access
train_X, test_X, train_y, test_y = main()
### initiate the LDA model
model = myLDA()
### fit the model with training data and get the estimation of mu and Sigma
mu, Sigma = model.fit(train_X, train_y)
### predict the label of test data
y_pred = model.predict(test_X)
### calculate the accuracy of the fitted LDA model on test data
accuracy = np.sum(y_pred == test_y)/len(test_y)
print("Accuracy of LDA on the test dataset is {}.".format(accuracy))

Accuracy of LDA on the test dataset is 0.25308194343727336.
