In [107]:
#-------------Imports--------------
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import scipy.optimize as optimize
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
import warnings
import math

In [46]:
# ignore some warnings about some scipy function getting changed in the next update
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
#-----------Load Dataset-----------

path_train = "fashion_train.npy"
path_test = "fashion_test.npy"

train = np.load(path_train)
test = np.load(path_test)

#Split the training and test data into features and labels
X_train = train[:,:784]
y_train = train[:,784]

X_test = test[:,:784]
y_test = test[:,784]

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(10000, 784)
(10000,)
(5000, 784)
(5000,)


## LDA

(will use first 2 linear discriminant variables as features with some of the classifiers)

In [3]:
def get_LDA_components(X,y):
    
    def get_Sw(X, y):
    
        N = X.shape[1] #number of features
        S_w = np.zeros((N,N))
        class_labels = np.unique(y)
        c = class_labels.shape[0] #number of classes

        #calculate scatter matrix for each class
        for class_ in range(c):

            S_i = np.zeros((N,N))
            class_subset = X[y == class_] #get rows which are a part of the current class
            mean_vector = (np.mean(class_subset, axis=0)).reshape(N, 1) #vector m_i containing
            #means of all features in class i

            for row_idx in range(class_subset.shape[0]):

                x = (class_subset[row_idx, :]).reshape(N, 1)
                S_i += (np.dot((x - mean_vector), np.transpose(x - mean_vector))) #apply formula for within class scatter matrix

            S_w += S_i

        return S_w
    #--------------Compute Between Class Scatter Matrix---------------
    def get_Sb(X, y):
    
        N = X.shape[1] #number of features
        m = (np.mean(X, axis=0)).reshape(N,1) #overall mean
        S_b = np.zeros((N,N))
        class_labels = np.unique(y)
        c = class_labels.shape[0] #number of classes

        for class_ in range(c):

            class_subset = X[y == class_]
            n_rows = class_subset.shape[0] #get number of rows which are a part of the current class
            mean_vector = (np.mean(class_subset, axis=0)).reshape(N, 1) #vector m_i containing
            #means of all features in class i
            S_b += n_rows * ((mean_vector - m).dot((mean_vector - m).T)) #apply formula for between class scatter matrix

        return S_b
    
    def get_linear_discriminants(S_w, S_b):
    
        # calculate the eigenvectors and eigenvalues of the matrix ((S_w)^-1)(S_b)
        eig_vals, eig_vecs = np.linalg.eig((np.linalg.inv(S_w)).dot(S_b))

        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))] #create a list of corresponding
        #eigenvectors and eigenvalues
        eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
        #sort the list by the eigenvalues in decreasing order

        return eig_pairs
    
    N = X.shape[1] #get number of features
    S_w = get_Sw(X, y) #get within class scatter matrix
    S_b = get_Sb(X, y) #get between class scatter matrix
    
    sorted_eigenvecs = get_linear_discriminants(S_w, S_b) #get linear discriminants sorted by
    #variance explained in descending order (most descriptive first)
    
    #get first 2 linear discriminants
    W = np.hstack((sorted_eigenvecs[0][1].reshape(N,1), sorted_eigenvecs[1][1].reshape(N,1)))
    
    #transform the samples onto the new subspace
    #transformed = X.dot(W)
    
    return W

In [4]:
W = get_LDA_components(X_train,y_train)

In [5]:
X_lda = X_train.dot(W)

In [10]:
X_lda = X_lda.real

In [55]:
scaler = StandardScaler()
X_lda_std = scaler.fit_transform(X_lda)

## KNN

### Train and predict using different hyperparameters

5 neighbours

In [47]:
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_lda, y_train)

KNeighborsClassifier()

In [48]:
pred_knn = neigh.predict(X_lda)

3 neighbours

In [49]:
neigh3 = KNeighborsClassifier(n_neighbors=3)
neigh3.fit(X_lda, y_train)

KNeighborsClassifier(n_neighbors=3)

In [50]:
pred_3n = neigh3.predict(X_lda)

### Try the same for the standardized features

In [57]:
neigh5_std = KNeighborsClassifier(n_neighbors=5)
neigh5_std.fit(X_lda_std, y_train)
pred5_std = neigh5_std.predict(X_lda_std)

In [58]:
neigh3_std = KNeighborsClassifier(n_neighbors=3)
neigh3_std.fit(X_lda_std, y_train)
pred3_std = neigh3_std.predict(X_lda_std)

### Try to find optimal parameters using grid search

In [51]:
clf = KNeighborsClassifier(n_neighbors = 5)

param_grid = {'n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_lda, y_train)

print("Best k value:", grid_search.best_params_['n_neighbors'])
print("Best Score:", grid_search.best_score_)

Best k value: 16
Best Score: 0.7734


### Training accuracy

5 neighbors

In [16]:
train_acc = accuracy_score(pred_knn, y_train)
train_acc

0.8147

3 neighbours

In [28]:
train_acc3 = accuracy_score(pred_3n, y_train)
train_acc3

0.8349

16 neighbors (suggested by grid search)

In [52]:
best_neigh = KNeighborsClassifier(n_neighbors=16)
best_neigh.fit(X_lda, y_train)

KNeighborsClassifier(n_neighbors=16)

In [53]:
pred_ = best_neigh.predict(X_lda)

In [54]:
accuracy_score(pred_, y_train)

0.7919

### Same for standardized features

Very slightly better with the standardized features

In [59]:
train_acc3_std = accuracy_score(pred3_std, y_train)
train_acc3_std

0.8364

In [60]:
train_acc5_std = accuracy_score(pred5_std, y_train)
train_acc5_std

0.8169

## Test accuracy for chosen model - 5 neighbours

(with standardized features)

In [42]:
X_lda_test = X_test.dot(W)
X_lda_test = X_lda_test.real

In [61]:
scaler2 = StandardScaler()
X_lda_test_std = scaler2.fit_transform(X_lda_test)

In [64]:
pred_test = neigh5_std.predict(X_lda_test_std)

In [65]:
accuracy_score(pred_test, y_test)

0.7126

## SVM

First standardize the features, then fit the SVM

RBF kernel

In [84]:
svm_1 = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm_1.fit(X_lda, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [85]:
pred_svm_1 = svm_1.predict(X_lda)

In [86]:
accuracy_score(pred_svm_1, y_train)

0.7803

Polynomial kernel

In [90]:
svm_2 = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel = 'poly', degree = 3))
svm_2.fit(X_lda, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', kernel='poly'))])

In [91]:
pred_svm_2 = svm_2.predict(X_lda)

In [92]:
accuracy_score(pred_svm_2, y_train)

0.7458

### Try to find optimal parameters using grid search

In [95]:
clf_svm = SVC(gamma='auto')

param_grid_svm = {'gamma': ['auto', 'scale'], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
grid_search_svm = GridSearchCV(clf_svm, param_grid_svm, cv=5, scoring="accuracy")
grid_search_svm.fit(X_lda_std, y_train)

print("Best gamma value:", grid_search_svm.best_params_['gamma'])
print("Best kernel value:", grid_search_svm.best_params_['kernel'])
print("Best Score:", grid_search_svm.best_score_)

Best gamma value: auto
Best kernel value: linear
Best Score: 0.7809999999999999


### Training accuracy with parameters suggested by grid search

In [96]:
best_svm = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel = 'linear'))
best_svm.fit(X_lda, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', kernel='linear'))])

In [97]:
best_svm_pred = best_svm.predict(X_lda) 

In [98]:
accuracy_score(best_svm_pred, y_train)

0.7812

## Test accuracy for chosen model - SVM with rbf kernel

(and feature scaling)

In [93]:
pred_svm_test = svm_1.predict(X_lda_test)

In [94]:
accuracy_score(pred_svm_test, y_test)

0.731

## Naive Bayes from sklearn

### Gaussian Naive Bayes

In [102]:
gaussian_nb_1 = GaussianNB()
gaussian_nb_1.fit(X_lda, y_train)

GaussianNB()

### Training accuracy

In [103]:
pred_gnb = gaussian_nb_1.predict(X_lda)

In [104]:
accuracy_score(pred_gnb, y_train)

0.7722

### Test accuracy

In [105]:
pred_gnb_test = gaussian_nb_1.predict(X_lda_test)

In [106]:
accuracy_score(pred_gnb_test, y_test)

0.7238