In [None]:
import h5py
import numpy as np
import os
# Quick sanity check: show which files are present in the training folder.
training_files = os.listdir("./Input/train")
print(training_files)

['labels_training.h5', 'images_training.h5']


In [None]:
def _read_h5_array(path, key):
    """Open the HDF5 file at `path` read-only and return a NumPy copy of dataset `key`."""
    with h5py.File(path, 'r') as handle:
        return np.copy(handle[key])


# Training images and their labels.
data_train = _read_h5_array('./Input/train/images_training.h5', 'datatrain')
label_train = _read_h5_array('./Input/train/labels_training.h5', 'labeltrain')
# Test images and the test labels; the test files use the 'datatest' and
# 'labeltest' dataset keys.
data_test = _read_h5_array('./Input/test/images_testing.h5', 'datatest')
label_test = _read_h5_array('./Input/test/labels_testing_2000.h5', 'labeltest')

# Report the array shapes so a mismatch between images and labels is visible.
print(data_train.shape, label_train.shape)
print(data_test.shape, label_test.shape)


(30000, 784) (30000,)
(5000, 784) (2000,)


PCA MODEL

In [None]:
%%time
#STANDARDIZE THE TRAINING AND TESTING DATA
# Every feature is centred on its training-set mean and divided by a single scalar:
# np.std without an axis is the standard deviation over all training values.
# The training statistics are reused for the test data so both share one transform.
train_mean = np.mean(data_train, axis=0)
train_std = np.std(data_train)
data_trainx = (data_train - train_mean) / train_std
data_testx = (data_test - train_mean) / train_std
#DIMENSION REDUCTION OF TRAINING DATA USING SINGULAR VALUE DECOMPOSITION
U, s, Vt = np.linalg.svd(data_trainx, full_matrices=False)
# Diagonal matrix of the singular values; not used by the projection below.
S = np.diag(s)
#CUMULATIVE FRACTION OF VARIANCE EXPLAINED BY THE LEADING COMPONENTS
var_explained = np.cumsum(s**2/np.sum(s**2))
#NUMBER OF COMPONENTS THAT EXPLAIN MORE THAN 95% OF THE VARIANCE
# np.argmax returns the zero-based index of the first entry above the threshold,
# so the number of components to keep is that index plus one.
n_components = int(np.argmax(var_explained > 0.95)) + 1
#KEEP ONLY THE LEADING ROWS OF Vt (THE PRINCIPAL DIRECTIONS)
Vt = Vt[:n_components,:]
#TRAINING DATA TRANSFORMATION
data_train_transform = data_trainx.dot(Vt.T)
print("The dimensions of transformed data_train are: ",data_train_transform.shape)
#TESTING DATA TRANSFORMATION
data_test_transform = data_testx.dot(Vt.T)
print("The dimensions of transformed data_test are: ",data_test_transform.shape)

The dimensions of transformed data_train are:  (30000, 187)
The dimensions of transformed data_test are:  (5000, 187)
CPU times: user 11.9 s, sys: 1.19 s, total: 13.1 s
Wall time: 7.07 s


DIVIDE TRANSFORMED DATA INTO TRAINING AND VALIDATION DATA

In [None]:
%%time
#SPLIT THE TRANSFORMED TRAINING DATA 80:20 INTO TRAINING AND VALIDATION PARTITIONS
# The split is positional: the first 80% of the rows are used for training and
# the remaining rows for validation.
n_comp = int(0.8 * data_train_transform.shape[0])
train_data = data_train_transform[:n_comp]
validate_data = data_train_transform[n_comp:]
train_label = label_train[:n_comp]
validate_label = label_train[n_comp:]
#KEEP THE FIRST 2000 ROWS OF THE TEST DATA TO CHECK PERFORMANCE ON TEST DATA
data_try = data_test_transform[:2000, :]

for caption, array in (
    ('The size of training data is : ', train_data),
    ('The size of validation data is : ', validate_data),
    ('The size of training labels is : ', train_label),
    ('The size of validation labels is :', validate_label),
    ('The size of test data subset is :', data_try),
):
    print(caption, array.shape)


The size of training data is :  (24000, 187)
The size of validation data is :  (6000, 187)
The size of training labels is :  (24000,)
The size of validation labels is : (6000,)
The size of test data subset is : (2000, 187)
CPU times: user 2.33 ms, sys: 0 ns, total: 2.33 ms
Wall time: 7.95 ms


KNN CLASSIFIER

In [None]:
%%time
class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance."""

    def __init__(self, k):
        # Number of nearest training samples that vote on each prediction.
        self.k = k

    def train(self, data_train, data_label):
        """Store the training samples and labels; no model fitting takes place."""
        self.X = data_train
        self.y = data_label

    def compute_distance(self, X_test):
        """Return the Euclidean distance from every test row to every training row.

        The result has shape (number of test rows, number of training rows).
        """
        distances = np.zeros((X_test.shape[0], self.X.shape[0]))
        for row in range(X_test.shape[0]):
            # Broadcasting subtracts the single test row from all training rows.
            offsets = self.X - X_test[row, :]
            distances[row, :] = np.sqrt(np.sum(offsets ** 2, axis=1))
        return distances

    def predict(self, X_test):
        """Predict a label for each row of `X_test` from its k nearest neighbours."""
        return self.predict_labels(self.compute_distance(X_test))

    def predict_labels(self, distances):
        """Turn a distance matrix into predictions by majority vote.

        For each row, the labels of the `k` smallest distances are counted with
        np.bincount (so labels must be non-negative integers) and the most
        frequent one wins; np.argmax resolves ties in favour of the smallest
        label. The result is a float array because it is created by np.zeros.
        """
        n_queries = distances.shape[0]
        y_pred = np.zeros(n_queries)
        for row in range(n_queries):
            nearest = np.argsort(distances[row, :])[:self.k]
            votes = np.bincount(self.y[nearest])
            y_pred[row] = np.argmax(votes)
        return y_pred

#RUN THE KNN CLASSIFIER
if __name__ == "__main__" :
    # Hand the split arrays to the classifier directly. Rebinding the global
    # `data_train` here would overwrite the raw training images, so the PCA cell
    # could no longer be re-run after this cell.
    #SET K NEAREST NEIGHBOR VALUE
    k_nearest_neighbors = KNN(k = 6)
    #TRAIN THE KNN CLASSIFIER
    k_nearest_neighbors.train(train_data, train_label)
    
    

CPU times: user 960 µs, sys: 121 µs, total: 1.08 ms
Wall time: 547 µs



VALIDATE THE KNN CLASSIFIER USING THE VALIDATE DATA


In [None]:
%%time
#SCORE THE KNN CLASSIFIER ON THE VALIDATION PARTITION
# Accuracy is the number of predictions equal to the true label divided by the
# number of validation labels.
prediction = k_nearest_neighbors.predict(validate_data)
print("The accuracy for k = 6 is ", sum(prediction == validate_label) / len(validate_label))

The accuracy for k = 6 is  0.86
CPU times: user 57.6 s, sys: 590 ms, total: 58.2 s
Wall time: 58.4 s


TEST THE KNN CLASSIFIER ON THE FIRST 2000 ROWS OF THE TEST DATASET

In [None]:
%%time
#SCORE THE KNN CLASSIFIER ON THE FIRST 2000 ROWS OF THE TEST DATASET
# data_try holds those rows; label_test supplies the labels they are compared with.
y_pred = k_nearest_neighbors.predict(data_try)
print("The accuracy for k = 6 is" , sum(y_pred == label_test) / len(label_test))

The accuracy for k = 6 is 0.843
CPU times: user 18.8 s, sys: 101 ms, total: 18.9 s
Wall time: 18.9 s


PREDICT THE LABELS OF TRANSFORMED TEST DATA USING THE KNN CLASSIFIER 

In [None]:
%%time
#LABEL EVERY ROW OF THE TRANSFORMED TEST DATA WITH THE KNN CLASSIFIER
predictions_knn = k_nearest_neighbors.predict(data_test_transform)
print("The shape of the knn predictions dataset is", np.shape(predictions_knn))
# Accuracy on the full test set can be printed once its labels are available
# (`labels_of_full_dataset` is not defined in this notebook):
#print("The accuracy of knn classifier on the test data is" , sum(predictions_knn== labels_of_full_dataset)/labels_of_full_dataset.shape[0])


The shape of the knn predictions dataset is (5000,)
CPU times: user 46.1 s, sys: 209 ms, total: 46.3 s
Wall time: 46.4 s


NAIVE BAYES CLASSIFIER

In [None]:
%%time
class NaiveBayes:
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and variance are estimated from the training rows of that class.
    """

    #FIT THE DATA INTO THE ALGORITHM
    def train(self, X, y):
        """Estimate class priors and per-class feature means and variances.

        X is a 2-D array with one sample per row; y holds one label per row.
        Statistics are stored by position in `self._class` (the sorted unique
        labels), so labels do not have to be the integers 0..n_class-1.
        """
        self._class = np.unique(y)
        n_samp, n_feat = X.shape
        n_class = len(self._class)
        #INITIALISE MEAN, VARIANCE AND PRIOR PROBABILITIES
        self._priors = np.zeros(n_class, dtype=np.float64)
        self._var = np.zeros((n_class, n_feat), dtype=np.float64)
        self._mean = np.zeros((n_class, n_feat), dtype=np.float64)

        # Index by position, not by label value: `_predict` looks the statistics
        # up by position, and a label such as 7 is not a valid row index when
        # there are only two classes.
        for idx, cl in enumerate(self._class):
            X_cl = X[y == cl]
            self._var[idx, :] = X_cl.var(axis=0)
            self._priors[idx] = X_cl.shape[0] / float(n_samp)
            self._mean[idx, :] = X_cl.mean(axis=0)

    #HELP FUNCTION TO CALCULATE THE PROBABILITY DENSITY
    def _pdf(self, x, class_id):
        """Return the per-feature normal density of `x` under class `class_id`.

        `class_id` is the position of the class in `self._class`.
        """
        variance = self._var[class_id]
        mean = self._mean[class_id]
        prob_den = (np.exp(- (x-mean)**2/(2*variance)))/(np.sqrt(2*np.pi*variance))
        return prob_den

    #HELP FUNCTION TO CALCULATE THE LOG PROBABILITY DENSITY
    def _log_pdf(self, x, class_id):
        """Return the per-feature log-density of `x` under class `class_id`.

        Computed directly in log space: taking np.log of `_pdf` yields -inf as
        soon as the density underflows to zero, which makes all classes tie for
        samples far from every class mean.
        NOTE: a feature with zero variance in a class still produces non-finite
        values, because the variance is used as a divisor.
        """
        variance = self._var[class_id]
        mean = self._mean[class_id]
        return -0.5 * np.log(2 * np.pi * variance) - (x - mean) ** 2 / (2 * variance)

    #HELP FUNCTION TO PREDICT THE MOST PROBABLE LABEL
    def _predict(self, x):
        """Return the label with the largest log-posterior for one sample `x`."""
        posterior = []
        for idx in range(len(self._class)):
            # log P(class) + sum over features of log p(feature | class)
            class_cond = np.sum(self._log_pdf(x, idx))
            prior = np.log(self._priors[idx])
            posterior.append(prior + class_cond)

        return self._class[np.argmax(posterior)]

    #PREDICT THE LABELS OF THE DATA
    def predict(self, X):
        """Return a list with the predicted label for every row of `X`."""
        y_pred = [self._predict(x) for x in X]
        return y_pred
    
#RUN THE NAIVE BAYES CLASSIFIER
if __name__ == "__main__" :
    # Hand the split arrays to the classifier directly. Rebinding the global
    # `data_train` here would overwrite the raw training images, so the PCA cell
    # could no longer be re-run after this cell.
    ba = NaiveBayes()
    #TRAIN THE CLASSIFIER
    ba.train(train_data, train_label)
    
   

CPU times: user 19.5 ms, sys: 0 ns, total: 19.5 ms
Wall time: 24.2 ms


VALIDATE THE NAIVE BAYES CLASSIFIER USING VALIDATE DATA

In [None]:
%%time
#SCORE THE NAIVE BAYES CLASSIFIER ON THE VALIDATION PARTITION
# ba.predict returns a plain list; comparing it with the NumPy label array is
# element-wise, so the sum counts the correct predictions.
predictions = ba.predict(validate_data)
print("Naive Bayes classifiers accuracy on validation data is: ", sum(predictions == validate_label) / len(validate_label))

Naive Bayes classifiers accuracy on validation data is:  0.7536666666666667
CPU times: user 1.67 s, sys: 4.99 ms, total: 1.67 s
Wall time: 1.67 s


TEST THE NAIVE BAYES CLASSIFIER ON THE FIRST 2000 ROWS OF THE TEST DATASET

In [None]:
%%time
#SCORE THE NAIVE BAYES CLASSIFIER ON THE FIRST 2000 ROWS OF THE TEST DATASET
# data_try holds those rows; label_test supplies the labels they are compared with.
y_pred = ba.predict(data_try)
print("The accuracy for naive bayes classifier on 2000 values of test data is" , sum(y_pred == label_test) / len(label_test))

The accuracy for naive bayes classifier on 2000 values of test data is 0.729
CPU times: user 745 ms, sys: 53.9 ms, total: 799 ms
Wall time: 732 ms


PREDICT THE LABELS OF TRANSFORMED TEST DATA USING THE NAIVE BAYES CLASSIFIER 

In [None]:
%%time
#LABEL EVERY ROW OF THE TRANSFORMED TEST DATA WITH THE NAIVE BAYES CLASSIFIER
# ba.predict returns a list, so convert it to an array before reading its shape.
predictions_bayes = ba.predict(data_test_transform)
predictions_bayes = np.asarray(predictions_bayes)
print("The shape of the predicted data is", np.shape(predictions_bayes))
# Accuracy on the full test set can be printed once its labels are available
# (`labels_of_full_dataset` is not defined in this notebook):
#print("The accuracy of naive bayes classifier on the test data is" , sum(predictions_bayes == labels_of_full_dataset)/labels_of_full_dataset.shape[0])

The shape of the predicted data is (5000,)
CPU times: user 1.52 s, sys: 41.7 ms, total: 1.56 s
Wall time: 1.51 s


LOGISTIC REGRESSION CLASSIFIER

In [None]:
%%time
import pandas as pd

#CROSS ENTROPY CALCULATION
def loss(p, p_hat):
    """Return the cross-entropy between target `p` and predicted probabilities `p_hat`.

    Computed as the negated dot product of `p` with the element-wise natural
    logarithm of `p_hat`.
    """
    log_likelihood = np.vdot(p, np.log(p_hat))
    return -log_likelihood

#TOTAL CROSS-ENTROPY LOSS OVER A DATA SET
def evaluation(X, y, b):
    """Return the summed cross-entropy loss of weights `b` over all rows of `X`.

    For each row, the class probabilities are softmax(b @ row) and the loss is
    taken against the matching row of `y`. Despite the `lr_rate` naming used by
    callers, the value is a loss total, not a learning rate.
    """
    total = 0.0
    for idx in range(X.shape[0]):
        probabilities = softmax(b @ X[idx])
        total += loss(y[idx], probabilities)
    return total

#SOFTMAX VALUE CALCULATION
def softmax(z):
    """Return the softmax of the scores in `z`, normalised over all elements.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (and the ratio from becoming NaN) when the scores are large.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(z)
    exp_shifted = np.exp(z - np.max(z))
    return exp_shifted / np.sum(exp_shifted)

#GRADIENT DESCENT CALCULATION
def GradientDescent(X, y, alpha, num_epocs=8, seed=None):
    """Fit softmax-regression weights with per-sample stochastic gradient descent.

    Parameters
    ----------
    X : 2-D array with one sample per row. A leading column of ones is inserted
        here so the first weight column acts as the intercept.
    y : 2-D array with one target row per sample; `y.shape[1]` sets the number
        of weight rows.
    alpha : step size applied to every per-sample gradient.
    num_epocs : number of passes over the data (default 8, the value that was
        previously hard-coded).
    seed : optional seed for the per-epoch shuffling. When None (the default)
        the shuffle uses NumPy's global random state, as before; when given, a
        dedicated np.random.RandomState(seed) makes the run repeatable.

    Returns
    -------
    b : weight matrix of shape (y.shape[1], X.shape[1] + 1).
    lr_rate_vals : list with the value of evaluation(X, y, b) recorded at the
        start of each epoch.
    """
    n_samp, n_feat = X.shape
    label_col = y.shape[1]
    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.RandomState(seed).permutation
    lr_rate_vals = []
    #AUGMENT X
    X = np.insert(X, 0, 1, axis=1)
    b = np.zeros((label_col, n_feat + 1))

    for _ in range(num_epocs):
        # Visit the samples in a fresh random order every epoch.
        perm = permute(n_samp)
        lr_rate_vals.append(evaluation(X, y, b))
        for r in perm:
            x = X[r]
            prob = softmax(b @ x)
            # Gradient of the cross-entropy loss for this single sample.
            grad_Li = np.outer(prob - y[r], x)
            b -= alpha * grad_Li
    return b, lr_rate_vals

#FUNCTION TO PREDICT LABELS OF DATA
def predict(X, b):
    """Return a list with the most probable class index for every row of `X`.

    A leading column of ones is inserted into `X`, and each row's class
    probabilities are softmax(b @ row); the index of the largest is reported.
    """
    augmented = np.insert(X, 0, 1, axis=1)
    return [np.argmax(softmax(b @ row)) for row in augmented]

#TRAIN THE CLASSIFIER
# GradientDescent shuffles the samples with NumPy's global random generator, so
# fix its seed first; without it every run of this cell gives different weights
# and accuracies.
np.random.seed(0)
# One-hot encode the labels: one column per distinct label value.
train_label1 = pd.get_dummies(train_label).values
b,lr_rate_vals = GradientDescent(train_data,train_label1, alpha = 0.0001)

#VALIDATE THE CLASSIFIER ON VALIDATE_DATA DATASET 
# predict returns a plain list; comparing it with the NumPy label array is
# element-wise, so the sum counts the correct predictions.
prediction = predict(validate_data, b)
print("Logistic Regresion classifiers accuracy on validation data is: ", sum(prediction == validate_label)/validate_label.shape[0])

#TEST THE LOGISTIC REGRESSION CLASSIFIER ON FIRST 2000 VALUES OF TEST DATASET
y_pred = predict(data_try,b)
print("The accuracy of logistic regression classifier on 2000 instances of test data is" , sum(y_pred == label_test)/label_test.shape[0])

Logistic Regresion classifiers accuracy on validation data is:  0.8453333333333334
The accuracy of logistic regression classifier on 2000 instances of test data is 0.829
CPU times: user 8.41 s, sys: 7.86 ms, total: 8.42 s
Wall time: 8.43 s


PREDICT THE LABELS OF TRANSFORMED TEST DATA USING THE LOGISTIC REGRESSION CLASSIFIER



In [None]:
%%time
#LABEL EVERY ROW OF THE TRANSFORMED TEST DATA WITH THE LOGISTIC REGRESSION CLASSIFIER
# predict returns a list, so convert it to an array before reading its shape.
predictions_logisticreg = predict(data_test_transform, b)
predictions_logisticreg = np.asarray(predictions_logisticreg)
print("The shape of the predicted data is", np.shape(predictions_logisticreg))
# Accuracy on the full test set can be printed once its labels are available
# (`labels_of_full_dataset` is not defined in this notebook):
#print("The accuracy of logistic regression classifier on the test data is" , sum(predictions_logisticreg == labels_of_full_dataset)/labels_of_full_dataset.shape[0])

The shape of the predicted data is (5000,)
CPU times: user 83.4 ms, sys: 968 µs, total: 84.4 ms
Wall time: 83.9 ms


WRITE THE OUTPUT OF THE CLASSIFIER WITH THE HIGHEST ACCURACY (KNN) TO AN OUTPUT FILE

In [None]:
import numpy as np
# Save the k-NN predictions for the whole test set as dataset 'Output' in
# Output.h5; opening with mode 'w' replaces any existing file of that name.
with h5py.File('Output.h5', mode='w') as H:
    H.create_dataset(name='Output', data=predictions_knn)