## K-NN Classifier (run the cells sequentially)

In [1]:
import h5py
import numpy as np
from collections import Counter
import time
import pandas as pd

# load data .h5 to numpy array
def load_data():
    """Read the training and test labels/images from the HDF5 files in ../Input.

    Each dataset is copied into a numpy array before its file is closed.

    Returns:
        The tuple ``(trainLabel, trainData, testLabel, testData)``.
    """

    def read_dataset(path, key):
        # Copy the dataset stored under `key` out of the file at `path`.
        with h5py.File(path, 'r') as handle:
            return np.copy(handle[key])

    # Labels live under the 'label' key, images under the 'data' key.
    trainLabel = read_dataset('../Input/labels_training.h5', 'label')
    trainData = read_dataset('../Input/images_training.h5', 'data')
    testLabel = read_dataset('../Input/labels_testing_2000.h5', 'label')
    testData = read_dataset('../Input/images_testing.h5', 'data')

    return trainLabel, trainData, testLabel, testData
    

# preprocess training & test data: flatten and reshaping    
def preprocess_data(trainData, testData):
    """Scale pixel values by 1/255 and flatten every image into a single row.

    The number of rows and features is taken from the input itself, so the
    function works for any number of images and any image size (the
    original 30000/5000 images of 28*28 pixels give 784 features per row).

    Args:
        trainData: array-like of training images, first axis = image index.
        testData: array-like of test images, first axis = image index.

    Returns:
        The tuple ``(newTrain, newTest)`` of 2-D float arrays with one
        flattened image per row.
    """

    def scale_and_flatten(images):
        # Dividing by 255.0 maps 0-255 pixel values onto 0-1 floats.
        images = np.asarray(images) / 255.0
        sampleCount = images.shape[0]
        # Product of the remaining axes; spelled out (rather than -1) so that
        # an empty batch still reshapes cleanly.
        featureCount = int(np.prod(images.shape[1:]))
        return images.reshape(sampleCount, featureCount)

    newTrain = scale_and_flatten(trainData)
    newTest = scale_and_flatten(testData)

    return newTrain, newTest

# create method to get distance from vector test to vector train
def get_distances(newTest, train=None):
    """Return the Euclidean distance from every test row to every training row.

    The distances are computed without an explicit double loop through the
    expansion ``|a - b|^2 = |a|^2 - 2 a.b + |b|^2``.

    Args:
        newTest: 2-D array with one flattened test sample per row.
        train: optional 2-D array with one flattened training sample per
            row. When omitted, the module-level ``newTrain`` is used.

    Returns:
        A plain 2-D ndarray of shape (test rows, training rows). It is no
        longer an ``np.matrix``; entry ``[i, j]`` is still the distance
        between test row ``i`` and training row ``j``.
    """
    if train is None:
        train = newTrain

    newTest = np.asarray(newTest)
    train = np.asarray(train)

    # squared norm of every test row -> shape (test rows,)
    totalTest = np.square(newTest).sum(axis=1)

    # squared norm of every training row -> shape (training rows,)
    totalTrain = np.square(train).sum(axis=1)

    # dot product of every test row with every training row
    dotProduct = np.dot(newTest, train.T)

    # Broadcasting the test norms as a column replaces the np.matrix transpose.
    squared = totalTest[:, np.newaxis] - 2 * dotProduct + totalTrain

    # Rounding can leave tiny negative values for (near-)identical rows;
    # sqrt would turn them into NaN, which argsort ranks last.
    squared = np.maximum(squared, 0)

    return np.sqrt(squared)
    
def get_prediction(newTest, k):
    """Predict a label for each row of newTest from its k nearest neighbours.

    Distances come from get_distances(); for each test row the training
    labels (module-level trainLabel) are ordered by increasing distance and
    the most frequent label among the first k is chosen.

    Args:
        newTest: 2-D array with one flattened test sample per row.
        k: number of nearest neighbours that vote.

    Returns:
        1-D float array (created with np.zeros) holding one predicted label
        per test row.
    """
    distances = get_distances(newTest)

    # one prediction slot per row of the distance matrix
    rowCount = distances.shape[0]
    predictedClass = np.zeros(rowCount)

    for row in range(rowCount):
        # training indices ordered from nearest to farthest
        nearestFirst = np.argsort(distances[row, :])

        # the same ordering expressed as labels, as a flat array
        rankedLabels = trainLabel[nearestFirst].flatten()

        # majority vote among the k closest labels
        votes = Counter(rankedLabels[:k])
        winner, _ = votes.most_common(1)[0]
        predictedClass[row] = winner

    return predictedClass

def export_CSV():
    """Write predictions alongside the actual test labels to a CSV file.

    Reads the module-level ``predictions`` and ``testLabel`` and writes
    ``predictions_knn2000.csv`` in the working directory, one row per
    prediction with a 1-based ImageId. ``testLabel`` is indexed by position,
    so it must be at least as long as ``predictions``.
    """
    # The context manager closes the file even if a row fails to format.
    with open("predictions_knn2000.csv", "w") as out_file:
        out_file.write("ImageId,Label,Actual \n")
        for i, predicted in enumerate(predictions):
            out_file.write(str(i+1) + "," + str(int(predicted)) + "," + str(testLabel[i])+ "\n")
    
def export_CSV2():
    """Write the predictions (without actual labels) to a CSV file.

    Reads the module-level ``predictions`` and writes
    ``predictions_knn5000.csv`` in the working directory, one row per
    prediction with a 1-based ImageId.
    """
    # The context manager closes the file even if a row fails to format.
    with open("predictions_knn5000.csv", "w") as out_file:
        out_file.write("ImageId,Label\n")
        for i, predicted in enumerate(predictions):
            out_file.write(str(i+1) + "," + str(int(predicted))+ "\n")
    
def predict_2000(batchSize,k):
    """Predict labels for the first half of the batches of newTest.

    The module-level newTest is processed in slices of batchSize rows; only
    the first int(len(newTest)/(2*batchSize)) slices are handled here.
    Progress and the time taken per slice are printed.

    Args:
        batchSize: number of test rows handed to get_prediction() at once.
        k: number of neighbours used for the vote.

    Returns:
        List with the predicted labels in test-row order.
    """
    predictions = []
    batchCount = int(len(newTest)/(2*batchSize))

    for x in range(batchCount):
        first = x * batchSize
        last = first + batchSize
        print("Starting prediction: " + str(last) + " of " + str(int(len(newTest))))

        start = time.time()
        batchLabels = get_prediction(newTest[first:last], k)
        elapsed = time.time() - start

        predictions.extend(batchLabels)
        print("Completed in " + str(round(elapsed,3)) + " Secs.")

    return predictions

def predict_5000(batchSize, k, predictions):
    """Predict labels for the batches of newTest that predict_2000 skips.

    Continues at slice int(len(newTest)/(2*batchSize)) of the module-level
    newTest and runs up to slice int(len(newTest)/batchSize). Progress and
    the time taken per slice are printed.

    Args:
        batchSize: number of test rows handed to get_prediction() at once.
        k: number of neighbours used for the vote.
        predictions: labels predicted so far; the new labels are appended to
            a new list, the argument itself is not modified.

    Returns:
        The earlier predictions followed by the newly predicted labels.
    """
    firstBatch = int(len(newTest)/(2*batchSize))
    batchLimit = int(len(newTest)/batchSize)

    for x in range(firstBatch, batchLimit):
        first = x * batchSize
        last = first + batchSize
        print("Starting prediction: " + str(last) + " of " + str(int(len(newTest))))

        start = time.time()
        batchLabels = get_prediction(newTest[first:last], k)
        elapsed = time.time() - start

        # concatenation (not extend) so the caller's object stays untouched
        predictions = predictions + list(batchLabels)
        print("Completed in " + str(round(elapsed,3)) + " Secs.")

    return predictions


# construct confusion matrix for analysis

def calConfusionMatrix(predictions):
    """Build the confusion matrix of predictions against the test labels.

    Rows are indexed by the actual label (module-level testLabel), columns
    by the predicted label. The matrix is square, sized by the number of
    distinct values in the module-level trainLabel. Pairing stops at the
    shorter of testLabel and predictions.

    Args:
        predictions: sequence of predicted labels; converted to int.

    Returns:
        2-D float array of counts.
    """
    predicted = np.asarray(predictions).astype(int)

    # one row/column per distinct training label
    classCount = np.unique(trainLabel).size
    cm = np.zeros((classCount, classCount))

    # count every (actual, predicted) pair
    for actual, guess in zip(testLabel, predicted):
        cm[actual, guess] += 1

    return cm


# get the analysis based on matrix
def predictions_analysis(confMatrix):
    """Print accuracy and per-class precision, recall and F1-score.

    The metrics are derived from a square confusion matrix whose rows are
    the actual classes and whose columns are the predicted classes. The
    number of classes and the number of samples are taken from the matrix
    itself, so any matrix size and any sample count are supported.

    Args:
        confMatrix: square array-like of counts.

    Returns:
        None; the results are printed.
    """
    # work on a copy of the supplied matrix
    newMatrix = np.array(confMatrix)

    # true positives: the diagonal
    TP = np.diag(newMatrix)

    # false negatives: row total without the diagonal entry
    FN = np.sum(newMatrix, axis=1) - TP

    # false positives: column total without the diagonal entry
    FP = np.sum(newMatrix, axis=0) - TP

    # per-class precision, recall and F1 (NaN where a denominator is zero)
    precision = pd.DataFrame((TP/(TP+FP)),columns=['Precision'])
    i = np.array(precision)
    recall = pd.DataFrame((TP/(TP+FN)), columns=['Recall'])
    j = np.array(recall)
    F1 = pd.DataFrame((2*(i*j)/(i+j)),columns=['F1-Score'])

    # accuracy over all samples counted in the matrix (not a fixed 2000)
    total = np.sum(newMatrix)
    accuracy = (np.sum(TP)/total)*100 if total else float('nan')

    print ("\n Accuracy: "+str(accuracy) +"%" +"\n")
    print (round(precision,3))
    print()
    print (round(recall,3))
    print()
    print (round(F1,3))

In [2]:
# Step 1: load the training/test labels and images from the .h5 files
trainLabel, trainData, testLabel, testData=load_data()

In [3]:
# Step 2: normalise the pixel values and flatten every image into one row
newTrain,newTest=preprocess_data(trainData,testData)

In [4]:
# Step 3: sanity check - print the shapes of the flattened training data,
# the flattened test data and the training labels
print(newTrain.shape, newTest.shape, trainLabel.shape)

(30000, 784) (5000, 784) (30000,)


In [5]:
# Step 4: set the batch size (test rows predicted per call) and K
# (number of nearest neighbours that vote)
batchSize = 1000
k=3

In [6]:
# Step 5: predict the first 2000 test rows and build the confusion matrix
predictions=predict_2000(batchSize,k)
# optional: uncomment to write predictions_knn2000.csv
#export_CSV()
confMatrix=calConfusionMatrix(predictions)
print(confMatrix)

Starting prediction: 1000 of 5000
Completed in 3.216 Secs.
Starting prediction: 2000 of 5000
Completed in 3.111 Secs.
[[148.   0.   3.   7.   1.   0.  18.   0.   1.   0.]
 [  1. 187.   0.   3.   0.   0.   0.   0.   0.   0.]
 [  4.   0. 169.   1.  18.   0.  18.   0.   0.   0.]
 [  5.   0.   6. 170.   7.   0.   3.   0.   0.   0.]
 [  0.   0.  24.   7. 158.   0.  22.   0.   1.   0.]
 [  0.   0.   0.   0.   0. 162.   1.  25.   1.  25.]
 [ 35.   0.  27.   6.  20.   0. 112.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   1.   0. 188.   0.   9.]
 [  1.   0.   2.   2.   2.   0.   1.   0. 210.   1.]
 [  0.   0.   0.   0.   0.   0.   1.   5.   0. 181.]]


In [7]:
# Step 6: print accuracy, precision, recall and F1 derived from the confusion matrix
predictions_analysis(confMatrix)


 Accuracy: 84.25%

   Precision
0      0.763
1      1.000
2      0.732
3      0.867
4      0.767
5      0.994
6      0.636
7      0.862
8      0.986
9      0.838

   Recall
0   0.831
1   0.979
2   0.805
3   0.890
4   0.745
5   0.757
6   0.560
7   0.949
8   0.959
9   0.968

   F1-Score
0     0.796
1     0.989
2     0.766
3     0.879
4     0.756
5     0.859
6     0.596
7     0.904
8     0.972
9     0.898


In [8]:
# Step 7: predict the remaining 3000 test rows and append them to the
# predictions from step 5
predictions=predict_5000(batchSize,k, predictions)
# optional: uncomment to export all 5000 predictions to CSV (no actual-label column)
#export_CSV2()

Starting prediction: 3000 of 5000
Completed in 3.13 Secs.
Starting prediction: 4000 of 5000
Completed in 3.123 Secs.
Starting prediction: 5000 of 5000
Completed in 3.251 Secs.


In [9]:
## Note: every classifier model writes its output under the same file name - please take note
# Step 8: save the 5000 predicted labels to .h5 as integers
predictions = np.asarray(predictions).astype(int)
with h5py.File('../Output/predicted_labels.h5','w') as H:
    H.create_dataset('predictions', data=predictions)

In [10]:
# Step 9: sanity check - read predicted_labels.h5 back and print its shape
# and the first ten labels
with h5py.File('../Output/predicted_labels.h5','r') as H:
    predictions = np.copy(H['predictions'])
# the copy is independent of the file, so the cast can happen after closing it
sanityCheck = predictions.astype(int)
print(sanityCheck.shape, sanityCheck[:10])

(5000,) [3 4 4 1 0 2 4 5 7 6]
