# Imports

In [1]:
%matplotlib inline

import util
import numpy as np
import zipfile
import os
import doctest
import matplotlib.pyplot as plt
import copy

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Constants

In [2]:
DATUM_WIDTH = 0 # in pixels
DATUM_HEIGHT = 0 # in pixels

# Module Classes

In [3]:
# A datum is a pixel-level encoding of digits or face/non-face edge maps.

# Digits are from the MNIST dataset and face images are from the 
# easy-faces and background categories of the Caltech 101 dataset.


# Each digit is 28x28 pixels, and each face/non-face image is 60x74 
# pixels, each pixel can take the following values:
# 0: no edge (blank)
# 1: gray pixel (+) [used for digits only]
# 2: edge [for face] or black pixel [for digit] (#)

# Pixel data is stored in the 2-dimensional array pixels, which
# maps to pixels on a plane according to standard euclidean axes
# with the first dimension denoting the horizontal and the second
# the vertical coordinate:

# 28 # # # #      #  #
# 27 # # # #      #  #
#  .
#  .
#  .
#  3 # # + #      #  #
#  2 # # # #      #  #
#  1 # # # #      #  #
#  0 # # # #      #  #
#    0 1 2 3 ... 27 28

# For example, the + in the above diagram is stored in pixels[2][3], or
# more generally pixels[column][row].

# The contents of the representation can be accessed directly
# via the getPixel and getPixels methods.

In [4]:
class Datum:
    # Create a new datum from file input (standard MNIST encoding).
    def __init__(self, data, width, height):
        DATUM_HEIGHT = height
        DATUM_WIDTH= width
        self.height = DATUM_HEIGHT
        self.width = DATUM_WIDTH
        if data == None:
              data = [[' ' for i in range(DATUM_WIDTH)] for j in range(DATUM_HEIGHT)] 
        self.pixels = util.arrayInvert(convertToInteger(data)) 

    # Returns the value of the pixel at column, row as 0, or 1.
    def getPixel(self, column, row):
        return self.pixels[column][row]

    # Returns all pixels as a list of lists.
    def getPixels(self):
        return self.pixels
    
    def setPixels(self, pixels):
        self.pixels = pixels

    # Renders the data item as an ascii image.
    def getAsciiString(self):
        rows = []
        data = util.arrayInvert(self.pixels)
        for row in data:
            ascii = map(asciiGrayscaleConversionFunction, row)
            rows.append( "".join(ascii))
        return "\n".join(rows)

    def __str__(self):
        return self.getAsciiString()

# Data processing, cleanup and display functions

In [5]:
#   Reads n data images from a file and returns a list of Datum objects.
#   (Return less then n items if the end of file is encountered).
def loadDataFile(filename, n,width,height): 
    DATUM_WIDTH =width
    DATUM_HEIGHT=height
    fin = readlines(filename)
    fin.reverse()
    items = []
    for i in range(n):
        data = []
        for j in range(height):
            data.append(list(fin.pop()))
        if len(data[0]) < DATUM_WIDTH-1:
            # we encountered end of file...
            print ("Truncating at %d examples (maximum)" % i)
            break
        items.append(Datum(data, DATUM_WIDTH, DATUM_HEIGHT))
    return items

In [6]:
# Opens a file or reads it from the zip archive data.zip
def readlines(filename):
      if(os.path.exists(filename)): 
        return [l[:-1] for l in open(filename).readlines()]
      else: 
        print(os.getcwd())
        z = zipfile.ZipFile('./data.zip')
        liste= z.read(filename).decode("utf-8").split("\n")
        print(len(liste))
        return liste

In [7]:
#   Reads n labels from a file and returns a list of integers.
def loadLabelsFile(filename, n):
    fin = readlines(filename)
    labels = []
    for line in fin[:min(n, len(fin))]:
        if line == '':
            break
        labels.append(int(line))
    return labels

In [8]:
#   Helper function for display purposes.
def asciiGrayscaleConversionFunction(value):
    if(value == 0):
        return ' '
    elif(value == 1):
        return '+'
    elif(value == 2):
        return '#'    

In [9]:
# #   Helper function for file reading.
# def IntegerConversionFunction(character):
#     if(character == ' '):
#         return 0
#     elif(character == '+'):
#         return 1
#     elif(character == '#'):
#         return 2    

In [10]:
#   Helper function for file reading.
def IntegerConversionFunction(character):
    if(character == '+'):
        return 1
    elif(character == '#'):
        return 2
    else:
        return 0

In [11]:
#   Helper function for file reading.
def convertToInteger(data):
    if type(data) != type([]):
        return IntegerConversionFunction(data)
    else:
        return list(map(convertToInteger, data))

In [12]:
def fixFaceDataset(items):
    new_items = copy.deepcopy(items)
    
    list_zeros = [0] * 70
    all_pixels = list()
    
    for i in range(len(new_items)):
        all_pixels = items[i].pixels[0:60]
        
        for j in range(10):
            all_pixels.append(list_zeros)
            
        new_items[i].setPixels(all_pixels)
        
    return new_items

# Digit Dataset

## Training Dataset

In [13]:
digit_training_amount = 5000
digit_training_labels = loadLabelsFile("./data/digitdata/traininglabels", digit_training_amount)
digit_training_items = loadDataFile("./data/digitdata/trainingimages", digit_training_amount,28,28)
digit_training_features= np.array([np.array(item.getPixels()).flatten() for item in digit_training_items])

## Validation Dataset

In [14]:
digit_validation_amount = 1000
digit_validation_labels = loadLabelsFile("./data/digitdata/validationlabels", digit_validation_amount)
digit_validation_items = loadDataFile("./data/digitdata/validationimages", digit_validation_amount, 28, 28)
digit_validation_features= np.array([np.array(item.getPixels()).flatten() for item in digit_validation_items])

## Test Dataset

In [15]:
digit_test_amount = 1000
digit_test_labels = loadLabelsFile("./data/digitdata/testlabels", digit_test_amount)
digit_test_items = loadDataFile("./data/digitdata/testimages", digit_test_amount, 28, 28)
digit_test_features= np.array([np.array(item.getPixels()).flatten() for item in digit_test_items])

# Face dataset

## Training Dataset

In [16]:
face_training_amount = 451
face_training_labels = loadLabelsFile("./data/facedata/facedatatrainlabels", face_training_amount)
face_training_items = fixFaceDataset(loadDataFile("./data/facedata/facedatatrain", face_training_amount,60,70))
face_training_features= np.array([np.array(item.getPixels()).flatten() for item in face_training_items])

## Validation Dataset

In [17]:
face_validation_amount = 301
face_validation_labels = loadLabelsFile("./data/facedata/facedatavalidationlabels", face_validation_amount)
face_validation_items = fixFaceDataset(loadDataFile("./data/facedata/facedatavalidation", face_validation_amount,60,70))
face_validation_features= np.array([np.array(item.getPixels()).flatten() for item in face_validation_items])

## Test Dataset

In [18]:
face_test_amount = 150
face_test_labels = loadLabelsFile("./data/facedata/facedatatestlabels", face_test_amount)
face_test_items = fixFaceDataset(loadDataFile("./data/facedata/facedatatest", face_test_amount,60,70))
face_test_features= np.array([np.array(item.getPixels()).flatten() for item in face_test_items])

# Naive Bayes Classifier

In [19]:
def predictNaiveBayes(model, data, labels, dataName):
    
    prediction = np.array([])
    correct_prediction = 0

    for i in range(len(data)):
        
        # Predict Output
        predicted= model.predict([data[i]])[0]
        np.append(prediction, predicted)
        
        if predicted == labels[i]:
            correct_prediction += 1
        
    print ("Accuracy of the Naive Bayes on the {}".format(dataName),
           "dataset is {0:.2f} %".format(correct_prediction*100/len(data)))
    
    return prediction

## Digit Dataset

In [20]:
# Create a Gaussian Classifier
digit_NB_model = GaussianNB()

# Train the model using the training sets
digit_NB_model.fit(digit_training_features, digit_training_labels)

digit_training_prediction = predictNaiveBayes(digit_NB_model, digit_training_features, digit_training_labels, "Training")

digit_validation_prediction = predictNaiveBayes(digit_NB_model, digit_validation_features, 
                                                digit_validation_labels, "Validation")

digit_test_prediction = predictNaiveBayes(digit_NB_model, digit_test_features, digit_test_labels, "Test")

Accuracy of the Naive Bayes on the Training dataset is 60.50 %
Accuracy of the Naive Bayes on the Validation dataset is 55.60 %
Accuracy of the Naive Bayes on the Test dataset is 50.90 %


## Face Dataset

In [21]:
# Create a Gaussian Classifier
face_NB_model = GaussianNB()

# Train the model using the training sets
face_NB_model.fit(face_training_features, face_training_labels)

face_training_prediction = predictNaiveBayes(face_NB_model, face_training_features, face_training_labels, "Training")
face_validation_prediction = predictNaiveBayes(face_NB_model, face_validation_features, face_validation_labels, "Validation")
face_test_prediction = predictNaiveBayes(face_NB_model, face_test_features[:17], face_test_labels, "Test")

Accuracy of the Naive Bayes on the Training dataset is 94.01 %
Accuracy of the Naive Bayes on the Validation dataset is 84.72 %
Accuracy of the Naive Bayes on the Test dataset is 94.12 %


# KNN (K Nearest Neighbors)

In [22]:
def predictKNN(k,model, data, labels, dataName):
    
    prediction = np.array([])
    correct_prediction = 0

    for i in range(len(data)):
        
        # Predict Output
        predicted= model.predict([data[i]])[0]
        np.append(prediction, predicted)
        
        if predicted == labels[i]:
            correct_prediction += 1
        
    accuracy = correct_prediction*100/len(data)    
    print ("Accuracy of KNN with k={} on the {}".format(k, dataName),
           "dataset is {0:.2f} %".format(accuracy))
    
    return prediction, accuracy

## Digit dataset 

In [None]:
highestfull_k = 2
highestfull_accuarcy = 0

highest_k = 2
highest_accuarcy = 0
for k in range(2, digit_training_amount):
    # Create a KNN Classifier
    digit_KNN_model = KNeighborsClassifier(n_neighbors=k)

    # Train the model using the training sets
    digit_KNN_model.fit(digit_training_features, digit_training_labels)

    digit_training_prediction, acc1 = predictKNN(k, digit_KNN_model, digit_training_features, digit_training_labels, "Training")
    digit_validation_prediction, acc2 = predictKNN(k, digit_KNN_model, digit_validation_features, digit_validation_labels, "Validation")
    digit_test_prediction, acc3 = predictKNN(k, digit_KNN_model, digit_test_features, digit_test_labels, "Test")
    
    if (highestfull_accuarcy < acc1+acc2+acc3):
        highestfull_accuarcy = acc1+acc2+acc3
        highestfull_k = k
    
    if (highest_accuarcy < acc2+acc3):
        highest_accuarcy = acc2 + acc3
        highest_k = k
    
    print ("k={} with highest accuarcy of {}/200".format(highest_k, highest_accuarcy))
    print ("k={} with Full accuarcy of {}/300".format(highestfull_k, highestfull_accuarcy))

    print("\n \n")

In [None]:
k = 3
# Create a KNN Classifier
digit_KNN_model = KNeighborsClassifier(n_neighbors=k)

# Train the model using the training sets
digit_KNN_model.fit(digit_training_features, digit_training_labels)

digit_training_prediction = predictKNN(k, digit_KNN_model, digit_training_features, digit_training_labels, "Training")[0]
digit_validation_prediction = predictKNN(k, digit_KNN_model, digit_validation_features, digit_validation_labels, "Validation")[0]
digit_test_prediction = predictKNN(k, digit_KNN_model, digit_test_features, digit_test_labels, "Test")[0]
print("\n \n")

## Face dataset

In [None]:
highestfull_k = 2
highestfull_accuarcy = 0

highest_k = 2
highest_accuarcy = 0
for k in range(2,face_training_amount):

    # Create a KNN Classifier
    face_KNN_model = KNeighborsClassifier(n_neighbors=k)

    # Train the model using the training sets
    face_KNN_model.fit(face_training_features, face_training_labels)

    
    face_training_prediction, acc1 = predictKNN(k, face_KNN_model, face_training_features, face_training_labels, "Training")
    face_validation_prediction, acc2 = predictKNN(k, face_KNN_model, face_validation_features, face_validation_labels, "Validation")
    face_test_prediction, acc3 = predictKNN(k, face_KNN_model, face_test_features, face_test_labels, "Test")
    
    if (highestfull_accuarcy < acc1+acc2+acc3):
        highestfull_accuarcy = acc1+acc2+acc3
        highestfull_k = k
    
    if (highest_accuarcy < acc2+acc3):
        highest_accuarcy = acc2 + acc3
        highest_k = k
    
    print ("k={} with highest accuarcy of {}/200".format(highest_k, highest_accuarcy))
    print ("k={} with Full accuarcy of {}/300".format(highestfull_k, highestfull_accuarcy))

    print("\n \n")

In [None]:
k = 109

# Create a KNN Classifier
face_KNN_model = KNeighborsClassifier(n_neighbors=k)

# Train the model using the training sets
face_KNN_model.fit(face_training_features, face_training_labels)

face_training_prediction = predictKNN(k, face_KNN_model, face_training_features, face_training_labels, "Training")[0]
face_validation_prediction = predictKNN(k, face_KNN_model, face_validation_features, face_validation_labels, "Validation")[0]
face_test_prediction = predictKNN(k, face_KNN_model, face_test_features, face_test_labels, "Test")[0]

In [None]:
# n = 4
# print(face_training_items[n])
# print(len(face_training_items))

# #Predict Output
# predicted= face_NB_model.predict([face_training_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# n = 4
# print(face_validation_items[n])
# print(len(face_validation_items))

# #Predict Output
# predicted= face_NB_model.predict([face_validation_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# n = 16
# print(face_test_items[n])
# print(len(face_test_items))

# #Predict Output
# predicted= face_NB_model.predict([face_test_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# n = 2
# print(digit_training_items[n])
# print(len(digit_training_items))

# #Predict Output
# predicted= digit_NB_model.predict([digit_training_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# n = 1
# print(digit_validation_items[n])
# print(len(digit_validation_items))

# #Predict Output
# predicted= digit_NB_model.predict([digit_validation_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# n = 4
# print(digit_test_items[n])
# print(len(digit_test_items))

# # Predict Output
# predicted= digit_NB_model.predict([digit_test_features[n]])[0]
# print ("Predicted Value:", predicted)

In [None]:
# pca_reduction = PCA(n_components=1)
# pca_reduction.fit(training_features)
# training_features_pca = pca_reduction.transform(training_features)
# new_training_features = pca_reduction.inverse_transform(training_features_pca)
# training_prediction = predictNaiveBayes(new_training_features, training_labels, "Training")

In [None]:
# def _test():
#     doctest.testmod() # Test the interactive sessions in function comments
    
#     n = 451 # face data limit
#     items = fixFaceDataset(loadDataFile("./data/facedata/facedatatrain", n,60,70))
#     labels = loadLabelsFile("./data/facedata/facedatatrainlabels", n)

# #     n = 5000 # digit data limit
# #     items = loadDataFile("./data/digitdata/trainingimages", n,28,28)
# #     labels = loadLabelsFile("./data/digitdata/traininglabels", n)

    
# #     for i in range(1):
# #         print (items[i].getAsciiString())
# #         print (items[i])
# #         print (items[i].height)
# #         print (items[i].width)
# #         print (dir(items[i]))
# #         print ((items[i].getPixels()))
# #         print (len(items[i].getPixels()))

    
# _test()

In [None]:
# def pca_feature(feature):
# #     print(feature)
#     pca_reduction = PCA(n_components=1)
#     pca_reduction.fit(feature)
#     feature_pca = pca_reduction.transform(feature)
#     print(feature_pca.shape)
# #     new_feature = pca_reduction.inverse_transform(feature_pca)
# #     print(new_feature)
#     return feature_pca