# Imports

In [1]:
%matplotlib inline

import util
import numpy as np
import zipfile
import os
import doctest
import matplotlib.pyplot as plt
import copy

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Constants

In [2]:
# Placeholder image dimensions. The loading code below receives the real
# width and height as explicit arguments (e.g. loadDataFile(..., 28, 28)).
DATUM_WIDTH = 0 # in pixels
DATUM_HEIGHT = 0 # in pixels

# Module Classes

In [3]:
# A datum is a pixel-level encoding of digits or face/non-face edge maps.

# Digits are from the MNIST dataset and face images are from the 
# easy-faces and background categories of the Caltech 101 dataset.


# Each digit is 28x28 pixels, and each face/non-face image is loaded in
# this notebook as 60x70 pixels (width x height). Each pixel takes one of
# the following values:
# 0: no edge (blank)
# 1: gray pixel (+) [used for digits only]
# 2: edge [for face] or black pixel [for digit] (#)

# Pixel data is stored in the 2-dimensional array pixels, which
# maps to pixels on a plane according to standard euclidean axes
# with the first dimension denoting the horizontal and the second
# the vertical coordinate:

# 27 # # # #      #  #
# 26 # # # #      #  #
#  .
#  .
#  .
#  3 # # + #      #  #
#  2 # # # #      #  #
#  1 # # # #      #  #
#  0 # # # #      #  #
#    0 1 2 3 ... 26 27

# For example, the + in the above diagram is stored in pixels[2][3], or
# more generally pixels[column][row].

# The contents of the representation can be accessed directly
# via the getPixel and getPixels methods.

In [4]:
class Datum:
    """One image stored as a grid of integer pixel codes.

    Attributes:
        width:  image width in pixels, as passed to the constructor.
        height: image height in pixels, as passed to the constructor.
        pixels: result of util.arrayInvert(convertToInteger(data)); indexed
                as pixels[column][row] by getPixel.
    """

    # Create a new datum from file input (standard MNIST encoding).
    def __init__(self, data, width, height):
        """Build the datum from `data`, a list of rows of characters.

        When `data` is None a blank grid of `height` rows by `width`
        columns (all ' ') is used instead.
        """
        self.height = height
        self.width = width
        # Identity test: `== None` would be evaluated element-wise (and then
        # fail in the `if`) for array-like inputs.
        if data is None:
            data = [[' ' for _ in range(width)] for _ in range(height)]
        # NOTE(review): arrayInvert presumably turns the row-major input into
        # the column-major layout used by getPixel - confirm in util.
        self.pixels = util.arrayInvert(convertToInteger(data))

    # Returns the value of the pixel at column, row (0, 1 or 2).
    def getPixel(self, column, row):
        return self.pixels[column][row]

    # Returns all pixels as a list of lists.
    def getPixels(self):
        return self.pixels

    # Replaces the stored pixel grid with `pixels` (no copy is made).
    def setPixels(self, pixels):
        self.pixels = pixels

    # Renders the data item as an ascii image.
    def getAsciiString(self):
        # Undo the inversion applied in __init__, then map each integer code
        # back to its display character, one text line per row.
        rows = util.arrayInvert(self.pixels)
        return "\n".join(
            "".join(map(asciiGrayscaleConversionFunction, row)) for row in rows
        )

    def __str__(self):
        return self.getAsciiString()

# Data processing, cleanup and display functions

In [5]:
#   Reads n data images from a file and returns a list of Datum objects.
#   (Returns fewer than n items if the end of file is encountered.)
def loadDataFile(filename, n, width, height):
    """Read up to `n` images of `height` text lines each from `filename`.

    Parameters:
        filename: path handed to readlines().
        n:        maximum number of images to read.
        width:    expected image width in characters.
        height:   number of text lines that make up one image.

    Returns:
        A list of Datum objects, one per complete image. Fewer than `n` are
        returned (after printing a notice) when the input runs out.
    """
    lines = readlines(filename)
    items = []
    for i in range(n):
        start = i * height
        rows = lines[start:start + height]
        # Stop cleanly when the file runs out. Popping lines one by one
        # raised IndexError here instead of truncating. The width test on
        # the first row is the original end-of-file heuristic.
        if not rows or len(rows) < height or len(rows[0]) < width - 1:
            # we encountered end of file...
            print ("Truncating at %d examples (maximum)" % i)
            break
        items.append(Datum([list(row) for row in rows], width, height))
    return items

In [6]:
# Opens a file or reads it from the zip archive data.zip
def readlines(filename):
    """Return the lines of `filename` without their trailing newlines.

    If `filename` exists on disk it is read directly. Otherwise the member
    of that name is read from ./data.zip, decoded as UTF-8 and split on
    "\\n"; in that case the working directory and the line count are printed.

    Returns:
        A list of strings, one per line.
    """
    if os.path.exists(filename):
        # Context manager closes the handle. Only strip a newline that is
        # actually there, so a final line without one keeps its last char.
        with open(filename) as handle:
            return [line[:-1] if line.endswith("\n") else line for line in handle]
    print(os.getcwd())
    with zipfile.ZipFile('./data.zip') as archive:
        liste = archive.read(filename).decode("utf-8").split("\n")
    print(len(liste))
    return liste

In [7]:
#   Reads n labels from a file and returns a list of integers.
def loadLabelsFile(filename, n):
    """Parse at most `n` integer labels from the lines of `filename`.

    Reading stops at the first empty line. Returns the labels as a list of
    ints in file order.
    """
    lines = readlines(filename)
    limit = min(n, len(lines))
    parsed = []
    for entry in lines[:limit]:
        if entry == '':
            # An empty line marks the end of the usable labels.
            return parsed
        parsed.append(int(entry))
    return parsed

In [8]:
#   Helper function for display purposes.
def asciiGrayscaleConversionFunction(value):
    """Map a pixel code to its display character.

    0 -> ' ', 1 -> '+', 2 -> '#'. Any other value yields None.
    """
    for code, glyph in ((0, ' '), (1, '+'), (2, '#')):
        if value == code:
            return glyph
    return None

In [9]:
# #   Helper function for file reading.
# def IntegerConversionFunction(character):
#     if(character == ' '):
#         return 0
#     elif(character == '+'):
#         return 1
#     elif(character == '#'):
#         return 2    

In [10]:
#   Helper function for file reading.
def IntegerConversionFunction(character):
    """Map one input character to its pixel code.

    '+' -> 1, '#' -> 2, anything else -> 0.
    """
    return 1 if character == '+' else (2 if character == '#' else 0)

In [11]:
#   Helper function for file reading.
def convertToInteger(data):
    """Recursively convert a (nested) list of characters to pixel codes.

    Lists are walked element by element; every non-list leaf is passed to
    IntegerConversionFunction. The nesting of the input is preserved.
    """
    if type(data) is list:
        return [convertToInteger(element) for element in data]
    return IntegerConversionFunction(data)

In [12]:
def fixFaceDataset(items, keep_columns=60, pad_columns=10, column_length=70):
    """Return deep copies of `items` whose pixel grids are zero-padded.

    For every item the first `keep_columns` entries of `pixels` are kept and
    `pad_columns` lists of `column_length` zeros are appended. With the
    defaults, a 60-entry grid becomes 70 entries.

    Parameters:
        items:         sequence of objects exposing `.pixels` (a list of
                       lists) and `.setPixels(pixels)`.
        keep_columns:  number of leading `pixels` entries to keep.
        pad_columns:   number of all-zero entries to append.
        column_length: length of each appended all-zero entry.

    Returns:
        A new list of copied items; the inputs are not modified and share
        no pixel lists with the result.
    """
    new_items = copy.deepcopy(items)

    for item in new_items:
        # Slice the copy, not the input, so no inner list is shared with
        # the caller's items.
        kept = item.pixels[:keep_columns]
        # A fresh list per padding entry: reusing one zero list would make
        # a write to one padded entry show up in all of them, in every item.
        padding = [[0] * column_length for _ in range(pad_columns)]
        item.setPixels(kept + padding)

    return new_items

In [13]:
def predictModel(model, data, labels, dataName, modelName):
    """Score `model` on (data, labels), print the result, and return it.

    The accuracy is model.score(data, labels) scaled to a percentage.
    `dataName` and `modelName` are only used in the printed message.
    """
    score = model.score(data, labels)
    accuracy = score * 100

    headline = "Accuracy of the {} on the {}".format(modelName, dataName)
    detail = "dataset is {0:.2f} %".format(accuracy)
    print(headline, detail)

    return accuracy

In [14]:
def getModelPredictions(model, data, labels, dataName, modelName):
    """Predict every sample of `data` one at a time and report the accuracy.

    Parameters:
        model:     object with a predict(list_of_samples) method.
        data:      indexable collection of samples.
        labels:    expected label for each sample, same order as `data`.
        dataName:  dataset name used in the printed message.
        modelName: model name used in the printed message.

    Returns:
        (prediction, accuracy): a 1-D NumPy array of the predicted labels in
        input order, and the percentage of predictions equal to the labels.

    Raises:
        ZeroDivisionError: if `data` is empty.
    """
    # Gather predictions in a plain list; np.append inside the loop copied
    # the whole array on every iteration (quadratic in len(data)).
    predicted_labels = [model.predict([data[i]])[0] for i in range(len(data))]

    correct_prediction = sum(
        1 for i, predicted in enumerate(predicted_labels) if predicted == labels[i]
    )

    # Appending to an empty float array keeps the dtype promotion of the
    # former incremental construction.
    prediction = np.append(np.array([]), predicted_labels)

    accuracy = correct_prediction*100/len(data)

    print ("Accuracy of the {} on the {}".format(modelName, dataName),
           "dataset is {0:.2f} %".format(accuracy))

    return prediction, accuracy

# Digit Dataset

## Training Dataset

In [15]:
# Load up to 5000 digit training labels and 28x28 images, then flatten each
# image's pixel grid into a single feature row per sample.
digit_training_amount = 5000
digit_training_labels = loadLabelsFile("./data/digitdata/traininglabels", digit_training_amount)
digit_training_items = loadDataFile("./data/digitdata/trainingimages", digit_training_amount,28,28)
digit_training_features= np.array([np.array(item.getPixels()).flatten() for item in digit_training_items])

## Validation Dataset

In [16]:
# Load up to 1000 digit validation labels and 28x28 images, flattened to one
# feature row per sample.
digit_validation_amount = 1000
digit_validation_labels = loadLabelsFile("./data/digitdata/validationlabels", digit_validation_amount)
digit_validation_items = loadDataFile("./data/digitdata/validationimages", digit_validation_amount, 28, 28)
digit_validation_features= np.array([np.array(item.getPixels()).flatten() for item in digit_validation_items])

## Test Dataset

In [17]:
# Load up to 1000 digit test labels and 28x28 images, flattened to one
# feature row per sample.
digit_test_amount = 1000
digit_test_labels = loadLabelsFile("./data/digitdata/testlabels", digit_test_amount)
digit_test_items = loadDataFile("./data/digitdata/testimages", digit_test_amount, 28, 28)
digit_test_features= np.array([np.array(item.getPixels()).flatten() for item in digit_test_items])

# Face dataset

## Training Dataset

In [18]:
# Load up to 451 face training labels and images (width 60, height 70), pass
# the images through fixFaceDataset, and flatten each to one feature row.
face_training_amount = 451
face_training_labels = loadLabelsFile("./data/facedata/facedatatrainlabels", face_training_amount)
face_training_items = fixFaceDataset(loadDataFile("./data/facedata/facedatatrain", face_training_amount,60,70))
face_training_features= np.array([np.array(item.getPixels()).flatten() for item in face_training_items])

## Validation Dataset

In [19]:
# Load up to 301 face validation labels and images (width 60, height 70),
# pass the images through fixFaceDataset, and flatten each to one feature row.
face_validation_amount = 301
face_validation_labels = loadLabelsFile("./data/facedata/facedatavalidationlabels", face_validation_amount)
face_validation_items = fixFaceDataset(loadDataFile("./data/facedata/facedatavalidation", face_validation_amount,60,70))
face_validation_features= np.array([np.array(item.getPixels()).flatten() for item in face_validation_items])

## Test Dataset

In [20]:
# Load up to 150 face test labels and images (width 60, height 70), pass the
# images through fixFaceDataset, and flatten each to one feature row.
face_test_amount = 150
face_test_labels = loadLabelsFile("./data/facedata/facedatatestlabels", face_test_amount)
face_test_items = fixFaceDataset(loadDataFile("./data/facedata/facedatatest", face_test_amount,60,70))
face_test_features= np.array([np.array(item.getPixels()).flatten() for item in face_test_items])

# Naive Bayes (NB)

## Digit Dataset

In [21]:
# Create a Gaussian Naive Bayes classifier with default settings
digit_NB_model = GaussianNB()

# Fit it on the flattened digit training images and their labels
digit_NB_model.fit(digit_training_features, digit_training_labels)

In [22]:
# digit_training_prediction = getModelPredictions(digit_NB_model, digit_training_features, digit_training_labels, "Training", "Naive Bayes")[0]

# digit_validation_prediction = getModelPredictions(digit_NB_model, digit_validation_features, 
#                                                 digit_validation_labels, "Validation", "Naive Bayes")[0]

# digit_test_prediction = getModelPredictions(digit_NB_model, digit_test_features, digit_test_labels, "Test", "Naive Bayes")[0]

In [23]:
# Report Naive Bayes accuracy on the three digit splits. The final call is
# the cell's last expression, so its return value is echoed as the output.
predictModel(digit_NB_model, digit_training_features, digit_training_labels, "Training", "Naive Bayes")
predictModel(digit_NB_model, digit_validation_features, digit_validation_labels, "Validation", "Naive Bayes")
predictModel(digit_NB_model, digit_test_features, digit_test_labels, "Test", "Naive Bayes")

Accuracy of the Naive Bayes on the Training dataset is 60.50 %
Accuracy of the Naive Bayes on the Validation dataset is 55.60 %
Accuracy of the Naive Bayes on the Test dataset is 50.90 %


50.9

## Face Dataset

In [24]:
# Create a Gaussian Naive Bayes classifier with default settings
face_NB_model = GaussianNB()

# Fit it on the flattened face training images and their labels
face_NB_model.fit(face_training_features, face_training_labels)

In [25]:
# face_training_prediction = getModelPredictions(face_NB_model, face_training_features, face_training_labels,
#                                                "Training", "Naive Bayes")
# face_validation_prediction = getModelPredictions(face_NB_model, face_validation_features, face_validation_labels,
#                                                  "Validation", "Naive Bayes")
# face_test_prediction = getModelPredictions(face_NB_model, face_test_features, face_test_labels,
#                                            "Test", "Naive Bayes")

In [26]:
# Report Naive Bayes accuracy on the three face splits. The final call is
# the cell's last expression, so its return value is echoed as the output.
predictModel(face_NB_model, face_training_features, face_training_labels, "Training", "Naive Bayes")
predictModel(face_NB_model, face_validation_features, face_validation_labels, "Validation", "Naive Bayes")
predictModel(face_NB_model, face_test_features, face_test_labels, "Test", "Naive Bayes")

Accuracy of the Naive Bayes on the Training dataset is 94.01 %
Accuracy of the Naive Bayes on the Validation dataset is 84.72 %
Accuracy of the Naive Bayes on the Test dataset is 88.00 %


88.0

# KNN (K Nearest Neighbors)

In [27]:
def predictKNN(k, model, data, labels, dataName):
    """Score a KNN `model` on (data, labels), print the result, return it.

    `k` and `dataName` are only used in the printed message. The returned
    accuracy is model.score(data, labels) as a percentage.
    """
    score = model.score(data, labels)
    accuracy = score * 100

    headline = "Accuracy of KNN with k={} on the {}".format(k, dataName)
    detail = "dataset is {0:.2f} %".format(accuracy)
    print(headline, detail)

    return accuracy

In [28]:
def getPredictionsKNN(k, model, data, labels, dataName):
    """Predict every sample with a KNN `model` and report the accuracy.

    Parameters:
        k:        neighbour count, used only in the printed message.
        model:    object with a predict(list_of_samples) method.
        data:     indexable collection of samples.
        labels:   expected label for each sample, same order as `data`.
        dataName: dataset name used in the printed message.

    Returns:
        (prediction, accuracy): a 1-D NumPy array of the predicted labels in
        input order, and the percentage of predictions equal to the labels.

    Raises:
        ZeroDivisionError: if `data` is empty.
    """
    # Gather predictions in a plain list; np.append inside the loop copied
    # the whole array on every iteration (quadratic in len(data)).
    predicted_labels = [model.predict([data[i]])[0] for i in range(len(data))]

    correct_prediction = sum(
        1 for i, predicted in enumerate(predicted_labels) if predicted == labels[i]
    )

    # Appending to an empty float array keeps the dtype promotion of the
    # former incremental construction.
    prediction = np.append(np.array([]), predicted_labels)

    accuracy = correct_prediction*100/len(data)

    print ("Accuracy of KNN with k={} on the {}".format(k, dataName),
           "dataset is {0:.2f} %".format(accuracy))

    return prediction, accuracy

## Digit dataset 

In [29]:
# highestfull_k = 2
# highestfull_accuarcy = 0

# highest_k = 2
# highest_accuarcy = 0
# for k in range(2, digit_training_amount):
#     # Create a KNN Classifier
#     digit_KNN_model = KNeighborsClassifier(n_neighbors=k)

#     # Train the model using the training sets
#     digit_KNN_model.fit(digit_training_features, digit_training_labels)

#     acc1 = predictKNN(k, digit_KNN_model, digit_training_features, digit_training_labels, "Training")
#     acc2 = predictKNN(k, digit_KNN_model, digit_validation_features, digit_validation_labels, "Validation")
#     acc3 = predictKNN(k, digit_KNN_model, digit_test_features, digit_test_labels, "Test")
    
#     if (highestfull_accuarcy < acc1+acc2+acc3):
#         highestfull_accuarcy = acc1+acc2+acc3
#         highestfull_k = k
    
#     if (highest_accuarcy < acc2+acc3):
#         highest_accuarcy = acc2 + acc3
#         highest_k = k
    
#     print ("k={} with highest accuarcy of {}/200".format(highest_k, highest_accuarcy))
#     print ("k={} with Full accuarcy of {}/300".format(highestfull_k, highestfull_accuarcy))

#     print("\n \n")

In [30]:
# Neighbour count for the digit KNN model. `k` is a notebook-global that the
# evaluation cell also passes to predictKNN for its report.
k = 3
# Create a KNN Classifier
digit_KNN_model = KNeighborsClassifier(n_neighbors=k)

# Train the model using the training sets
digit_KNN_model.fit(digit_training_features, digit_training_labels)

In [31]:
# digit_training_prediction = getPredictionsKNN(k, digit_KNN_model, digit_training_features, digit_training_labels, "Training")[0]
# digit_validation_prediction = getPredictionsKNN(k, digit_KNN_model, digit_validation_features, digit_validation_labels, "Validation")[0]
# digit_test_prediction = getPredictionsKNN(k, digit_KNN_model, digit_test_features, digit_test_labels, "Test")[0]

In [32]:
# Report KNN accuracy on the three digit splits, labelled with the current
# global `k`. The last call's return value is echoed as the cell output.
predictKNN(k, digit_KNN_model, digit_training_features, digit_training_labels, "Training")
predictKNN(k, digit_KNN_model, digit_validation_features, digit_validation_labels, "Validation")
predictKNN(k, digit_KNN_model, digit_test_features, digit_test_labels, "Test")

Accuracy of KNN with k=3 on the Training dataset is 97.52 %
Accuracy of KNN with k=3 on the Validation dataset is 91.40 %
Accuracy of KNN with k=3 on the Test dataset is 89.80 %


89.8

## Face dataset

In [33]:
# highestfull_k = 2
# highestfull_accuarcy = 0

# highest_k = 2
# highest_accuarcy = 0
# for k in range(2,face_training_amount):

#     # Create a KNN Classifier
#     face_KNN_model = KNeighborsClassifier(n_neighbors=k)

#     # Train the model using the training sets
#     face_KNN_model.fit(face_training_features, face_training_labels)

    
#     acc1 = predictKNN(k, face_KNN_model, face_training_features, face_training_labels, "Training")
#     acc2 = predictKNN(k, face_KNN_model, face_validation_features, face_validation_labels, "Validation")
#     acc3 = predictKNN(k, face_KNN_model, face_test_features, face_test_labels, "Test")
    
#     if (highestfull_accuarcy < acc1+acc2+acc3):
#         highestfull_accuarcy = acc1+acc2+acc3
#         highestfull_k = k
    
#     if (highest_accuarcy < acc2+acc3):
#         highest_accuarcy = acc2 + acc3
#         highest_k = k
    
#     print ("k={} with highest accuarcy of {}/200".format(highest_k, highest_accuarcy))
#     print ("k={} with Full accuarcy of {}/300".format(highestfull_k, highestfull_accuarcy))

#     print("\n \n")

In [34]:
# Neighbour count for the face KNN model. This rebinds the global `k` that
# the digit KNN cells used earlier.
k = 109

# Create a KNN Classifier
face_KNN_model = KNeighborsClassifier(n_neighbors=k)

# Train the model using the training sets
face_KNN_model.fit(face_training_features, face_training_labels)

In [35]:
# face_training_prediction = getPredictionsKNN(k, face_KNN_model, face_training_features, face_training_labels, "Training")[0]
# face_validation_prediction = getPredictionsKNN(k, face_KNN_model, face_validation_features, face_validation_labels, "Validation")[0]
# face_test_prediction = getPredictionsKNN(k, face_KNN_model, face_test_features, face_test_labels, "Test")[0]

In [36]:
# Report KNN accuracy on the three face splits, labelled with the current
# global `k`. The last call's return value is echoed as the cell output.
predictKNN(k, face_KNN_model, face_training_features, face_training_labels, "Training")
predictKNN(k, face_KNN_model, face_validation_features, face_validation_labels, "Validation")
predictKNN(k, face_KNN_model, face_test_features, face_test_labels, "Test")

Accuracy of KNN with k=109 on the Training dataset is 84.92 %
Accuracy of KNN with k=109 on the Validation dataset is 81.40 %
Accuracy of KNN with k=109 on the Test dataset is 82.67 %


82.66666666666667

# Multilayer perceptron (MLP)

In [37]:
def predictMLP(model, data, labels, dataName):
    """Score an MLP `model` on (data, labels), print the result, return it.

    `dataName` is only used in the printed message. The returned accuracy is
    model.score(data, labels) as a percentage.
    """
    score = model.score(data, labels)
    accuracy = score * 100

    headline = "Accuracy of MLP on the {}".format(dataName)
    detail = "dataset is {0:.2f} %".format(accuracy)
    print(headline, detail)

    return accuracy

In [38]:
def getPredictionsMLP(model, data, labels, dataName):
    """Predict every sample with an MLP `model` and report the accuracy.

    Parameters:
        model:    object with a predict(list_of_samples) method.
        data:     indexable collection of samples.
        labels:   expected label for each sample, same order as `data`.
        dataName: dataset name used in the printed message.

    Returns:
        (prediction, accuracy): a 1-D NumPy array of the predicted labels in
        input order, and the percentage of predictions equal to the labels.

    Raises:
        ZeroDivisionError: if `data` is empty.
    """
    # Gather predictions in a plain list; np.append inside the loop copied
    # the whole array on every iteration (quadratic in len(data)).
    predicted_labels = [model.predict([data[i]])[0] for i in range(len(data))]

    correct_prediction = sum(
        1 for i, predicted in enumerate(predicted_labels) if predicted == labels[i]
    )

    # Appending to an empty float array keeps the dtype promotion of the
    # former incremental construction.
    prediction = np.append(np.array([]), predicted_labels)

    accuracy = correct_prediction*100/len(data)

    print ("Accuracy of MLP on the {}".format(dataName),
           "dataset is {0:.2f} %".format(accuracy))

    return  prediction, accuracy

## Digit dataset

In [39]:
# Create the MLP classifier: one hidden layer of 100 units trained with SGD
# for at most 40 iterations; random_state is fixed, and verbose=10 prints
# the per-iteration loss seen in the cell output.
digit_MLP_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=40, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=0,
                    learning_rate_init=0.1275)

# Train the model using the training sets
digit_MLP_model.fit(digit_training_features, digit_training_labels)

Iteration 1, loss = 0.74036384
Iteration 2, loss = 0.21768926
Iteration 3, loss = 0.13165975
Iteration 4, loss = 0.08354227
Iteration 5, loss = 0.05691779
Iteration 6, loss = 0.03797812
Iteration 7, loss = 0.02407969
Iteration 8, loss = 0.01730209
Iteration 9, loss = 0.01217340
Iteration 10, loss = 0.00826997
Iteration 11, loss = 0.00699410
Iteration 12, loss = 0.00552887
Iteration 13, loss = 0.00472022
Iteration 14, loss = 0.00413585
Iteration 15, loss = 0.00366910
Iteration 16, loss = 0.00334101
Iteration 17, loss = 0.00298408
Iteration 18, loss = 0.00276163
Iteration 19, loss = 0.00256634
Iteration 20, loss = 0.00238440
Iteration 21, loss = 0.00227683
Iteration 22, loss = 0.00207258
Iteration 23, loss = 0.00197036
Iteration 24, loss = 0.00187003
Iteration 25, loss = 0.00177867
Iteration 26, loss = 0.00167135
Iteration 27, loss = 0.00161868
Iteration 28, loss = 0.00152885
Iteration 29, loss = 0.00146004
Iteration 30, loss = 0.00140760
Iteration 31, loss = 0.00135502
Iteration 32, los

In [40]:
# digit_training_prediction = getPredictionsMLP(digit_MLP_model, digit_training_features, digit_training_labels, "Training")[0]
# digit_validation_prediction = getPredictionsMLP(digit_MLP_model, digit_validation_features, digit_validation_labels, "Validation")[0]
# digit_test_prediction = getPredictionsMLP(digit_MLP_model, digit_test_features, digit_test_labels, "Test")[0]

In [41]:
# Report MLP accuracy on the three digit splits. The last call's return
# value is echoed as the cell output.
predictMLP(digit_MLP_model, digit_training_features, digit_training_labels, "Training")
predictMLP(digit_MLP_model, digit_validation_features, digit_validation_labels, "Validation")
predictMLP(digit_MLP_model, digit_test_features, digit_test_labels, "Test")

Accuracy of MLP on the Training dataset is 100.00 %
Accuracy of MLP on the Validation dataset is 93.80 %
Accuracy of MLP on the Test dataset is 92.60 %


92.60000000000001

## Face dataset

In [42]:
# Create the MLP classifier: one hidden layer of 100 units trained with SGD
# for at most 25 iterations; random_state is fixed, and verbose=10 prints
# the per-iteration loss seen in the cell output.
face_MLP_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=25, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=0.071)

# Train the model using the training sets
face_MLP_model.fit(face_training_features, face_training_labels)

Iteration 1, loss = 0.95257749
Iteration 2, loss = 0.56092845
Iteration 3, loss = 0.42020622
Iteration 4, loss = 0.19147887
Iteration 5, loss = 0.06426350
Iteration 6, loss = 0.01822114
Iteration 7, loss = 0.00507939
Iteration 8, loss = 0.00185395
Iteration 9, loss = 0.00085409
Iteration 10, loss = 0.00051271
Iteration 11, loss = 0.00036532
Iteration 12, loss = 0.00028282
Iteration 13, loss = 0.00023805
Iteration 14, loss = 0.00020952
Iteration 15, loss = 0.00018741
Iteration 16, loss = 0.00017552
Iteration 17, loss = 0.00016481
Iteration 18, loss = 0.00015684
Iteration 19, loss = 0.00015166
Iteration 20, loss = 0.00014738
Iteration 21, loss = 0.00014401
Iteration 22, loss = 0.00014136
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [43]:
# face_training_prediction = getPredictionsMLP(face_MLP_model, face_training_features, face_training_labels, "Training")[0]
# face_validation_prediction = getPredictionsMLP(face_MLP_model, face_validation_features, face_validation_labels, "Validation")[0]
# face_test_prediction = getPredictionsMLP(face_MLP_model, face_test_features, face_test_labels, "Test")[0]

In [44]:
# Report MLP accuracy on the three face splits. The last call's return value
# is echoed as the cell output.
predictMLP(face_MLP_model, face_training_features, face_training_labels, "Training")
predictMLP(face_MLP_model, face_validation_features, face_validation_labels, "Validation")
predictMLP(face_MLP_model, face_test_features, face_test_labels, "Test")

Accuracy of MLP on the Training dataset is 100.00 %
Accuracy of MLP on the Validation dataset is 90.03 %
Accuracy of MLP on the Test dataset is 92.00 %


92.0

# Support Vector Machine (SVM)

In [45]:
def predictSVM(model, data, labels, dataName):
    """Score an SVM `model` on (data, labels), print the result, return it.

    `dataName` is only used in the printed message. The returned accuracy is
    model.score(data, labels) as a percentage.
    """
    score = model.score(data, labels)
    accuracy = score * 100

    headline = "Accuracy of SVM on the {}".format(dataName)
    detail = "dataset is {0:.2f} %".format(accuracy)
    print(headline, detail)

    return accuracy

In [46]:
def getPredictionsSVM(model, data, labels, dataName):
    """Predict every sample with an SVM `model` and report the accuracy.

    Parameters:
        model:    object with a predict(list_of_samples) method.
        data:     indexable collection of samples.
        labels:   expected label for each sample, same order as `data`.
        dataName: dataset name used in the printed message.

    Returns:
        (prediction, accuracy): a 1-D NumPy array of the predicted labels in
        input order, and the percentage of predictions equal to the labels.

    Raises:
        ZeroDivisionError: if `data` is empty.
    """
    # Gather predictions in a plain list; np.append inside the loop copied
    # the whole array on every iteration (quadratic in len(data)).
    predicted_labels = [model.predict([data[i]])[0] for i in range(len(data))]

    correct_prediction = sum(
        1 for i, predicted in enumerate(predicted_labels) if predicted == labels[i]
    )

    # Appending to an empty float array keeps the dtype promotion of the
    # former incremental construction.
    prediction = np.append(np.array([]), predicted_labels)

    accuracy = correct_prediction*100/len(data)

    print ("Accuracy of SVM on the {}".format(dataName),
           "dataset is {0:.2f} %".format(accuracy))

    return  prediction, accuracy

## Digit dataset

In [47]:
# Create the SVM classifier with an RBF kernel and gamma='auto'
digit_SVM_model = SVC(kernel='rbf', gamma='auto')

# Train the model using the training sets
digit_SVM_model.fit(digit_training_features, digit_training_labels)

In [48]:
# digit_training_prediction = getPredictionsSVM(digit_SVM_model, digit_training_features, digit_training_labels, "Training")[0]
# digit_validation_prediction = getPredictionsSVM(digit_SVM_model, digit_validation_features, digit_validation_labels, "Validation")[0]
# digit_test_prediction = getPredictionsSVM(digit_SVM_model, digit_test_features, digit_test_labels, "Test")[0]

In [49]:
# Report SVM accuracy on the three digit splits. The last call's return
# value is echoed as the cell output.
predictSVM(digit_SVM_model, digit_training_features, digit_training_labels, "Training")
predictSVM(digit_SVM_model, digit_validation_features, digit_validation_labels, "Validation")
predictSVM(digit_SVM_model, digit_test_features, digit_test_labels, "Test")

Accuracy of SVM on the Training dataset is 96.40 %
Accuracy of SVM on the Validation dataset is 91.50 %
Accuracy of SVM on the Test dataset is 90.30 %


90.3

## Face dataset

In [50]:
# Create the SVM classifier with an RBF kernel and gamma='auto'
face_SVM_model = SVC(kernel='rbf', gamma='auto')

# Train the model using the training sets
face_SVM_model.fit(face_training_features, face_training_labels)

# Evaluate on the three face splits in the same cell as the fit.
predictSVM(face_SVM_model, face_training_features, face_training_labels, "Training")
predictSVM(face_SVM_model, face_validation_features, face_validation_labels, "Validation")
predictSVM(face_SVM_model, face_test_features, face_test_labels, "Test")

Accuracy of SVM on the Training dataset is 99.78 %
Accuracy of SVM on the Validation dataset is 91.69 %
Accuracy of SVM on the Test dataset is 92.00 %


92.0

In [51]:
# face_training_prediction = getPredictionsSVM(face_SVM_model, face_training_features, face_training_labels, "Training")[0]
# face_validation_prediction = getPredictionsSVM(face_SVM_model, face_validation_features, face_validation_labels, "Validation")[0]
# face_test_prediction = getPredictionsSVM(face_SVM_model, face_test_features, face_test_labels, "Test")[0]

In [52]:
# NOTE(review): repeats the three evaluation calls already made in the cell
# that fits face_SVM_model; the recorded outputs are identical.
predictSVM(face_SVM_model, face_training_features, face_training_labels, "Training")
predictSVM(face_SVM_model, face_validation_features, face_validation_labels, "Validation")
predictSVM(face_SVM_model, face_test_features, face_test_labels, "Test")

Accuracy of SVM on the Training dataset is 99.78 %
Accuracy of SVM on the Validation dataset is 91.69 %
Accuracy of SVM on the Test dataset is 92.00 %


92.0

# Decision Tree (DT)

## Digit dataset

In [83]:
# Create the Decision Tree classifier. random_state is pinned so that the
# fitted tree, and therefore the accuracies below, are the same on every run.
digit_DT_model = DecisionTreeClassifier(random_state=0)

# Train the model using the training sets
digit_DT_model.fit(digit_training_features, digit_training_labels)

# Evaluate on the three digit splits; the last call's return value is echoed
# as the cell output.
predictModel(digit_DT_model, digit_training_features, digit_training_labels, "Training", "Decision Tree")
predictModel(digit_DT_model, digit_validation_features, digit_validation_labels, "Validation", "Decision Tree")
predictModel(digit_DT_model, digit_test_features, digit_test_labels, "Test", "Decision Tree")

Accuracy of the Decision Tree on the Training dataset is 100.00 %
Accuracy of the Decision Tree on the Validation dataset is 74.80 %
Accuracy of the Decision Tree on the Test dataset is 73.00 %


73.0

In [84]:
# params =  {
#     'max_depth': list(range(1, 51)) + ['None'],
#     'max_features': list(range(1, 51)) + ['None']
# }
# n_jobs= os.cpu_count()
# grid = GridSearchCV(estimator=DecisionTreeClassifier(),
#                     param_grid=params,
#                     cv=10,
#                     n_jobs= n_jobs,
#                     verbose=2)

In [85]:
# grid.fit(digit_training_features, digit_training_labels)

In [86]:
# grid.best_score_, grid.best_params_

In [58]:
# digit_training_prediction = getModelPredictions(digit_DT_model, digit_training_features, digit_training_labels, "Training", "Decision Tree")[0]
# digit_validation_prediction = getModelPredictions(digit_DT_model, digit_validation_features, digit_validation_labels, "Validation", "Decision Tree")[0]
# digit_test_prediction = getModelPredictions(digit_DT_model, digit_test_features, digit_test_labels, "Test", "Decision Tree")[0]

In [59]:
# NOTE(review): this cell's execution counter (59) predates the cell that
# fits digit_DT_model (83), and the output recorded below disagrees with
# that cell's output - re-run to refresh it.
predictModel(digit_DT_model, digit_training_features, digit_training_labels, "Training", "Decision Tree")
predictModel(digit_DT_model, digit_validation_features, digit_validation_labels, "Validation", "Decision Tree")
predictModel(digit_DT_model, digit_test_features, digit_test_labels, "Test", "Decision Tree")

Accuracy of the Decision Tree on the Training dataset is 70.20 %
Accuracy of the Decision Tree on the Validation dataset is 64.70 %
Accuracy of the Decision Tree on the Test dataset is 60.40 %


60.4

## Face dataset

In [87]:
# Create the Decision Tree classifier. random_state is pinned so that the
# fitted tree, and therefore the reported accuracies, are the same on every
# run.
face_DT_model = DecisionTreeClassifier(random_state=0)

# Train the model using the training sets
face_DT_model.fit(face_training_features, face_training_labels)

In [61]:
# face_training_prediction = getModelPredictions(face_DT_model, face_training_features, face_training_labels, "Training", "Decision Tree")[0]
# face_validation_prediction = getModelPredictions(face_DT_model, face_validation_features, face_validation_labels, "Validation", "Decision Tree")[0]
# face_test_prediction = getModelPredictions(face_DT_model, face_test_features, face_test_labels, "Test", "Decision Tree")[0]

In [88]:
# Report Decision Tree accuracy on the three face splits. The last call's
# return value is echoed as the cell output.
predictModel(face_DT_model, face_training_features, face_training_labels, "Training", "Decision Tree")
predictModel(face_DT_model, face_validation_features, face_validation_labels, "Validation", "Decision Tree")
predictModel(face_DT_model, face_test_features, face_test_labels, "Test", "Decision Tree")

Accuracy of the Decision Tree on the Training dataset is 100.00 %
Accuracy of the Decision Tree on the Validation dataset is 75.42 %
Accuracy of the Decision Tree on the Test dataset is 70.00 %


70.0

In [63]:
# n = 4
# print(face_training_items[n])
# print(len(face_training_items))

# #Predict Output
# predicted= face_NB_model.predict([face_training_features[n]])[0]
# print ("Predicted Value:", predicted)

In [64]:
# n = 4
# print(face_validation_items[n])
# print(len(face_validation_items))

# #Predict Output
# predicted= face_NB_model.predict([face_validation_features[n]])[0]
# print ("Predicted Value:", predicted)

In [65]:
# n = 16
# print(face_test_items[n])
# print(len(face_test_items))

# #Predict Output
# predicted= face_NB_model.predict([face_test_features[n]])[0]
# print ("Predicted Value:", predicted)

In [66]:
# n = 2
# print(digit_training_items[n])
# print(len(digit_training_items))

# #Predict Output
# predicted= digit_NB_model.predict([digit_training_features[n]])[0]
# print ("Predicted Value:", predicted)

In [67]:
# n = 1
# print(digit_validation_items[n])
# print(len(digit_validation_items))

# #Predict Output
# predicted= digit_NB_model.predict([digit_validation_features[n]])[0]
# print ("Predicted Value:", predicted)

In [68]:
# n = 4
# print(digit_test_items[n])
# print(len(digit_test_items))

# # Predict Output
# predicted= digit_NB_model.predict([digit_test_features[n]])[0]
# print ("Predicted Value:", predicted)

In [69]:
# pca_reduction = PCA(n_components=1)
# pca_reduction.fit(training_features)
# training_features_pca = pca_reduction.transform(training_features)
# new_training_features = pca_reduction.inverse_transform(training_features_pca)
# training_prediction = predictNaiveBayes(new_training_features, training_labels, "Training")

In [70]:
# def _test():
#     doctest.testmod() # Test the interactive sessions in function comments
    
#     n = 451 # face data limit
#     items = fixFaceDataset(loadDataFile("./data/facedata/facedatatrain", n,60,70))
#     labels = loadLabelsFile("./data/facedata/facedatatrainlabels", n)

# #     n = 5000 # digit data limit
# #     items = loadDataFile("./data/digitdata/trainingimages", n,28,28)
# #     labels = loadLabelsFile("./data/digitdata/traininglabels", n)

    
# #     for i in range(1):
# #         print (items[i].getAsciiString())
# #         print (items[i])
# #         print (items[i].height)
# #         print (items[i].width)
# #         print (dir(items[i]))
# #         print ((items[i].getPixels()))
# #         print (len(items[i].getPixels()))

    
# _test()

In [71]:
# def pca_feature(feature):
# #     print(feature)
#     pca_reduction = PCA(n_components=1)
#     pca_reduction.fit(feature)
#     feature_pca = pca_reduction.transform(feature)
#     print(feature_pca.shape)
# #     new_feature = pca_reduction.inverse_transform(feature_pca)
# #     print(new_feature)
#     return feature_pca