# Imports

In [1]:
%matplotlib inline

import util
import numpy as np
import zipfile
import os
import doctest
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Constants

In [2]:
DATUM_WIDTH = 0 # in pixels
DATUM_HEIGHT = 0 # in pixels

# Module Classes

In [3]:
# A datum is a pixel-level encoding of digits or face/non-face edge maps.

# Digits are from the MNIST dataset and face images are from the 
# easy-faces and background categories of the Caltech 101 dataset.


# Each digit is 28x28 pixels, and each face/non-face image is 60x74 
# pixels, each pixel can take the following values:
# 0: no edge (blank)
# 1: gray pixel (+) [used for digits only]
# 2: edge [for face] or black pixel [for digit] (#)

# Pixel data is stored in the 2-dimensional array pixels, which
# maps to pixels on a plane according to standard euclidean axes
# with the first dimension denoting the horizontal and the second
# the vertical coordinate:

# 28 # # # #      #  #
# 27 # # # #      #  #
#  .
#  .
#  .
#  3 # # + #      #  #
#  2 # # # #      #  #
#  1 # # # #      #  #
#  0 # # # #      #  #
#    0 1 2 3 ... 27 28

# For example, the + in the above diagram is stored in pixels[2][3], or
# more generally pixels[column][row].

# The contents of the representation can be accessed directly
# via the getPixel and getPixels methods.

In [4]:
class Datum:
    # Create a new datum from file input (standard MNIST encoding).
    def __init__(self, data, width, height):
        DATUM_HEIGHT = height
        DATUM_WIDTH= width
        self.height = DATUM_HEIGHT
        self.width = DATUM_WIDTH
        if data == None:
              data = [[' ' for i in range(DATUM_WIDTH)] for j in range(DATUM_HEIGHT)] 
        self.pixels = util.arrayInvert(convertToInteger(data)) 

    # Returns the value of the pixel at column, row as 0, or 1.
    def getPixel(self, column, row):
        return self.pixels[column][row]

    # Returns all pixels as a list of lists.
    def getPixels(self):
        return self.pixels    

    # Renders the data item as an ascii image.
    def getAsciiString(self):
        rows = []
        data = util.arrayInvert(self.pixels)
        for row in data:
            ascii = map(asciiGrayscaleConversionFunction, row)
            rows.append( "".join(ascii))
        return "\n".join(rows)

    def __str__(self):
        return self.getAsciiString()

# Data processing, cleanup and display functions

In [5]:
#   Reads n data images from a file and returns a list of Datum objects.
#   (Return less then n items if the end of file is encountered).
def loadDataFile(filename, n,width,height): 
    DATUM_WIDTH =width
    DATUM_HEIGHT=height
    fin = readlines(filename)
    fin.reverse()
    items = []
    for i in range(n):
        data = []
        for j in range(height):
            data.append(list(fin.pop()))
        if len(data[0]) < DATUM_WIDTH-1:
            # we encountered end of file...
            print ("Truncating at %d examples (maximum)" % i)
            break
        items.append(Datum(data, DATUM_WIDTH, DATUM_HEIGHT))
    return items

In [6]:
# Opens a file or reads it from the zip archive data.zip
def readlines(filename):
      if(os.path.exists(filename)): 
        return [l[:-1] for l in open(filename).readlines()]
      else: 
        print(os.getcwd())
        z = zipfile.ZipFile('./data.zip')
        liste= z.read(filename).decode("utf-8").split("\n")
        print(len(liste))
        return liste

In [7]:
#   Reads n labels from a file and returns a list of integers.
def loadLabelsFile(filename, n):
    fin = readlines(filename)
    labels = []
    for line in fin[:min(n, len(fin))]:
        if line == '':
            break
        labels.append(int(line))
    return labels

In [8]:
#   Helper function for display purposes.
def asciiGrayscaleConversionFunction(value):
    if(value == 0):
        return ' '
    elif(value == 1):
        return '+'
    elif(value == 2):
        return '#'    

In [9]:
#   Helper function for file reading.
def IntegerConversionFunction(character):
    if(character == ' '):
        return 0
    elif(character == '+'):
        return 1
    elif(character == '#'):
        return 2    

In [10]:
#   Helper function for file reading.
def convertToInteger(data):
    if type(data) != type([]):
        return IntegerConversionFunction(data)
    else:
        return list(map(convertToInteger, data))

# Naive Bayes Classifier

In [11]:
def _test():
    doctest.testmod() # Test the interactive sessions in function comments
    
#     n = 451 # face data limit
#     items = loadDataFile("./data/facedata/facedatatrain", n,60,70)
#     labels = loadLabelsFile("./data/facedata/facedatatrainlabels", n)

    n = 5000 # digit data limit
    items = loadDataFile("./data/digitdata/trainingimages", n,28,28)
    labels = loadLabelsFile("./data/digitdata/traininglabels", n)

    for i in range(1):
#         print (items[i].getAsciiString())
        print (items[i])
        print (items[i].height)
        print (items[i].width)
        print (dir(items[i]))
        print (items[i].getPixels())
        print (len(items[i].getPixels()))
    
_test()

                            
                            
                            
                            
                            
                +++++##+    
        +++++######+###+    
       +##########+++++     
        #######+##          
        +++###  ++          
           +#+              
           +#+              
            +#+             
            +##++           
             +###++         
              ++##++        
                +##+        
                 ###+       
              +++###        
            ++#####+        
          ++######+         
        ++######+           
       +######+             
    ++######+               
    +####++                 
                            
                            
                            
28
28
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', 

In [12]:
# def pca_feature(feature):
# #     print(feature)
#     pca_reduction = PCA(n_components=1)
#     pca_reduction.fit(feature)
#     feature_pca = pca_reduction.transform(feature)
#     print(feature_pca.shape)
# #     new_feature = pca_reduction.inverse_transform(feature_pca)
# #     print(new_feature)
#     return feature_pca

In [13]:
training_amount = 5000
training_labels = loadLabelsFile("./data/digitdata/traininglabels", training_amount)
training_items = loadDataFile("./data/digitdata/trainingimages", training_amount,28,28)
training_features= np.array([np.array(item.getPixels()).flatten() for item in training_items])
# print (labels)

In [14]:
print(len(training_features))
print(len(training_labels))

5000
5000


In [15]:
print ((training_features[1]))
# print (training_features.shape[1])
# print (training_features[1])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 2 2 2 2 2 2 2 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 2 2 2 2 1 2 2 2 2 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 2 2 1 1 1 0 0 0 1 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 2 1 0 0
 0 0 0 0 1 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 2 1 0 0 0 0 0 0 0 1 2 2 2 0
 0 0 0 0 0 0 0 0 0 0 1 2 2 2 2 0 0 0 0 0 0 0 0 1 2 2 1 0 0 0 0 0 0 0 0 0 0
 1 2 2 2 1 1 1 0 0 0 0 0 0 0 1 2 2 1 0 0 0 0 0 0 0 0 0 1 2 2 2 1 0 0 0 0 0
 0 0 0 0 0 2 2 1 0 0 0 0 0 0 0 0 0 1 2 2 2 2 1 1 0 0 0 0 0 0 0 0 1 2 2 0 0
 0 0 0 0 0 0 0 0 1 2 2 2 2 2 1 0 0 0 0 0 0 0 1 2 2 1 0 0 0 0 0 0 0 0 0 0 2
 2 2 2 2 2 1 0 0 0 0 0 0 

In [16]:
#Create a Gaussian Classifier
# model = MultinomialNB()
model = GaussianNB()

# Train the model using the training sets
model.fit(training_features,training_labels)

## Validation Test

In [17]:
def predictNaiveBayes(data, labels, dataName):
    
    prediction = np.array([])
    correct_prediction = 0

    for i in range(len(data)):
        
        # Predict Output
        predicted= model.predict([data[i]])[0]
        np.append(prediction, predicted)
        
        if predicted == labels[i]:
            correct_prediction += 1
        
    print ("Accuracy of the Naive Bayes on the {} dataset is {} %".format(dataName, correct_prediction*100/len(data)))
    
    return prediction

In [18]:
validation_amount = 1000
validation_labels = loadLabelsFile("./data/digitdata/validationlabels", validation_amount)
validation_items = loadDataFile("./data/digitdata/validationimages", validation_amount, 28, 28)
validation_features= np.array([np.array(item.getPixels()).flatten() for item in validation_items])

In [19]:
test_amount = 1000
test_labels = loadLabelsFile("./data/digitdata/testlabels", validation_amount)
test_items = loadDataFile("./data/digitdata/testimages", validation_amount, 28, 28)
test_features= np.array([np.array(item.getPixels()).flatten() for item in validation_items])

In [20]:
training_prediction = predictNaiveBayes(training_features, training_labels, "Training")
validation_prediction = predictNaiveBayes(validation_features,validation_labels, "Validation")
test_prediction = predictNaiveBayes(test_features,test_labels, "Test")

Accuracy of the Naive Bayes on the Training dataset is 60.5 %
Accuracy of the Naive Bayes on the Validation dataset is 55.6 %
Accuracy of the Naive Bayes on the Test dataset is 9.3 %


In [21]:
n = 0
print(test_items[n])
print(len(test_items))

# Predict Output
predicted= model.predict([test_features[n]])[0]
print ("Predicted Value:", predicted)

                            
                            
                            
                            
                            
                            
                            
             ++###+         
             ######+        
            +######+        
            ##+++##+        
           +#+  +##+        
           +##++###+        
           +#######+        
           +#######+        
            +##+###         
              ++##+         
              +##+          
              ###+          
            +###+           
            +##+            
           +##+             
          +##+              
         +##+               
         ##+                
        +#+                 
        +#+                 
                            
1000
Predicted Value: 9


In [22]:
# n = 2
# print(training_items[n])
# print(len(training_items))

# #Predict Output
# predicted= model.predict([training_features[n]])[0]
# print ("Predicted Value:", predicted)

In [23]:
# n = 1
# print(validation_items[n])
# print(len(validation_items))

# #Predict Output
# predicted= model.predict([validation_features[n]])[0]
# print ("Predicted Value:", predicted)

In [24]:
# pca_reduction = PCA(n_components=1)
# pca_reduction.fit(training_features)
# training_features_pca = pca_reduction.transform(training_features)
# new_training_features = pca_reduction.inverse_transform(training_features_pca)
# training_prediction = predictNaiveBayes(new_training_features, training_labels, "Training")