In [96]:
import sys
import math
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import LogNorm
import collections

# Laplace smoothing constant
k = 0.13 # 0.13 is best k I've found for part1.1
sys.setrecursionlimit(1500)

In [97]:
# convert input txt into an 140000 * 28 array
def inputDigit(name="digitdata/trainingimages"):
    with open(name) as digitTxt:
        image = [list(line)[0:28] for line in digitTxt]
        rows = len(image)
    return image

In [98]:
digitImage = inputDigit()
print(len(digitImage))
print(len(digitImage[139999]))

140000
28


In [99]:
def readRough(name="digitdata/traininglabels"):
    with open(name) as label:
        labels = []
        for line in label:
            labels.append(line[0])
    return labels

In [100]:
label = readRough()
print (len(label))
class_ = sorted(collections.Counter(label).items())
with open('labelstats.txt', 'wb') as fp:
    pickle.dump(class_, fp)

5000


In [101]:
def is_foreground(image, row, col):
    if(col%27 == 0 or col%28 == 27 or row%27 == 0 or row%28 == 27):
        return False
    else:
        if image[row-1][col] != ' ' and image[row][col] != ' ' and image[row+1][col] != ' ':
            return True
        if image[row][col-1] != ' ' and image[row][col] != ' ' and image[row][col+1] != ' ':
            return True
        if image[row-1][col-1] != ' ' and image[row][col] != ' ' and image[row+1][col+1] != ' ':
            return True
        if image[row+1][col-1] != ' ' and image[row][col] != ' ' and image[row-1][col+1] != ' ':
            #print(row+1)
            return True

        return False

In [102]:
# load the number of each digit in the training set
with open ('labelstats.txt', 'rb') as fp:
    class_ = pickle.load(fp)
    
# Training
# P(Fij = 1 | class) = (# of times pixel (i,j) has value 1 in training examples from this class) / (Total # of training examples from this class)
# P(Fij = 0 | class) = (# of times pixel (i,j) has value 0 in training examples from this class) / (Total # of training examples from this class)
digitImage = inputDigit()
training0=np.zeros(shape=(10, 28, 28))
training1=np.zeros(shape=(10, 28, 28))

count = 0
for i in range(5000):
    for row in range(28):
        for col in range(28):
            if is_foreground(digitImage, i*28 + row, col):
                training1[int(label[i])][row][col] += 1
                training0[int(label[i])][row][col] += 0
            else:
                training1[int(label[i])][row][col] += 0
                training0[int(label[i])][row][col] += 1

for i in range(10):
    training1[i] = (training1[i] + k) / (class_[i][1] + k * 2)
    training0[i] = (training0[i] + k) / (class_[i][1] + k * 2)
    
#print(training0[0][0])
#print(training1[0][0])




In [103]:
# MAP classification
# posterior probability = log P(class) + log P(f1,1 | class) + log P(f1,2 | class) + ... + log P(f28,28 | class)

test_rough = inputDigit(name = "digitdata/testimages")
answer = np.zeros(1000)

for i in range(1000):
    test_image = np.zeros(shape=(28,28))
    for row in range(28):
        for col in range(28):
            if is_foreground(test_rough, i*28 + row, col):
                test_image[row][col] = 1
            else:
                test_image[row][col] = 0
             
    posteriori = np.zeros(10)
    for class_num in range(10):
        posteriori[class_num] = math.log(class_[class_num][1])
        for row in range(28):
            for col in range(28):
                if test_image[row][col] == 0:
                    posteriori[class_num] += math.log(training0[class_num][row][col])
                else:
                    posteriori[class_num] += math.log(training1[class_num][row][col] )   
    answer[i] = np.argmax(posteriori)
    #print(posteriori)
    
#print(answer)

In [104]:
testlabels = readRough("digitdata/testlabels")
testclass_ = sorted(collections.Counter(testlabels).items())

In [105]:
def confusion_matrix():
    conf_matrix = np.zeros(shape=(10,10))
    for i in range(1000):
        conf_matrix[int(testlabels[i])][int(answer[i])] += 1
            
    for i in range(10):
        for j in range(10):
            conf_matrix[i][j] /= testclass_[i][1]
    
    return conf_matrix

In [106]:
# print confusion matrix
conf_matrix = confusion_matrix()
conf_matrix = np.around(conf_matrix, 2)
for row in conf_matrix:
    print(row)
overall_accuracy = 0
for i in range(10):
    overall_accuracy += conf_matrix[i][i] * testclass_[i][1]
print("overall_accuracy = " + str(overall_accuracy/1000))

[ 0.87  0.    0.01  0.    0.    0.03  0.04  0.    0.04  0.  ]
[ 0.    0.95  0.    0.    0.    0.02  0.01  0.    0.02  0.  ]
[ 0.02  0.04  0.74  0.07  0.01  0.    0.06  0.01  0.04  0.02]
[ 0.    0.02  0.    0.77  0.    0.07  0.02  0.05  0.03  0.04]
[ 0.    0.01  0.    0.    0.71  0.01  0.03  0.    0.02  0.22]
[ 0.01  0.01  0.    0.1   0.04  0.73  0.02  0.01  0.02  0.05]
[ 0.01  0.07  0.01  0.    0.04  0.03  0.81  0.    0.02  0.  ]
[ 0.    0.07  0.03  0.    0.03  0.    0.    0.75  0.03  0.1 ]
[ 0.02  0.03  0.02  0.1   0.02  0.06  0.01  0.02  0.65  0.08]
[ 0.01  0.01  0.    0.03  0.08  0.01  0.    0.02  0.01  0.83]
overall_accuracy = 0.78041
