In [293]:
import sys
import math
import pickle
import numpy as np
import collections

k = 0.3    # Laplace smoothing constant
n = 3       # number of pixel in one column in a feature
m = 3       # number of pixel in one row in a feature

sys.setrecursionlimit(1500)

In [294]:
def inputDigit(name="digitdata/trainingimages"):
    with open(name) as digitTxt:
        image = [list(line)[0:28] for line in digitTxt]
        rows = len(image)
    return image

In [295]:
def readRough(name="digitdata/traininglabels"):
    with open(name) as label:
        labels = []
        for line in label:
            labels.append(line[0])
    return labels


In [296]:
digitImage = inputDigit()
label = readRough()
#print ("len(label): " + str(len(label)))
class_ = sorted(collections.Counter(label).items())
#print (class_)
with open('labelstats.txt', 'wb') as fp:
    pickle.dump(class_, fp)

In [297]:
# load the number of each digit in the training set
with open ('labelstats.txt', 'rb') as fp:
    class_ = pickle.load(fp)

digitImage = inputDigit()

# training shape=(10 classes * (28-n+1)*(28-m+1) features * 2^nm * 2 feature values)
training=np.zeros(shape=(10, 28-n+1, 28-m+1, 2**(n*m), 2**(n*m)))

count = 0
for i in range(5000):                  # for each image
    for row in range(28-n+1):
        for col in range(28-m+1):      # for each feature
            
            feature_val0 = 0
            feature_val1 = 0
            for y in range(n):
                for x in range(m):      # for each pixel in feature (overlapping)
                    # calculate feature_val
                    if digitImage[i*28+row+y][col+x] == '#':
                        feature_val0 += 2**(m*y+x)
                    elif digitImage[i*28+row+y][col+x] == '+':
                        feature_val1 += 2**(m*y+x)
            training[int(label[i])][row][col][feature_val0][feature_val1] += 1

for i in range(10):
    training[i] = (training[i] + k) / (class_[i][1] + k * 3**(m*n))

#print(training[0][0].shape)
#print(training[0][0])



In [298]:
# MAP classification
# posterior probability = log P(class) + log P(f1,1 | class) + log P(f1,2 | class) + ... + log P(f28,28 | class)

test_rough = inputDigit(name = "digitdata/testimages")
answer = np.zeros(1000)


for i in range(1000):                  # for each image
    test_image = np.zeros(shape=(28-n+1, 28-m+1, 2))
    for row in range(28-n+1):
        for col in range(28-m+1):      # for each feature
            
            feature_val0 = 0
            feature_val1 = 0
            for y in range(n):
                for x in range(m):      # for each pixel in feature (overlapping)
                    # calculate feature_val
                    if test_rough[i*28+row+y][col+x] == '#':
                        feature_val0 += 2**(m*y+x)
                    elif test_rough[i*28+row+y][col+x] == '+':
                        feature_val1 += 2**(m*y+x)
                        
            test_image[row][col][0] = feature_val0
            test_image[row][col][1] = feature_val1
       
    posteriori = np.zeros(10)
    for class_num in range(10):
        posteriori[class_num] = math.log(class_[class_num][1])
        for row in range(28-n+1):
            for col in range(28-m+1):
                posteriori[class_num] += math.log(training[class_num][row][col][int(test_image[row][col][0])][int(test_image[row][col][1])])
    answer[i] = np.argmax(posteriori)
    #print(posteriori)
    
#print(answer)



In [299]:
testlabels = readRough("digitdata/testlabels")
testclass_ = sorted(collections.Counter(testlabels).items())

In [300]:
def confusion_matrix():
    conf_matrix = np.zeros(shape=(10,10))
    for i in range(1000):
        conf_matrix[int(testlabels[i])][int(answer[i])] += 1
            
    for i in range(10):
        for j in range(10):
            conf_matrix[i][j] /= testclass_[i][1]
    
    return conf_matrix

In [301]:
conf_matrix = confusion_matrix()
conf_matrix = np.around(conf_matrix, 2)
for row in conf_matrix:
    print(row)
overall_accuracy = 0
for i in range(10):
    overall_accuracy += conf_matrix[i][i] * class_[i][1]
print("overall_accuracy = " + str(overall_accuracy/5000))

[ 0.94  0.    0.    0.    0.01  0.    0.02  0.01  0.01  0.  ]
[ 0.    0.99  0.    0.    0.    0.    0.01  0.    0.    0.  ]
[ 0.01  0.14  0.73  0.03  0.02  0.    0.02  0.04  0.02  0.  ]
[ 0.    0.03  0.    0.9   0.    0.    0.01  0.06  0.    0.  ]
[ 0.    0.03  0.    0.    0.94  0.    0.02  0.    0.    0.01]
[ 0.02  0.02  0.01  0.22  0.03  0.54  0.03  0.05  0.03  0.03]
[ 0.01  0.05  0.    0.    0.04  0.01  0.87  0.01  0.    0.  ]
[ 0.    0.1   0.01  0.    0.01  0.    0.    0.87  0.    0.01]
[ 0.02  0.17  0.04  0.15  0.05  0.    0.02  0.03  0.52  0.01]
[ 0.01  0.02  0.    0.02  0.09  0.    0.    0.1   0.    0.76]
overall_accuracy = 0.815128
