In [1]:
import sys
import math
import pickle
import numpy as np

k = 0.05    # Laplace smoothing constant
n = 3       # number of pixel in one column in a feature
m = 2       # number of pixel in one row in a feature

sys.setrecursionlimit(1500)

In [2]:
def inputDigit(name="digitdata/trainingimages"):
    with open(name) as digitTxt:
        image = [list(line)[0:28] for line in digitTxt]
        rows = len(image)
    return image

In [3]:
digitImage = inputDigit()

In [4]:
import collections

def readRough(name="digitdata/traininglabels"):
    with open(name) as label:
        labels = []
        for line in label:
            labels.append(line[0])
    return labels


In [5]:
label = readRough()
print ("len(label): " + str(len(label)))
class_ = sorted(collections.Counter(label).items())
#print (class_)
with open('labelstats.txt', 'wb') as fp:
    pickle.dump(class_, fp)

len(label): 5000


In [6]:
# load the number of each digit in the training set
with open ('labelstats.txt', 'rb') as fp:
    class_ = pickle.load(fp)

digitImage = inputDigit()

# training shape=(10 classes * (28-n+1)*(28-m+1) features * 2^nm feature values)
training=np.zeros(shape=(10, 28-n+1, 28-m+1, 2**(n*m)))

count = 0
for i in range(5000):                  # for each image
    for row in range(28-n+1):
        for col in range(28-m+1):      # for each feature
            
            feature_val = 0
            for y in range(n):
                for x in range(m):      # for each pixel in feature (overlapping)
                    # calculate feature_val
                    if digitImage[i*28+row+y][col+x] != ' ':
                        feature_val += 2**(m*y+x)
            training[int(label[i])][row][col][feature_val] += 1

for i in range(10):
    training[i] = (training[i] + k) / (class_[i][1] + k * 2**(m*n))

#print(training[0][0].shape)
#print(training[0][0])



In [7]:
# MAP classification
# posterior probability = log P(class) + log P(f1,1 | class) + log P(f1,2 | class) + ... + log P(f28,28 | class)

test_rough = inputDigit(name = "digitdata/testimages")
answer = np.zeros(1000)


for i in range(1000):                  # for each image
    test_image = np.zeros(shape=(28-n+1, 28-m+1))
    for row in range(28-n+1):
        for col in range(28-m+1):      # for each feature
            
            feature_val = 0
            for y in range(n):
                for x in range(m):      # for each pixel in feature (overlapping)
                    # calculate feature_val
                    if test_rough[i*28+row+y][col+x] != ' ':
                        feature_val += 2**(m*y+x)
                        
            test_image[row][col] = feature_val
       
    posteriori = np.zeros(10)
    for class_num in range(10):
        posteriori[class_num] = math.log(class_[class_num][1])
        for row in range(28-n+1):
            for col in range(28-m+1):
                posteriori[class_num] += math.log(training[class_num][row][col][int(test_image[row][col])])
    answer[i] = np.argmax(posteriori)
    #print(posteriori)
    
#print(answer)



In [8]:
testlabels = readRough("digitdata/testlabels")


In [9]:
def confusion_matrix():
    num_each_class = np.zeros(10)
    conf_matrix = np.zeros(shape=(10,10))
    for i in range(1000):
        conf_matrix[int(testlabels[i])][int(answer[i])] += 1
        num_each_class[int(testlabels[i])] += 1
            
    for i in range(10):
        for j in range(10):
            conf_matrix[i][j] /= num_each_class[i]
    
    return conf_matrix

In [13]:
conf_matrix = confusion_matrix()
conf_matrix = np.around(conf_matrix, 2)
for row in conf_matrix:
    print(row)
overall_accuracy = 0
for i in range(10):
    overall_accuracy += conf_matrix[i][i] * class_[i][1]
print("overall_accuracy = " + str(overall_accuracy/5000))

[ 0.98  0.    0.    0.    0.    0.    0.01  0.    0.01  0.  ]
[ 0.    0.98  0.    0.    0.01  0.    0.01  0.    0.    0.  ]
[ 0.    0.01  0.91  0.02  0.    0.    0.02  0.01  0.03  0.  ]
[ 0.    0.    0.    0.93  0.    0.01  0.    0.03  0.01  0.02]
[ 0.    0.    0.    0.    0.94  0.    0.02  0.01  0.    0.03]
[ 0.01  0.    0.    0.09  0.    0.85  0.    0.01  0.02  0.02]
[ 0.01  0.01  0.    0.    0.    0.05  0.91  0.    0.01  0.  ]
[ 0.    0.03  0.04  0.    0.02  0.    0.    0.8   0.02  0.09]
[ 0.01  0.    0.03  0.07  0.    0.02  0.    0.01  0.84  0.02]
[ 0.01  0.    0.    0.03  0.04  0.01  0.    0.01  0.02  0.88]
overall_accuracy = 0.903024
