In [70]:
import math

#this number can be played with for new results if we have time
SMOOTHING_FACTOR = 0.9

class Dataset:
    def __init__(self, images_file, labels_file, lines_per_image = 28):
        self.images = []
        self.labels = []
        self.images_by_label = {}
        self.priors = {}
        with open(images_file) as file:
            EOF = False
            while not EOF:
                image = []
                for i in range(lines_per_image):
                    line = file.readline()
                    if not line:
                        EOF = True
                        break
                    image.append(list(line))
                if EOF:
                    break
                self.images.append(image)
        with open(labels_file) as file:
            for line in file:
                self.labels.append(line)
    def display(self, i):
        print("".join(map(lambda x: "".join(x),self.images[i])))
        
class Classifier:
    def __init__(self, label, hMap, prior):
        self.label = label
        self.hMap = hMap
        self.prior = prior
    
    def evaluate_likelihood(self, image):
        total = math.log10(self.prior)
        for i in range(self.hMap.mapSize):
            for j in range(self.hMap.mapSize):
                partial_eval = self.single_probability(i, j, image[i][j])
                total += math.log10(partial_eval)
        return total
    
    def printImage(self, image):
        print("".join(map(lambda x: "".join(x),image)))
    
    def single_probability(self, x, y, testPixel):
        if(testPixel == ' '):
            #testVal = 0
            #number of times a pixel is 0 from all the training examples
            instances = self.hMap.totalExamples - self.hMap.hMap_count[x][y]
            return float(instances / self.hMap.totalExamples)
        elif testPixel != "\n":
            #testVal = 1
            instances = self.hMap.hMap_count[x][y]
            return float(instances / self.hMap.totalExamples)
            
            
class HeatMap:
    def __init__(self, label, mapSize = 28):
        self.hMap_count = [[SMOOTHING_FACTOR for i in range(mapSize)] for j in range(mapSize)]
        self.label = label
        self.mapSize = mapSize
        self.totalExamples = 2*SMOOTHING_FACTOR
                    
    def printHMap(self, smoothed = True):
        print("Heat map for digit: " + str(self.label) + " from " + str(self.totalExamples) + " examples")
        for i in range(self.mapSize):
            if smoothed:
                toPrint = ""
                for j in range(self.mapSize):
                    if self.hMap_count[i][j] < 1:
                        toPrint += " "
                    elif self.hMap_count[i][j] < 80:
                        toPrint += "*"
                    else:
                        toPrint += "X"
                print(toPrint)
                
            else:
                toPrint = ""
                for j in range(self.mapSize):
                    if self.hMap_count[i][j] < 3:
                        toPrint += " "
                    elif self.hMap_count[i][j] < 80:
                        toPrint += "*"
                    else:
                        toPrint += "X"
                print(toPrint)

    def addToHMap(self, image):
        self.totalExamples += 1
        for i in range(self.mapSize):
            for j in range(self.mapSize):
                if(image[i][j] != ' '):
                    self.hMap_count[i][j] += 1
                
    def labelProbability(self, trainingSetSize = 5000):
        return (self.totalExamples-2*SMOOTHING_FACTOR) / trainingSetSize
    
    def goodHMap(self):
        for i in range(self.mapSize):
            for j in range(self.mapSize):
                if(self.hMap_smoothed[i][j] <= 0):
                    return False
        
        return True

            
        

In [71]:
dataset = Dataset("trainingimages","traininglabels")

In [72]:
digitHMaps = [None]*10
for i in range(0,10):
    digitHMaps[i] = HeatMap(i)

for i  in range(len(dataset.images)):
    currLabel = int(dataset.labels[i])
    digitHMaps[currLabel].addToHMap(dataset.images[i])  
    

In [73]:
testData = Dataset("testimages","testlabels")

In [74]:
classifiers = [None]*10
for i in range(0,10):
    classifiers[i] = Classifier(i, digitHMaps[i], digitHMaps[i].labelProbability())
    

In [75]:
predictions = []
for image in testData.images:
    chances = [0]*10
    for i in range(10):
        chances[i] = classifiers[i].evaluate_likelihood(image)
    label = chances.index(max(chances))
    predictions.append(label)


In [76]:
wrongs = [0]*10
confusion_matrix = [[0 for i in range(10)] for j in range(10)]
for i in range(len(predictions)):
    if predictions[i] != int(testData.labels[i]):
        wrongs[predictions[i]] +=1
        confusion_matrix[int(testData.labels[i])][predictions[i]] +=1


In [77]:
for i in range(10):
    print(str(i) + " label success rate: " + str(100-(100*(wrongs[i]/(digitHMaps[i].totalExamples-2*SMOOTHING_FACTOR)))) + "%")

print("\nconfusion matrix\n")
for i in range(10):
    print(str(confusion_matrix[i]))


0 label success rate: 98.53862212943632%
1 label success rate: 96.0923623445826%
2 label success rate: 97.1311475409836%
3 label success rate: 93.30628803245436%
4 label success rate: 95.70093457943925%
5 label success rate: 94.70046082949308%
6 label success rate: 96.8063872255489%
7 label success rate: 97.81818181818181%
8 label success rate: 95.45454545454545%
9 label success rate: 88.28282828282828%

confusion matrix

[0, 0, 1, 0, 1, 5, 3, 0, 4, 0]
[0, 0, 1, 0, 0, 2, 1, 0, 0, 0]
[1, 3, 0, 4, 1, 0, 6, 1, 5, 2]
[0, 2, 0, 0, 0, 3, 2, 6, 2, 6]
[0, 1, 0, 0, 0, 0, 3, 1, 2, 18]
[2, 2, 1, 12, 3, 0, 1, 1, 2, 6]
[1, 6, 4, 0, 4, 5, 0, 0, 2, 0]
[0, 6, 3, 0, 3, 0, 0, 0, 3, 14]
[2, 1, 3, 14, 2, 6, 0, 1, 0, 12]
[1, 1, 1, 3, 9, 2, 0, 2, 1, 0]
