In [48]:
import math

# Additive smoothing constant used by HeatMap.smoothHMap: it is added to every
# pixel count, and twice its value is added to the example total.
# Tunable -- other values can be tried for new results if time allows.
SMOOTHING_FACTOR = 0.5

class Dataset:
    """Images and labels read from a pair of plain-text files.

    images: list of images. Each image is a list of `lines_per_image` rows,
            and each row is the list of characters of one line of
            `images_file` (the trailing newline character, if any, is kept).
    labels: the raw lines of `labels_file`, one entry per line, newline kept.
    images_by_label, priors: created empty here; nothing in this class
            fills them.
    """
    def __init__(self, images_file, labels_file, lines_per_image = 28):
        self.images = []
        self.labels = []
        self.images_by_label = {}
        self.priors = {}
        with open(images_file) as image_source:
            while True:
                rows = []
                for _ in range(lines_per_image):
                    raw_line = image_source.readline()
                    if not raw_line:
                        # End of file in the middle of (or right before) an
                        # image: the partial image is discarded.
                        break
                    rows.append(list(raw_line))
                else:
                    # All rows were read, so keep the image and go on.
                    self.images.append(rows)
                    continue
                break
        with open(labels_file) as label_source:
            # Iterating a text file yields its lines one by one.
            self.labels.extend(label_source)
class Classifier:
    """Naive Bayes scorer for one digit class.

    Combines the class prior with the per-pixel probabilities stored in a
    HeatMap to produce a log10 score for an image.

    label: the digit this classifier scores.
    hMap:  HeatMap for that digit. smoothHMap() must already have been called
           on it, because the probabilities are read from hMap_smoothed.
    prior: probability of the label; must be > 0 since its log10 is taken.
    """
    def __init__(self, label, hMap, prior):
        self.label = label
        self.hMap = hMap
        self.prior = prior

    def evaluate_likelihood(self, image):
        """Return log10(prior) plus the sum of log10 P(pixel | label).

        `image` is indexed as image[i][j] for i, j in range(hMap.mapSize).
        Logs are summed instead of multiplying the raw probabilities.
        Raises ValueError if the prior or any pixel probability is not
        positive (for example when the heat map was never smoothed).
        """
        total = math.log10(self.prior)
        for i in range(self.hMap.mapSize):
            for j in range(self.hMap.mapSize):
                total += math.log10(self.single_probability(i, j, image[i][j]))
        return total

    def single_probability(self, x, y, testPixel):
        """Return the smoothed P(pixel at [x][y] has the observed state | label).

        A pixel is "off" when it is a space and "on" for any other character,
        the same rule HeatMap.addToHMap uses when counting.
        """
        # hMap_smoothed[x][y] is (count + k) / (total + 2k), the smoothed
        # probability that the pixel is on; its complement
        # (total - count + k) / (total + 2k) is the probability that it is off.
        # Using the smoothed values keeps both strictly positive, so log10
        # never receives 0 for a pixel state unseen in training.
        pixel_on = self.hMap.hMap_smoothed[x][y]
        if(testPixel == ' '):
            return 1 - pixel_on
        return pixel_on
            
        
        
        
class HeatMap:
    """Per-pixel statistics for the training images of one label.

    hMap_count[i][j]    -- number of added images whose pixel [i][j] is not a space.
    hMap_smoothed[i][j] -- smoothed fraction derived from hMap_count by
                           smoothHMap(); all zeros until that method is called.
    totalExamples       -- number of images added so far.
    """
    def __init__(self, label, mapSize = 28):
        # Both grids start at zero; each row is its own list.
        self.hMap_count = [[0] * mapSize for _ in range(mapSize)]
        self.hMap_smoothed = [[0] * mapSize for _ in range(mapSize)]
        self.label = label
        self.mapSize = mapSize
        self.totalExamples = 0

    def printHMap(self, smoothed = True):
        """Print a header line, then one grid row per line.

        Prints the smoothed grid when `smoothed` is truthy, the raw counts
        otherwise.
        """
        print("Heat map for digit: " + str(self.label) + " from " + str(self.totalExamples) + " examples")
        grid = self.hMap_smoothed if smoothed else self.hMap_count
        for row in range(self.mapSize):
            print(grid[row])

    def addToHMap(self, image):
        """Count one more example and bump the count of each non-space pixel."""
        self.totalExamples += 1
        size = self.mapSize
        for row in range(size):
            for col in range(size):
                if image[row][col] == ' ':
                    continue
                self.hMap_count[row][col] += 1

    def smoothHMap(self):
        """Fill hMap_smoothed with (count + k) / (total + 2k), k = SMOOTHING_FACTOR."""
        # The factor 2 is the number of values a pixel can take (on or off).
        total_with_smoothing = SMOOTHING_FACTOR * 2 + self.totalExamples
        for row in range(self.mapSize):
            counts = self.hMap_count[row]
            smoothed_row = self.hMap_smoothed[row]
            for col in range(self.mapSize):
                smoothed_row[col] = (counts[col] + SMOOTHING_FACTOR) / total_with_smoothing

    def labelProbability(self, trainingSetSize = 5000):
        """Return this label's share of a training set of `trainingSetSize` images."""
        share = self.totalExamples / trainingSetSize
        return share

            
        

In [41]:
# Parse the training images and their labels; both paths are relative, so the
# files are looked up in the kernel's working directory.
dataset = Dataset("trainingimages","traininglabels")

In [42]:
# Sanity checks on the parsed training data: one pixel character of the first
# image, the number of images read, and the first raw label line (which still
# carries its newline, hence the blank line in the output).
print(dataset.images[0][7][13])
print(len(dataset.images))
print(dataset.labels[0])

#
5000
5



In [44]:
# Start with one empty heat map per digit class (0-9).
digitHMaps = []
for i in range(0,10):
    digitHMaps.append(HeatMap(i))

# Feed every training image into the heat map of the digit it is labelled with.
for i, trainingImage in enumerate(dataset.images):
    currLabel = int(dataset.labels[i])
    digitHMaps[currLabel].addToHMap(trainingImage)

# Show the raw (unsmoothed) counts for digit 1.
digitHMaps[1].printHMap(False)


Heat map for digit: 1 from 563 examples
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 7, 6, 10, 6, 5, 5, 3, 2, 2, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 1, 0, 0, 1, 1, 2, 9, 22, 50, 98, 129, 140, 128, 139, 140, 119, 78, 36, 14, 2, 1, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 2, 3, 3, 4, 16, 39, 81, 162, 216, 250, 265, 263, 250, 208, 122, 61, 23, 5, 2, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 2, 3, 5, 5, 20, 42, 90, 176, 261, 297, 323, 309, 275, 217, 123, 58, 18, 4, 2, 0, 0, 0]
[0, 0, 0, 1, 1, 1, 3, 4, 5, 6, 16, 41, 97, 192, 289, 351, 369, 335, 278, 197, 98, 37, 11, 3, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 1, 3, 5, 6, 13, 42, 103, 204, 325, 408, 417, 333, 267, 156, 58, 20, 7, 0, 0, 0, 0, 0]
[0, 1, 0, 1, 0, 0, 1, 2, 4, 7, 12, 40, 102, 227, 375, 474, 431, 325, 222, 116, 32, 5, 1, 0, 0,

In [45]:
# Derive every digit's smoothed grid from its raw pixel counts.
for i in range(0,10):
    digitHMaps[i].smoothHMap()
    # Disabled debug output: each digit's share of the training set, in percent.
    #print(str(100* digitHMaps[i].labelProbability(len(dataset.images))) + "% for digit: " + str(i))

# Show the smoothed grid for digit 1.
digitHMaps[1].printHMap(True)


Heat map for digit: 1 from 563 examples
[0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354]
[0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226950354, 0.0008865248226

In [46]:
# Parse the test images and labels with the same loader used for the training data.
testData = Dataset("testimages","testlabels")

In [47]:
# Build one classifier per digit. Each prior is the digit's share of the
# training images that were actually loaded, instead of relying on
# labelProbability()'s hard-coded default training-set size of 5000.
classifiers = [None]*10
for i in range(0,10):
    classifiers[i] = Classifier(i, digitHMaps[i], digitHMaps[i].labelProbability(len(dataset.images)))