In [118]:
import math

#this number can be played with for new results if we have time
SMOOTHING_FACTOR = 5

class Dataset:
    def __init__(self, images_file, labels_file, lines_per_image = 28):
        self.images = []
        self.labels = []
        self.images_by_label = {}
        self.priors = {}
        with open(images_file) as file:
            EOF = False
            while not EOF:
                image = []
                for i in range(lines_per_image):
                    line = file.readline()
                    if not line:
                        EOF = True
                        break
                    image.append(list(line))
                if EOF:
                    break
                self.images.append(image)
        with open(labels_file) as file:
            for line in file:
                self.labels.append(line)
                
    def printImage(self, idx, lines_per_image = 28):
        image = self.images[idx]
        for i in range(lines_per_image):
            print(image[i])
            
class Classifier:
    def __init__(self, label, hMap, prior):
        self.label = label
        self.hMap = hMap
        self.prior = prior
    
    #where should I compare the smoothed heatmap to the input image??
    def evaluate_likelihood(self, image):
        total = math.log10(self.prior)
        for i in range(self.hMap.mapSize):
            for j in range(self.hMap.mapSize):
                partial_eval = self.hMap.hMap_smoothed[i][j] #self.single_probability(i, j, image[i][j])
                if(partial_eval <= 0):
                    print("bad eval, can't take log at spot: " + str(i) + ", " + str(j))
                    self.printImage(image)
                else:
                    y=0
                    #print("valid logarithm")
                total += math.log10(partial_eval)
        return total
    
    def printImage(self, image):
        for i in range(28):
            print(image[i])
    
    def single_probability(self, x, y, testPixel):
        if(testPixel == ' '):
            #testVal = 0
            #number of times a pixel is 0 from all the training examples
            instances = self.hMap.totalExamples - self.hMap.hMap_count[x][y]
            return float(instances / self.hMap.totalExamples)
        else:
            #testVal = 1
            instances = self.hMap.hMap_count[x][y]
            return float(instances / self.hMap.totalExamples)
            
        
        
        
class HeatMap:
    def __init__(self, label, mapSize = 28):
        #initially blank, smoothing will be added later
        self.hMap_count = [[0 for i in range(mapSize)] for j in range(mapSize)]
        self.hMap_smoothed = [[0 for i in range(mapSize)] for j in range(mapSize)]
        self.label = label
        self.mapSize = mapSize
        self.totalExamples = 0
                    
    def printHMap(self, smoothed = True):
        print("Heat map for digit: " + str(self.label) + " from " + str(self.totalExamples) + " examples")
        for i in range(self.mapSize):
            if smoothed:
                print(self.hMap_smoothed[i])
            else:
                print(self.hMap_count[i])

    def addToHMap(self, image):
        self.totalExamples += 1
        for i in range(self.mapSize):
            for j in range(self.mapSize):
                if(image[i][j] != ' '):
                    self.hMap_count[i][j] += 1
    
    def smoothHMap(self):
        #V = 2 since either 1 or 0 for picture
        denominator = SMOOTHING_FACTOR * 2 + self.totalExamples
        for i in range(self.mapSize):
            for j in range(self.mapSize):
                numerator = self.hMap_count[i][j] + SMOOTHING_FACTOR
                self.hMap_smoothed[i][j] = numerator / denominator
                
    def labelProbability(self, trainingSetSize = 5000):
        return self.totalExamples / trainingSetSize
    
    def goodHMap(self):
        for i in range(self.mapSize):
            for j in range(self.mapSize):
                if(self.hMap_smoothed[i][j] <= 0):
                    return False
        
        return True

            
        

In [119]:
dataset = Dataset("trainingimages","traininglabels")

In [120]:
#dataset.printImage(0)
print(len(dataset.images))
print(dataset.labels[0])

5000
5



In [121]:
digitHMaps = [None]*10
for i in range(0,10):
    digitHMaps[i] = HeatMap(i)

for i  in range(len(dataset.images)):
    currLabel = int(dataset.labels[i])
    digitHMaps[currLabel].addToHMap(dataset.images[i])
#digitHMaps[1].printHMap(False)    


In [122]:
for i in range(0,10):
    digitHMaps[i].smoothHMap()
    #print(str(100* digitHMaps[i].labelProbability(len(dataset.images))) + "% for digit: " + str(i))
    validHMap = digitHMaps[i].goodHMap()
    print(validHMap)

#digitHMaps[1].printHMap(True)


True
True
True
True
True
True
True
True
True
True


In [123]:
testData = Dataset("testimages","testlabels")

In [124]:
classifiers = [None]*10
for i in range(0,10):
    classifiers[i] = Classifier(i, digitHMaps[i], digitHMaps[i].labelProbability())

In [125]:
#testData.printImage(0)
print(testData.labels[0])

9



In [126]:
image = testData.images[0]
for i in range(10):
    #all these negative rn, idk what's wrong
    print(str(i) + ": chances are: " + str(classifiers[i].evaluate_likelihood(image)))

0: chances are: -942.8900095994303
1: chances are: -1255.5759225071324
2: chances are: -938.5303756286982
3: chances are: -970.5679101093189
4: chances are: -1017.324274772748
5: chances are: -953.4328245944351
6: chances are: -1028.0167737319296
7: chances are: -1055.2700929737769
8: chances are: -971.7944066138928
9: chances are: -1043.9853274725729
