In [1]:
import math
import pandas as pd
import numpy as np

#this number can be played with for new results if we have time
SMOOTHING_FACTOR = 0.1
NUM_ROWS = 25
NUM_COLS = 10

class Dataset:
    def __init__(self, audio_file, word):
        self.images = []
        self.size = 0
        self.words = []
        with open(audio_file) as file:
            EOF = False
            while not EOF:
                image = []
                for i in range(NUM_ROWS):
                    line = file.readline()
                    if not line:
                        EOF = True
                        break
                    image.append(list(line))
                if EOF:
                    break
                else:
                    for i in range(3):
                        line = file.readline()
                    self.size += 1
                    self.words.append(word)
                    self.images.append(image)
    def display(self, i):
        print("".join(map(lambda x: "".join(x),self.images[i])))
        
class Classifier:
    def __init__(self, word, countMap, prior):
        self.word = word
        self.prior = prior
        self.countMap = countMap
        
    def evaluate_likelihood(self, image):
        total = math.log10(self.prior)
        for i in range(NUM_ROWS):
            for j in range(NUM_COLS):
                partial_eval = self.single_probability(i, j, image[i][j])
                total += math.log10(partial_eval)
        return total
    
    def printImage(self, image):
        print("".join(map(lambda x: "".join(x),image)))
    
    def single_probability(self, x, y, testPixel):
        if(testPixel == '%'):
            #testVal = 0
            #number of times a pixel is 0 from all the training examples
            instances = self.countMap.totalExamples - self.countMap.count[x][y]
            return float(instances / self.countMap.totalExamples)
        elif testPixel == ' ':
            #testVal = 1
            instances = self.countMap.count[x][y]
            return float(instances / self.countMap.totalExamples)
        
class countMap:
    def __init__(self, word):
        self.word = word
        self.count = [[SMOOTHING_FACTOR for i in range(NUM_COLS)] for j in range(NUM_ROWS)]
        self.totalExamples = 2*SMOOTHING_FACTOR
    
    def add_counts(self, image):
        self.totalExamples += 1
        for i in range(NUM_ROWS):
            for j in range(NUM_COLS):
                if(image[i][j] == ' '):
                    self.count[i][j] += 1
                    
    def labelProbability(self, trainingSetSize):
        return (self.totalExamples-2*SMOOTHING_FACTOR) / trainingSetSize

In [2]:
yesData = Dataset("yes_train.txt", "yes")
noData = Dataset("no_train.txt", "no")

In [3]:
yesMap = countMap("yes")
noMap = countMap("no")

for i in range(len(yesData.images)):
    yesMap.add_counts(yesData.images[i])
    
for i in range(len(noData.images)):
    noMap.add_counts(noData.images[i])

In [4]:
yesTest = Dataset("yes_test.txt", "yes")
noTest = Dataset("no_test.txt", "no")

testData = Dataset("yes_test.txt", "yes")
testData.images += noTest.images
testData.size += noTest.size
testData.words += noTest.words

In [5]:
classifiers = [None]*2
classifiers[0] = Classifier("yes", yesMap, yesMap.labelProbability(yesData.size))
classifiers[1] = Classifier("no", noMap, noMap.labelProbability(noData.size))

In [6]:
predictions = []
#save indices in test data for max and min values from eval function for each word
#initialized with the first time that word shows up in the test file
maxes = [0, yesTest.size]
mins = [0, yesTest.size]
idx = 0
wordIdx = 0
for image in testData.images:
    chances = [0]*2
    for i in range(2):
        chances[i] = classifiers[i].evaluate_likelihood(image)
    label = chances.index(max(chances))
    predictions.append(label)
    
    word = testData.words[idx]
    if word == "yes":
        actualLabel = 0
    else:
        actualLabel = 1
    currEval = chances[actualLabel]
    prevMax = classifiers[actualLabel].evaluate_likelihood(testData.images[maxes[actualLabel]])
    prevMin = classifiers[actualLabel].evaluate_likelihood(testData.images[mins[actualLabel]])
    if(currEval >= prevMax):
        maxes[actualLabel] = idx
    if(currEval <= prevMin):
        mins[actualLabel] = idx
    
    idx += 1

In [7]:
idx = 0
currWord = "yes"
for MAX in maxes:
    print("max posterior for digit: " + currWord)
    testData.display(MAX)
    idx+=1
    currWord = "no"

max posterior for digit: yes
% %   % %%
        %%
        %%
       %%%
      %%%%
%       %%
%      %%%
%%%% %%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%% %%%%%
%%% %%%%%%
%%% %%%%%%
%% %%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%  %%%%%%

max posterior for digit: no
% %   % %%
        %%
        %%
        %%
      % %%
%%      % 
%%        
%%%    %%%
%%%%%%%%%%
%%%%%% %%%
%%%%%% %%%
%%%%%% %%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%



In [8]:
idx = 0
currWord = "yes"
for MIN in mins:
    print("min posterior for digit: " + currWord)
    testData.display(MIN)
    idx+=1
    currWord = "no"

min posterior for digit: yes
%%% %%    
 %%       
%%%       
%%%       
%%%       
%%%     % 
%%%%%     
%%%%%%% %%
%%%%%%%  %
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%% %%  %
%%%% %    
%%%%      
%%%%  %  %
%%%%    %%
%%%%%  %%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%  %%%
%%%%  % %%

min posterior for digit: no
% %%      
%%%%      
%%%%      
%%%%      
%%%%     %
%%%%      
%%%%      
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%% %
%%%%%% % %
%%%%%    %
%%%%%   %%
%%%%%% %%%
%%%%  %%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%%%%%%%
%%%%% %%%%



In [9]:
wrongs = [0]*2
raw_counts = [[0 for i in range(2)] for j in range(2)]
for i in range(len(predictions)):
    if testData.words[i] == "yes":
        label = 0
    else:
        label = 1
    if predictions[i] != label:
        wrongs[predictions[i]] +=1
    raw_counts[label][predictions[i]] +=1

In [10]:
print("yes success rate: " + str(100-(100*(wrongs[0]/(yesMap.totalExamples-2*SMOOTHING_FACTOR)))))
print("no success rate: " + str(100-(100*(wrongs[1]/(yesMap.totalExamples-2*SMOOTHING_FACTOR)))))

print("\nconfusion matrix\n")

yes success rate: 97.85714285714286
no success rate: 99.28571428571429

confusion matrix



In [11]:
counts_df = pd.DataFrame(np.array(raw_counts))

In [12]:
confusion_matrix = counts_df.apply(lambda x: x/np.sum(x), axis=1)
print(confusion_matrix.round(2))

      0     1
0  0.98  0.02
1  0.06  0.94
