In [692]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import math
from scipy.special import softmax

class NBClassifier:
    def __init__(self) -> None:
        self.vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']

        self.vectorizer = CountVectorizer(analyzer='char', vocabulary=self.vocab)
        
        self.training_data = None
        self.priors = {}
        self.model = self.get_training_data()
        self.summerize_cols()
        self.calculate_priors()

    def calculate_priors(self):
        predictions = []
        num_training_samples = 0
        for key, value in self.training_data.items():
            num_training_samples += len(value)

        for key, value in self.training_data.items():
            self.priors[key] = (len(value) * 1.0)/num_training_samples

    def get_training_data(self):
        training_data = {}

        files = os.listdir('./languageID')
        for file in files:
            file_name = file.split('.')[0]
            number = int(file_name[1:])

            if number >= 10:
                continue

            f = open(f"./languageID/{file}", 'r')
            if file[0] in training_data:
                training_data[file[0]].append(f.read())
            else:
                training_data[file[0]] = [f.read()]
            f.close()

        self.training_data = training_data

        training_vectorized = {}
        for key, value in training_data.items():
            X = self.vectorizer.fit_transform(value)
            training_vectorized[key] = {
                'data': X.A
            }

        return training_vectorized

    def count_total_words(self):
        total_words = {}
        for key, value in self.model.items():
            data = value['data']
            language_words = 0
            for col_idx in range(data.shape[1]):
                col = data[:, col_idx]
                language_words += col.sum()
            total_words[key] = language_words
        
        return total_words

    def summerize_cols(self):
        total_words = self.count_total_words()

        for key, value in self.model.items():
            stats = [] 
            data = value['data']
            for col_idx in range(data.shape[1]):
                col = data[:, col_idx]
                stats.append((col.sum()/total_words[key], np.sum(col > 0)))
            
            self.model[key]['stats'] = stats
        
    def predict(self, query):
        likelihoods = []
        query_vector = self.vectorizer.fit_transform([query]).A.tolist()[0]
        class_probs = []
        
        for key, pred_class in self.model.items():
            class_prob = math.log(self.priors[key]) 
            # class_prob = 1
            stats = pred_class['stats']
            for index, (train_char_prob, count) in enumerate(stats):
                # print(query_vector[index])
                total_counts = sum([stat[0] for stat in stats])
                try:
                    char_prob = query_vector[index] * math.log(train_char_prob)
                except:
                    alpha = 1/2
                    char_prob = math.log((count + alpha)/(total_counts + (3 * alpha)))      
                
                class_prob += char_prob
            
            class_probs.append(class_prob)
            likelihoods.append((key, class_prob))

        # sum_class_probs = sum(class_probs)
        # class_probs = softmax(class_probs)
        # for index, (key, prob) in enumerate(likelihoods):
        #     # likelihoods[index] = (key, class_probs[index])
        #     likelihoods[index] = (key, prob)
            
        return sorted(likelihoods, key=lambda x: x[1], reverse=True)

In [693]:
nb = NBClassifier()

In [694]:
nb.predict(open('./languageID/e14.txt').read())

[('e', -4687.356951143071),
 ('s', -5062.590670182352),
 ('j', -5177.933296877222)]

In [695]:
preds = nb.predict(open('./languageID/e14.txt').read())
total = 0
for lang, val in preds:
    total += val
print(total)

-14927.880918202645


In [696]:
max(nb.predict(open('./languageID/s12.txt').read()), key=lambda x: x[1])

('s', -5518.4039701552565)

In [697]:
preds = []
files = os.listdir('./languageID')
for file in files:
    file_name = file.split('.')[0]
    number = int(file_name[1:])

    if number < 10:
        continue

    f = open(f"./languageID/{file}", 'r')
    pred = max(nb.predict(f.read()), key=lambda x: x[1])[0]
    if pred == file_name[0]:
        preds.append(1)
    else:
        preds.append(0)

In [701]:
accuracy = sum(preds)/len(preds)
accuracy

1.0

In [699]:
print(sum(preds), len(preds))

30 30


In [700]:
accuracy

1.0