In [1]:
import csv
from collections import Counter

import numpy as np

In [2]:
def train_test_split(x, y, test_size=0.2, seed=None):
    """Randomly partition paired samples into a train and a test subset.

    Args:
        x: Sequence of samples.
        y: Sequence of targets; must have the same length as ``x``.
        test_size: Fraction of the samples placed in the test subset. The
            test count is ``int(test_size * len(x))``, i.e. truncated.
        seed: Optional seed passed to ``np.random.default_rng``. The default
            ``None`` draws fresh entropy, so the split differs on every call;
            pass an integer to make the split reproducible.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    n_test = int(test_size * len(x))
    # One shared permutation keeps every sample aligned with its target.
    perm = np.random.default_rng(seed).permutation(len(x))
    test = perm[:n_test]
    train = perm[n_test:]
    return x[train], x[test], y[train], y[test]

In [3]:
def cv_split(x, k: int = 5, seed=None):
    """Yield shuffled k-fold cross-validation index pairs.

    Only ``len(x)`` is used; the indices ``0 .. len(x) - 1`` are shuffled
    once and cut into ``k`` folds. When ``len(x)`` is not a multiple of
    ``k`` the first folds are one element larger than the rest.

    Args:
        x: Any sized collection of samples.
        k: Number of folds, between 1 and ``len(x)``.
        seed: Optional seed passed to ``np.random.default_rng``; ``None``
            gives a different shuffle on every call.

    Yields:
        ``(rest, split)`` pairs of index arrays, where ``split`` is the
        held-out fold and ``rest`` holds all remaining indices.

    Raises:
        ValueError: If ``k`` is not in ``1 .. len(x)``. As this is a
            generator, the error surfaces when iteration starts.
    """
    n = len(x)
    if not 1 <= k <= n:
        raise ValueError(f"k must be between 1 and len(x)={n}, got {k}")
    perm = np.random.default_rng(seed).permutation(n)
    # array_split tolerates uneven division, where np.split raises.
    for split in np.array_split(perm, k):
        rest = perm[~np.isin(perm, split, assume_unique=True)]
        yield rest, split

In [4]:
# Load the labelled training set: column 1 of each row is the class label
# and column 2 is the abstract text.
classes = []
train_abstracts = []
# newline="" leaves line-ending handling to the csv module, which is what it
# needs to parse quoted fields containing embedded newlines correctly.
with open("trg.csv", newline="") as fid:
    reader = csv.reader(fid, delimiter=",", quotechar='"')
    # Skip header row (the default avoids StopIteration on an empty file)
    next(reader, None)
    for row in reader:
        classes.append(row[1])
        train_abstracts.append(row[2])

# Encode each distinct label as its position in sorted order.
unique_classes = sorted(set(classes))
class_to_id = {c: i for i, c in enumerate(unique_classes)}
train_y = np.array([class_to_id[c] for c in classes])

# print(train_abstracts[0])
# print(train_y[:10])
# print(classes[:10])

In [109]:
class naive_bayes_classifier:
    """Multinomial naive Bayes text classifier with add-one smoothing.

    Texts are tokenised with ``str.split()`` (whitespace only, case
    sensitive). To avoid floating-point underflow on long texts, each score
    is kept as a ``(mantissa, exponent)`` pair meaning
    ``mantissa * 10 ** -exponent``.

    The model relies only on what ``fit`` was given; it does not read any
    notebook-level globals.
    """

    def __init__(self):
        """
        Number of training documents is stored as self.total_classes_number.
        Number of training documents per class is stored in self.class_number_dict.
        Number of distinct words (vocabulary size) is stored as self.unique_word_number.
        Occurrences of each word within a class are stored in self.class_word_number_dict.
        Total number of word occurrences in a class is stored in self.class_total_words_dict.
        """
        self.total_classes_number = 0
        self.class_number_dict = {}
        self.unique_word_number = 0
        self.class_word_number_dict = {}
        self.class_total_words_dict = {}

    def fit(self, classes, abstracts):
        """Collect the counts needed for prediction.

        Args:
            classes: Sequence of class labels, one per document.
            abstracts: Sequence of document strings, aligned with ``classes``.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(classes) != len(abstracts):
            raise ValueError(
                "classes and abstracts must have the same length, "
                f"got {len(classes)} and {len(abstracts)}"
            )

        docs_per_class = Counter()
        words_per_class = {}
        vocabulary = set()
        # Single pass over the corpus; no intermediate concatenated strings
        # or fixed-width NumPy string arrays are built.
        for label, text in zip(classes, abstracts):
            words = text.split()
            docs_per_class[label] += 1
            words_per_class.setdefault(label, Counter()).update(words)
            vocabulary.update(words)

        self.total_classes_number = len(classes)
        self.class_number_dict = dict(docs_per_class)
        self.unique_word_number = len(vocabulary)
        self.class_word_number_dict = {
            label: dict(counts) for label, counts in words_per_class.items()
        }
        self.class_total_words_dict = {
            label: sum(counts.values()) for label, counts in words_per_class.items()
        }

    def _scaled_posterior(self, c, words):
        """Return ``(mantissa, exponent)`` of prior(c) * likelihood(words | c)."""
        word_number_dict = self.class_word_number_dict[c]
        # Add-one smoothing denominator; constant for the whole class.
        denominator = self.class_total_words_dict[c] + self.unique_word_number

        count_10 = 0
        prob_x = 1
        for word in words:
            prob_word = (word_number_dict.get(word, 0) + 1) / denominator
            prob_x = prob_x * prob_word
            # Renormalise after every factor so the product never underflows;
            # count_10 records how many powers of ten were factored out.
            while prob_x < 0.1:
                prob_x = prob_x * 10
                count_10 += 1

        prob_c = self.class_number_dict[c] / self.total_classes_number
        prob = prob_c * prob_x
        while prob < 0.1:
            prob = prob * 10
            count_10 += 1
        return prob, count_10

    def predict(self, ids, abstracts):
        """Score every abstract against every fitted class.

        Args:
            ids: Sequence of identifiers, echoed back in the result.
            abstracts: Sequence of document strings, indexed like ``ids``.

        Returns:
            A flat list with TWO entries per input, in this order:
            ``(id, best_label)`` followed by ``(id, scores)``, where
            ``scores`` is a list of ``[label, mantissa, exponent]`` sorted
            from least to most probable (the best class is last).

            NOTE(review): the second entry looks like leftover debugging
            output. It is kept so existing callers see the same shape;
            confirm which of the two forms is actually wanted.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.class_number_dict:
            raise RuntimeError("fit() must be called before predict()")

        # Labels seen during fit, in sorted order.
        labels = sorted(self.class_number_dict)
        result = []
        for i, pred_id in enumerate(ids):
            words = abstracts[i].split()

            id_result_prob = []
            for c in labels:
                prob, count_10 = self._scaled_posterior(c, words)
                id_result_prob.append([c, prob, count_10])

            # A smaller exponent means a larger probability, so order by
            # descending exponent, then ascending mantissa: best class last.
            id_result_prob.sort(key=lambda entry: (-entry[2], entry[1]))
            result.append((pred_id, id_result_prob[-1][0]))
            result.append((pred_id, id_result_prob))

        return result

In [110]:
# Train the classifier on the full labelled set loaded from trg.csv.
nbc = naive_bayes_classifier()
nbc.fit(classes, train_abstracts)

In [111]:
# Predict on the same abstracts the model was fitted on. The true labels are
# passed in the `ids` slot, so every returned entry pairs a true label with
# the prediction output for that abstract.
rs = nbc.predict(classes, train_abstracts)

In [112]:
# Count the abstracts whose predicted class differs from the true label and
# print the per-class scores of each miss for inspection.
count = 0
for r in rs:
    true_label, scores = r
    # predict() emits both an (id, label) entry and an (id, score list) entry
    # per abstract. Only the score-list entries are examined, so each abstract
    # is counted once and a label string is never indexed by accident.
    if not isinstance(scores, list):
        continue
    # The score list is sorted with the most probable class last.
    if true_label != scores[-1][0]:
        print(r)
        count += 1

('A', [['V', 0.13881072133247085, 753], ['A', 0.2312802948042389, 730], ['E', 0.3941260292610694, 704], ['B', 0.40561489083220037, 704]])
('V', [['A', 0.18056729765234808, 639], ['B', 0.34890264275256283, 610], ['V', 0.2583806933607637, 600], ['E', 0.9679909219241977, 597]])
('A', [['V', 0.46413874792397547, 399], ['A', 0.773765366110749, 394], ['B', 0.3594326829596004, 375], ['E', 0.49331806810744705, 373]])
('V', [['A', 0.2466941061986831, 393], ['E', 0.852733187154619, 381], ['V', 0.10152145623849063, 374], ['B', 0.8984900289240134, 360]])
('V', [['A', 0.14195617874316757, 955], ['V', 0.6998124065726989, 911], ['E', 0.15532650967492911, 903], ['B', 0.14509423622243817, 899]])
('B', [['A', 0.6939617885766023, 881], ['V', 0.26946594646504207, 853], ['B', 0.3367437899018666, 798], ['E', 0.7052306398576279, 798]])
('B', [['A', 0.5355341192874888, 535], ['V', 0.12078301767469672, 524], ['B', 0.7613108507282779, 486], ['E', 0.20580117970948675, 479]])
('B', [['A', 0.8381218561642589, 133]

In [113]:
# Number of entries the comparison loop above flagged as mismatches.
print(count)

89
