# Assignment 1, COMP 472
## Due October 19th, 2020
## Use of Python 3.8
## Marc Vicuna, 40079109, El Hassan Ait Ouaziz, 26791573

In [118]:
import pandas as pd
import numpy as np

In [119]:
training_file = 'data/covid_training.tsv'
data = pd.read_csv(training_file, sep='\t', usecols=[1,2])
documents = data['text'].values
labels = data['q1_label'].values

In [120]:
def add_document(doc, lab, ov):
    doc = doc.casefold()
    for word in doc.split(' '):
        if word in ov:
            ov[word] += lab
        else:
            ov[word] = lab

In [121]:
ov = {}
priors = np.zeros(2)
for doc, lab in zip(documents, labels):
    if lab == 'yes':
        priors[0] += 1
        add_document(doc, np.array([1,0], dtype = 'i4'), ov)
    else:
        priors[1] += 1
        add_document(doc, np.array([0,1], dtype = 'i4'), ov)
priors = priors/np.sum(priors)

In [122]:
fv = {}
ov_sum = np.zeros(2, dtype = 'i4')
fv_sum = np.zeros(2, dtype = 'i4')
for word, freq in ov.items():
    ov_sum += freq
    if np.sum(freq) > 1:
        fv[word] = freq
        fv_sum += freq

In [124]:
# smoothing
d = 0.01
# vocabulary size
V = len(ov)
# conditional probability on ov
for word, freq in ov.items():
    ov[word] = np.log10(np.array([(freq[0]+d)/(ov_sum[0]+d*V),(freq[1]+d)/(ov_sum[1]+d*V)]))
# conditional probability on fv
for word, freq in fv.items():
    fv[word] = np.log10(np.array([(freq[0]+d)/(fv_sum[0]+d*V),(freq[1]+d)/(fv_sum[1]+d*V)]))

In [125]:
# process test data
test_file = 'data/covid_test_public.tsv'
data = pd.read_csv(test_file, sep='\t',header = None, usecols=[0,1,2])
test_IDs = np.array(data[0].values, dtype = 'i4')
test_documents = data[1].values
test_labels = data[2].values

In [130]:
def evaluate_document(doc, v, priors):
    prob = np.log10(priors.copy())
    doc = doc.casefold()
    for word in doc.split(' '):
        if word in v:
            prob += v[word]
    if np.argmax(prob) == 0:
        return 'yes', prob[0]
    else:
        return 'no', prob[1]

In [131]:
evaluate_document(test_documents[0], ov, priors)

('yes', -101.6057968039355)

In [147]:
# Generate Trace files
def trace(filename, IDs, documents, labels, v, priors):
    f = open(filename, 'w')
    for ID, doc, true_lab in zip(IDs, documents, labels):
        estim_lab, prob = evaluate_document(doc, v, priors)
        if estim_lab == true_lab:
            match = 'correct'
        else:
            match = 'wrong'
        f.write('{}  {}  {:.2E}  {}  {}\n'.format(ID, estim_lab, 10**prob, true_lab, match))

In [148]:
trace('trace/trace_NB-BOW-OV.txt', test_IDs, test_documents, test_labels, ov, priors)
trace('trace/trace_NB-BOW-FV.txt', test_IDs, test_documents, test_labels, fv, priors)

In [171]:
# Generate Evaluation files
def evalation(input_file, output_file):
    data = pd.read_csv(input_file, sep='  ', header = None, usecols=[1,3,4], engine='python')
    estim_l = data[1].values
    true_l = data[3].values
    match = data[4].values
    accuracy, precision, recall = 0.0, np.array([0,0], dtype = 'f4'), np.array([0,0], dtype = 'f4')
    for est, tru, mat in zip(estim_l, true_l, match):
        if mat == 'correct':
            accuracy += 1
            if est == 'yes':
                precision[0] += 1
            else:
                precision[1] += 1
            if tru == 'yes':
                recall[0] += 1
            else:
                recall[1] += 1
    print(precision)
    print(recall)
    accuracy /= len(match)
    
    precision /= len(match)
    recall /= len(match)
    F1 = [2/(precision[0]**-1+recall[0]**-1),2/(precision[1]**-1+recall[1]**-1)]
    f = open(output_file, 'w')
    f.write('{:.4}\n'.format(accuracy))
    for metric in [precision, recall, F1]:
        f.write('{:.4}  {:.4}\n'.format(metric[0], metric[1]))
            

In [172]:
evalation('trace/trace_NB-BOW-OV.txt', 'evaluation/eval_NB-BOW-OV.txt')
evalation('trace/trace_NB-BOW-FV.txt', 'evaluation/eval_NB-BOW-FV.txt')

[31.  4.]
[31.  4.]
[32.  3.]
[32.  3.]
