*NOTE*: Functions were built with the assumption that data files follow the tweet_id, text, q1_label, q2_label, q3_label, q4_label, q5_label, q6_label, q7_label format

In [311]:
# imports
import numpy as np
import math

# Training

In [312]:
# Declare function to import data from TSV file

import csv

def importTSV(file_name):
    tsv_file = open(file_name, encoding="utf8")
    read_tsv = csv.reader(tsv_file, delimiter="\t")
        
    training_data = []
    
    for row in read_tsv:
        training_data.append(row)
        
    return training_data

In [313]:
# Declare function to convert data to lowercase

def convertToLowerCase(training_data):
    for row in training_data:
        row[1] = row[1].lower()
    
    return training_data

In [314]:
# Declare function to build ORIGINAL VOCABULARY

def buildOriginalVocabulary(training_data):
    vocab = set()
    training_data.pop(0)
    for row in training_data:
        for word in row[1].split():
            vocab.add(word)
        
    vocab = list(vocab)
    vocab.sort()
        
    return vocab
            

In [315]:
# Declare function to build FILTERED VOCABULARY (only words appearing twice)

def buildFilteredVocabulary(training_data):
    vocab = {}
    training_data.pop(0)
    for row in training_data:
        for word in row[1].split():
            if word not in vocab:
                vocab[word] = 1
            else:
                vocab[word] = vocab[word] + 1
    
    for word in list(vocab):
        if vocab[word] == 1:
            vocab.pop(word, None) 
    
    vocab = list(vocab)
    vocab.sort()
    
    return vocab
            

In [316]:
# import training data

training_data = importTSV("covid_training.tsv")
training_data = convertToLowerCase(training_data)
#print(training_data)

In [317]:
# get original vocab

og_vocab = buildOriginalVocabulary(training_data)
#print(og_vocab)

In [318]:
# get filtered vocab

filtered_vocab = buildFilteredVocabulary(training_data)
#print(filtered_vocab)

In [319]:
def TrainNaiveBayesAlgorithm(labels, vocab, training_data, smoothing_value):
    
    cond_probs = {}
    

    
    for label in labels: #class in classes, ie yes/no
        
        # init cond_probs
        cond_probs[label] = {}
        for word in vocab:
            cond_probs[label][word] = smoothing_value
            
        total_word_count_for_label = 0
        
        # get word count for each word in this label
        for data in training_data: # get prob of word given label
            if data[2] == label: # check data that match label
                total_word_count_for_label = total_word_count_for_label + 1        

                for word in data[1].split(): # iterate through words of document
                    if word in vocab: # only count words in vocabulary
                        cond_probs[label][word] = cond_probs[label][word] + 1
        
        # divide specific word count by total word count for this label
        for word in cond_probs[label]:
            cond_probs[label][word] = cond_probs[label][word]/total_word_count_for_label
        

    probs = {}
            
    for label in labels: # get prob of label
        count = 0
        for data in training_data:
            if data[2] == label:
                count = count +1
                
        probs[label] =  count/len(training_data)
    
    return cond_probs, probs

In [320]:
def TestNaiveBayesAlgorithm(labels, probs, cond_probs, vocab, data_text):
    scores = {}
    for label in labels:
        scores[label] = math.log10(probs[label])
        for word in data_text.split():
            if word in vocab:
                scores[label] = scores[label] + math.log10(cond_probs[label][word])
                
    return scores

In [321]:
cond_probs, probs = TrainNaiveBayesAlgorithm(['yes', 'no'], og_vocab, training_data, 0.01)
print(probs)

{'yes': 0.6206030150753769, 'no': 0.3793969849246231}


In [322]:
scores = TestNaiveBayesAlgorithm(['yes', 'no'], probs, cond_probs, og_vocab, training_data[4][1])

In [323]:
# get NaiveBayesAlgorithm training performance ORIGINAL VOCABULARY

def NaiveBayesAlgorithmTrainAndTestPerformance(training_data, smoothing_value, which_vocab):
    
    labels = ['yes', 'no']
    
    correct_count = 0
    incorrect_count = 0
    
    vocab = []
    
    if which_vocab == 'original':
        vocab = buildOriginalVocabulary(training_data)
    elif which_vocab == 'filtered':
        vocab = buildFilteredVocabulary(training_data)
    
    cond_probs, probs = TrainNaiveBayesAlgorithm(labels, vocab, training_data, smoothing_value)

    for data in training_data:
        scores = TestNaiveBayesAlgorithm(labels, probs, cond_probs, vocab, data[1])
        
        isYes = False
        if scores['yes']>scores['no']:
            isYes = True
            
        if isYes and data[2] == 'yes':
            correct_count = correct_count + 1
        elif not isYes and data[2] == 'no':
            correct_count = correct_count + 1
        else:
            incorrect_count = incorrect_count + 1
            

    return correct_count, incorrect_count, len(training_data)

In [324]:
correct_count, incorrect_count, total_count = NaiveBayesAlgorithmTrainAndTestPerformance(training_data, 0.01, 'original')
print("Training performance (original vocabulary): " + str(correct_count/total_count)) 
print("The above may be overfitting, we will see about the generalized results when running against the testing")

print('')

correct_count, incorrect_count, total_count = NaiveBayesAlgorithmTrainAndTestPerformance(training_data, 0.01, 'filtered')
print("Training performance (filtered vocabulary): " + str(correct_count/total_count))

Training performance (original vocabulary): 0.9974811083123426
The above may be overfitting, we will see about the generalized results when running against the testing

Training performance (filtered vocabulary): 0.898989898989899


# Testing

In [325]:
# train with training data and test on testing_data

def NaiveBayesAlgorithmTestPerformance(training_data, testing_data, smoothing_value, which_vocab):
    
    labels = ['yes', 'no']
    
    correct_count = 0
    incorrect_count = 0
    
    vocab = []
    
    if which_vocab == 'original':
        vocab = buildOriginalVocabulary(training_data)
    elif which_vocab == 'filtered':
        vocab = buildFilteredVocabulary(training_data)
    
    cond_probs, probs = TrainNaiveBayesAlgorithm(labels, vocab, training_data, smoothing_value)

    for data in testing_data:
        scores = TestNaiveBayesAlgorithm(labels, probs, cond_probs, vocab, data[1])
        
        isYes = False
        if scores['yes']>scores['no']:
            isYes = True
            
        if isYes and data[2] == 'yes':
            correct_count = correct_count + 1
        elif not isYes and data[2] == 'no':
            correct_count = correct_count + 1
        else:
            incorrect_count = incorrect_count + 1
            

    return correct_count, incorrect_count, len(testing_data)


In [326]:
testing_data = importTSV("covid_test_public.tsv")
testing_data = convertToLowerCase(testing_data)

# Print performance of original vocabulary on testing data
correct_count, incorrect_count, total_count = NaiveBayesAlgorithmTestPerformance(training_data, testing_data, 0.01, 'original')
print("Testing performance (original vocabulary): " + str(correct_count/total_count))
print("The above may be overfitting, we will see about the generalized results when running against the testing")

print('')

# Print performance of filtered vocabulary on testing data
correct_count, incorrect_count, total_count = NaiveBayesAlgorithmTestPerformance(training_data, testing_data, 0.01, 'filtered')
print("Testing performance (filtered vocabulary): " + str(correct_count/total_count))

Testing performance (original vocabulary): 0.6181818181818182
The above may be overfitting, we will see about the generalized results when running against the testing

Testing performance (filtered vocabulary): 0.6545454545454545


As we can see, the original vocabulary results in a performance of 61.82%, whereas the filtered vocabulary results in a performance of 65.45%. Therefore, we notice that the original vocabulary overfits the training data, and the filtered vocabulary is more generalizable.

# Output

## Trace Files

In [327]:
# train with training data and test on testing_data, and output trace to file

def NaiveBayesAlgorithmTestPerformanceWithTraceOutputToFile(training_data, testing_data, smoothing_value, which_vocab):
    
    labels = ['yes', 'no']
    
    correct_count = 0
    incorrect_count = 0
    
    file_name = ""
    
    vocab = []
    
    if which_vocab == 'original':
        vocab = buildOriginalVocabulary(training_data)
        file_name = "trace_NB-BOW-OV.txt"
    elif which_vocab == 'filtered':
        vocab = buildFilteredVocabulary(training_data)
        file_name = "trace_NB-BOW-FV.txt"
    else:
        assert(False)
    
    cond_probs, probs = TrainNaiveBayesAlgorithm(labels, vocab, training_data, smoothing_value)

    f = open(file_name, 'w')
    
    for data in testing_data:
        scores = TestNaiveBayesAlgorithm(labels, probs, cond_probs, vocab, data[1])
        
        yes_no_label = 'no'
        correct_status = 'wrong'
        
        isYes = False
        if scores['yes']>scores['no']:
            yes_no_label = 'yes'
            isYes = True
            
        if isYes and data[2] == 'yes':
            correct_count = correct_count + 1
            correct_status = 'correct'
        elif not isYes and data[2] == 'no':
            correct_count = correct_count + 1
            correct_status = 'correct'
        else:
            incorrect_count = incorrect_count + 1
            
        f.write(str(data[0]) + "  " + yes_no_label + "  " + str(scores[yes_no_label]) + "  " + data[2] + "  " + correct_status + "\n")


In [328]:
NaiveBayesAlgorithmTestPerformanceWithTraceOutputToFile(training_data, testing_data, 0.01, 'filtered')
NaiveBayesAlgorithmTestPerformanceWithTraceOutputToFile(training_data, testing_data, 0.01, 'original')


## Overall Evaluation Files

In [329]:
# train with training data and test on testing_data, and output trace to file

def NaiveBayesAlgorithmTestPerformanceWithEvaluationOutputToFile(training_data, testing_data, smoothing_value, which_vocab):
    
    labels = ['yes', 'no']
    
    file_name = ""
    
    vocab = []
    
    if which_vocab == 'original':
        vocab = buildOriginalVocabulary(training_data)
        file_name = "eval_NB-BOW-OV.txt"
    elif which_vocab == 'filtered':
        vocab = buildFilteredVocabulary(training_data)
        file_name = "eval_NB-BOW-FV.txt"
    else:
        assert(False)
    
    cond_probs, probs = TrainNaiveBayesAlgorithm(labels, vocab, training_data, smoothing_value)

    f = open(file_name, 'w')
          
    correct_count = 0
    incorrect_count = 0
    
    yes_true_positive_count = 0
    yes_false_positive_count = 0
    yes_false_negative_count = 0
    yes_true_negative_count = 0
    
    no_true_positive_count = 0
    no_false_positive_count = 0
    no_false_negative_count = 0
    no_true_negative_count = 0
    
    for data in testing_data:
        scores = TestNaiveBayesAlgorithm(labels, probs, cond_probs, vocab, data[1])
        
        isYes = False
        if scores['yes']>scores['no']:
            yes_no_label = 'yes'
            isYes = True
            
        if isYes and data[2] == 'yes':
            correct_count = correct_count + 1
            correct_status = 'correct'
            yes_true_positive_count = yes_true_positive_count + 1
            no_true_negative_count = no_true_negative_count + 1
        elif not isYes and data[2] == 'no':
            correct_count = correct_count + 1
            correct_status = 'correct'
            yes_true_negative_count = yes_true_negative_count + 1
            no_true_positive_count = no_true_positive_count + 1
        elif not isYes and data[2] == 'yes':
            incorrect_count = incorrect_count + 1
            yes_false_negative_count = yes_false_negative_count + 1
            no_false_positive_count = no_false_positive_count + 1
        elif isYes and data[2] == 'no':
            incorrect_count = incorrect_count + 1
            yes_false_positive_count = yes_false_positive_count + 1
            no_false_negative_count = no_false_negative_count + 1
            
    accuracy = correct_count/len(testing_data)
    yes_precision = yes_true_positive_count/(yes_true_positive_count+yes_false_positive_count)
    no_precision = no_true_positive_count/(no_true_positive_count+no_false_positive_count)
    yes_recall = yes_true_positive_count/(yes_true_positive_count+yes_false_negative_count)
    no_recall = no_true_positive_count/(no_true_positive_count+no_false_negative_count)
    yes_f1 = 2*yes_precision*yes_recall/(yes_precision + yes_recall)
    no_f1 = 2*no_precision*no_recall/(no_precision + no_recall)
            
    f.write(str(round(accuracy,4)) + "\n")
    f.write(str(round(yes_precision, 4)) + "  " + str(round(no_precision,4))  + "\n" )
    f.write(str(round(yes_recall,4)) + "  " + str(round(no_recall,4)) + "\n")
    f.write(str(round(yes_f1,4)) + "  " + str(round(no_f1,4)) + "\n")


In [330]:
NaiveBayesAlgorithmTestPerformanceWithEvaluationOutputToFile(training_data, testing_data, 0.01, 'filtered')
NaiveBayesAlgorithmTestPerformanceWithEvaluationOutputToFile(training_data, testing_data, 0.01, 'original')
