In [1]:
from utils import process_utt, lookup
from nltk.corpus.reader import PlaintextCorpusReader
import numpy as np

In [2]:
# Read every .txt file in the current directory as a plain-text corpus.
# The file pattern is a regular expression, so it is written as a raw string:
# "\." in a normal string literal is an invalid escape sequence and triggers a
# warning on recent Python versions, while the raw string has the same value.
my_corpus = PlaintextCorpusReader("./", r".*\.txt")

# Sentences of hamlet.txt as produced by the corpus reader.
sents = my_corpus.sents(fileids="hamlet.txt")

In [3]:
def count_utts(result, utts, ys):
    """
    Accumulate (word, label) frequencies into `result`.

    Input:
        result: a dictionary mapping (word, label) pairs to counts; it is
            updated in place, so existing counts are extended
        utts: a list of utts
        ys: a list with the label of each utt (either 0 or 1), aligned with utts
    Output:
        result: the same dictionary object, after counting
    """
    for label, utterance in zip(ys, utts):
        for token in process_utt(utterance):
            key = (token, label)
            # a missing key starts from 0, so first sightings end up at 1
            result[key] = result.get(key, 0) + 1

    return result

In [4]:
# One space-joined string per sentence.
utts = [" ".join(tokens) for tokens in sents]
# Label is True when the sentence contains the token "be".
ys = ["be" in tokens for tokens in sents]

# Count into a fresh dictionary; the call is the last expression so the
# notebook displays the resulting table.
result = {}
count_utts(result, utts, ys)

{('the', False): 1283,
 ('project', False): 79,
 ('gutenberg', False): 83,
 ('ebook', False): 16,
 ('of', False): 786,
 ('hamlet', False): 120,
 ('by', False): 130,
 ('william', False): 3,
 ('shakespear', False): 30,
 ('thi', False): 335,
 ('is', False): 345,
 ('for', False): 200,
 ('use', False): 59,
 ('anyon', False): 3,
 ('anywher', False): 2,
 ('at', False): 90,
 ('no', False): 103,
 ('cost', False): 6,
 ('and', False): 673,
 ('with', False): 252,
 ('almost', False): 10,
 ('restrict', False): 2,
 ('whatsoev', False): 2,
 ('you', False): 465,
 ('may', False): 54,
 ('copi', False): 23,
 ('it', False): 337,
 ('give', False): 54,
 ('away', False): 24,
 ('or', False): 192,
 ('re', False): 5,
 ('under', False): 13,
 ('term', False): 28,
 ('licens', False): 15,
 ('includ', False): 13,
 ('onlin', False): 6,
 ('www', False): 7,
 ('org', False): 11,
 ('titl', False): 3,
 ('author', False): 12,
 ('editor', False): 1,
 ('charl', False): 7,
 ('kean', False): 6,
 ('releas', False): 2,
 ('date', 

In [12]:
# Build the (word, label) -> count table from the utterances and labels.
freqs = count_utts({}, utts, ys)

# Print every entry whose word is "be".
for pair, count in freqs.items():
    if pair[0] == "be":
        print(f"{pair}:{count}")

# Keep the lookup as the final expression of the cell so its value is shown;
# with default notebook settings a bare expression in the middle of a cell is
# evaluated and then discarded.
lookup(freqs, "be", True)

('be', True):207
('be', False):33


207

In [6]:
def train_naive_bayes(freqs, train_x, train_y):
    """
    Compute the Naive Bayes log prior and per-word log likelihoods.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of utts (not read by this function)
        train_y: a list of labels corresponding to the utts (0, 1)
    Output:
        logprior: log(D_pos) - log(D_neg), where D_pos is the sum of train_y
            and D_neg is len(train_y) - D_pos
        loglikelihood: dictionary mapping every word in the vocabulary to
            log(P(word | pos) / P(word | neg)), both with add-one smoothing
    """
    # V: number of distinct words among the keys of freqs
    vocab = {key[0] for key in freqs}
    V = len(vocab)

    # total word counts per class, indexed by "is the label positive?"
    totals = {True: 0, False: 0}
    for key in freqs:
        is_positive = bool(key[1] > 0)
        totals[is_positive] += lookup(freqs, key[0], is_positive)
    N_pos = totals[True]
    N_neg = totals[False]

    # document counts per class
    n_docs = len(train_y)
    n_pos_docs = sum(train_y)
    n_neg_docs = n_docs - n_pos_docs

    logprior = np.log(n_pos_docs) - np.log(n_neg_docs)

    # smoothed probability ratio for every vocabulary word
    pos_denominator = N_pos + V
    neg_denominator = N_neg + V
    loglikelihood = {
        word: np.log(
            ((lookup(freqs, word, 1) + 1) / pos_denominator)
            / ((lookup(freqs, word, 0) + 1) / neg_denominator)
        )
        for word in vocab
    }

    return logprior, loglikelihood

In [7]:
logprior, loglikelihood = train_naive_bayes(freqs, utts, ys)
print(logprior)
print(len(loglikelihood))

-2.684239552408491
4574


In [8]:
def naive_bayes_predict(utt, logprior, loglikelihood):
    """
    Score an utt with the trained Naive Bayes parameters.

    Input:
        utt: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the loglikelihood of every processed word of the utt
           that is present in the dictionary (words not in it contribute nothing)
    """
    tokens = process_utt(utt)

    # start from the prior, then add known words one by one in utt order
    score = 0 + logprior
    for token in tokens:
        if token not in loglikelihood:
            continue
        score = score + loglikelihood[token]

    return score

In [9]:
my_utt = "To be or not to be, that is the question."
p = naive_bayes_predict(my_utt, logprior, loglikelihood)
print("The expected output is", p)

The expected output is 2.9321000465209246


In [10]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of utts
        test_y: the corresponding labels for the list of utts
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of utts classified correctly)/(total # of utts)
    """
    accuracy = 0  # return this properly

    y_hats = []
    for utt in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(utt, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = sum([abs(y_hat - test) for y_hat, test in zip(y_hats, test_y)]) / len(
        y_hats
    )

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy

In [11]:
print(
    "Naive Bayes accuracy = %0.4f"
    % (test_naive_bayes(utts, ys, logprior, loglikelihood))
)

Naive Bayes accuracy = 0.9801
