In [1]:
#===============================
# From pure python to notebook for easier read!
#===============================
import numpy as np
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize
import operator

Naive Bayes says a class $c$ is given as

\begin{equation}
c = P(c) + \sum log( P(w_i | c)
\end{equation}
Where $P(c)$ is prior for the class and $P(w_i | c)$ is probability for the word, when we know the class

In [2]:
# ================
# Assignment 6.1
# What  class  will  Naive  bayes  assign  to  the  sentence  “I  always  like  foreign films.”

# Probabilities:
# word     pos    neg
# -------------------
# I        0.09  0.16
# always   0.07  0.06
# like     0.29  0.06
# foreign  0.04  0.15
# films    0.08  0.11
# ================

p_pos  = np.log2(0.09) + np.log2(0.07) + np.log2(0.29) + np.log2(0.04) + np.log2(0.08)
p_neg = np.log2(0.16) + np.log2(0.06) +  np.log2(0.06) + np.log2(0.15) + np.log2(0.11)

# Result!
print("Pos: ", p_pos , " Neg: ", p_neg)

Pos:  -17.384020030246134  Neg:  -16.683033733185496


In [3]:
# ================
# Assignment 6.2
# Given the following short movie reviews,  each labeled with a genre,  either comedy or action:
# What is the most likely class for document [fast, couple, shoot, fly]
# ================

# Set the priors!
prior_comedy = 2 / 5
prior_action = 3 / 5

# Documents (notice multiple occurances)
action_sentances = ["fast", "furious", "shoot", "furious", "shoot", "shoot", "fun", "fly", "fast", "shoot", "love"]
comdey_sentances = ["fun", "couple", "love", "love", "couple", "fly", "fast", "fun", "fun"]
document = ["fast", "couple", "shoot", "fly"]

# Calculate how many times word occurs
action_unigram = Counter(action_sentances)
comedy_unigram = Counter(comdey_sentances)
total_unigram = Counter(action_sentances + comdey_sentances)

# Initialise probabilities with priors
action_posterior = np.log(prior_action)
comedy_posterior = np.log(prior_comedy)
action_count = sum(action_unigram.values())
comedy_count = sum(comedy_unigram.values())

# Loop over the query document!
for word in document:
    action_posterior += np.log((action_unigram[word] + 1) / (action_count + len(total_unigram)))
    comedy_posterior += np.log((comedy_unigram[word] + 1) / (comedy_count + len(total_unigram)))

# Result!
print("Action: ", action_posterior, " Comedy: ", comedy_posterior)

Action:  -8.671115273688494  Comedy:  -9.52173897104528


In [4]:
# ================
# Assignment 6.3
# Multinomial and binary bayes --- Do they agree!?
# ================

sentance = "A good good plot and great characters but poor acting".split()
classes = ["pos", "neg"]
documents = [
    ("pos",["good", "good", "good", "great", "great", "great"]),
    ("pos",["poor", "great", "great"]),
    ("neg",["good", "poor", "poor", "poor"]),
    ("neg",["good", "poor", "poor", "poor", "poor", "poor", "great", "great"]),
    ("neg",["poor", "poor"])
]

# Calculate counts in the documents
counters = {}
counters_bin = {}
counters["pos"] = Counter()
counters["neg"] = Counter()
counters_bin["pos"] = Counter()
counters_bin["neg"] = Counter()
for C in classes:
    for p, document in documents:
        if C == p:
            counters[C] += Counter(document)
            counters_bin[C] += Counter(set(document))

counters["tot"] = counters["neg"] + counters["pos"]
counters_bin["tot"] = counters_bin["neg"] + counters_bin["pos"]

# Start calculating the posteriors
pos_posterior = np.log2(2/5)
pos_bin_posterior = np.log2(2/5)
neg_posterior = np.log2(3/5)
neg_bin_posterior = np.log2(3/5)

# Needed for word probabilities
pos_count = sum(counters["pos"].values())
neg_count = sum(counters["neg"].values())
pos_bin_count = sum(counters_bin["pos"].values())
neg_bin_count = sum(counters_bin["neg"].values())


for word in sentance:
    # Make sure word is known!
    if word in counters["tot"].keys():
        pos_posterior += np.log2( (counters["pos"][word] + 1) / (pos_count + len(counters["tot"])))
        neg_posterior += np.log2( (counters["neg"][word] + 1) / (neg_count + len(counters["tot"])))

        pos_bin_posterior += np.log2( (counters_bin["pos"][word] + 1) / (pos_bin_count + len(counters_bin["tot"])))
        neg_bin_posterior += np.log2( (counters_bin["neg"][word] + 1) / (neg_bin_count + len(counters_bin["tot"])))
        
print("Positive: ", pos_posterior, " Negative: ", neg_posterior)
print("=================")
print("Positive (bin): ", pos_bin_posterior, " Negative: (bin)", neg_bin_posterior)

Positive:  -8.076815597050832  Negative:  -8.872497838366797
Positive (bin):  -7.966385282396622  Negative: (bin) -7.246740598493144


In [5]:
# ====================
# HW - Predict the centiment of the product with NB classifier!
# 
# Using product review dataset for sentiment analysis
# http://people.mpi-inf.mpg.de/~smukherjee/data/
# ====================

def train_nb(all_documents, class_documents, C):
    logprior = {}
    likelihood = {}
    # Make vocabulary of all documents!
    vocabulary = set()
    word_sum = 0
    for review in all_documents:
        for word in review:
            vocabulary.add(word)
            
    # Loop all possible classes
    for c in C:
        N_docs = len(all_documents)
        N_class_docs = len(class_documents[c])
        logprior[c] = np.log2( N_class_docs/ N_docs )
        
        likelihood[c] = {}
        # Calculate how many times a certain word appears with the class
        counts = Counter([])
        for review in class_documents[c]:
            counts += Counter(ngrams(review, 1))
            
        word_sum = sum(counts.values())
        # Loop all the words and calculate log-likelihoods
        for word in vocabulary:
            class_count = counts[(word,)]
            likelihood[c][word] = np.log2((class_count + 1) / (word_sum + len(vocabulary)))
        
    return logprior, likelihood, vocabulary
    
def test_nb(document, logprior, likelihood, C, V):
    total_sums = {}
    # Loop all the classes and calculate sum for all the words
    for c in C:
        total_sums[c] = logprior[c]
        for word in document:
            if word in V:
                total_sums[c] = total_sums[c] + likelihood[c][word]
   
    return total_sums
    
path = "Dataset2.txt"
negative_reviews = []
positive_reviews = []
classes = ["pos", "neg"]

with open(path) as f:
    for line in f.readlines():
        review = line.split('$')
        tokenized = word_tokenize(review[2].lower())
        
        if review[1].strip() == "neg":
            negative_reviews.append(tokenized)
        else:
            positive_reviews.append(tokenized)

# Split the data!
positive_train = positive_reviews[: int(len(positive_reviews) * 0.9)]
negative_train = negative_reviews[: int(len(negative_reviews) * 0.9)]
positive_test = positive_reviews[int(len(positive_reviews) * 0.9) + 1 :]
negative_test = negative_reviews[int(len(negative_reviews) * 0.9) + 1 :]

# I dont even know
all_reviews = negative_train + positive_train
review_dictionary = {}
review_dictionary["neg"] = negative_train
review_dictionary["pos"] = positive_train

# Train the classifier!
(log_priors, log_likelihoods, V) = train_nb(all_reviews, review_dictionary, classes)

# Test the classifier!

correct_positive = 0
for test_item in positive_test:
    likelihoods = test_nb(test_item,  log_priors, log_likelihoods, classes, V)
    outcome = max(likelihoods.items(), key=operator.itemgetter(1))[0]
    if outcome == "pos":
        correct_positive += 1

correct_negative = 0
for test_item in negative_test:
    likelihoods = test_nb(test_item,  log_priors, log_likelihoods, classes, V)
    outcome = max(likelihoods.items(), key=operator.itemgetter(1))[0]
    if outcome == "neg":
        correct_negative += 1

print("positives comments rightly classified:  ", correct_positive / len(positive_test))
print("negative comments rightly classified:  ", correct_negative / len(negative_test))

positives comments rightly classified:   0.7805907172995781
negative comments rightly classified:   0.7534246575342466
