In [1]:
import requests
import string
from collections import defaultdict
import numpy as np
import pandas as pd

In [2]:
master_url = "https://raw.githubusercontent.com/dennybritz/cnn-text-classification-tf/master/data/rt-polaritydata/rt-polarity."
pos_url = master_url + "pos"
neg_url = master_url + "neg"

resp_pos, resp_neg = requests.get(pos_url), requests.get(neg_url)

text_pos = resp_pos.text.strip().split("\n")
text_neg = resp_neg.text.strip().split("\n")

pos_dict = {k: 1 for k in text_pos}
neg_dict = {k: 0 for k in text_neg}

all_reviews = pos_dict | neg_dict

## 1)

In [3]:
# Shuffle data in dict, with seed
np.random.seed(2001)
all_reviews_dict = dict(sorted(all_reviews.items(), key=lambda item: np.random.rand()))

# Split into T/D/Test: 70/15/15
train_reviews = dict(list(all_reviews_dict.items())[:int(len(all_reviews_dict) * 0.7)])
dev_reviews = dict(list(all_reviews_dict.items())[int(len(all_reviews_dict) * 0.7):int(len(all_reviews_dict) * 0.85)])
test_reviews = dict(list(all_reviews_dict.items())[int(len(all_reviews_dict) * 0.85):])

In [4]:
# in format all data: train: 70%, dev: 15%, test: 15%
print(f"Total: {len(all_reviews_dict)} | Train: {len(train_reviews)}, Dev: {len(dev_reviews)}, Test: {len(test_reviews)}")

Total: 10662 | Train: 7463, Dev: 1599, Test: 1600


## 2)

In [5]:
stop_words = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't",
    "com", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down",
    "during", "each", "few", "for", "from", "further", "had", "hadn't", "has",
    "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her",
    "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it",
    "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my",
    "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "that", "the", "this", "to", "was", "what", "when",
    "where", "who", "will", "with", "you"
]


In [6]:
def tokenize(text):
    tokens = [x for x in text.split() if x not in string.punctuation and len(x) > 1]
    for punct in string.punctuation:
        tokens = [token.replace(punct, '') for token in tokens]
    return tokens

In [7]:
class NaiveBayesClassifier:
    def __init__(self, train_data):
        self.train_reviews = train_data
        self.train_reviews_tokenized = [tokenize(review) for review in self.train_reviews.keys()]
        self.vocabulary = list(set(word for review in self.train_reviews_tokenized for word in review))
    
        self.all_train_pos_words = []
        self.all_train_neg_words = []
        for review, label in self.train_reviews.items():
            if label == 1: self.all_train_pos_words.extend(tokenize(review))
            elif label == 0: self.all_train_neg_words.extend(tokenize(review))

        self.pos_freq_dict = defaultdict(int)
        self.neg_freq_dict = defaultdict(int)
        for word in self.all_train_pos_words:
            self.pos_freq_dict[word] += 1
        for word in self.all_train_neg_words:
            self.neg_freq_dict[word] += 1

        # Remove stop words from frequency dictionaries
        for word in stop_words:
            if word in self.pos_freq_dict:
                del self.pos_freq_dict[word]
            if word in self.neg_freq_dict:
                del self.neg_freq_dict[word]

        # self.stop_word_n = 5
        # for word, _ in sorted(self.pos_freq_dict.items(), key=lambda item: item[1], reverse=True)[:self.stop_word_n]:
        #     del self.pos_freq_dict[word]
        # for word, _ in sorted(self.neg_freq_dict.items(), key=lambda item: item[1], reverse=True)[:self.stop_word_n]:
        #     del self.neg_freq_dict[word]

    def calculate_priors(self, reviews):
        positive_prior = sum(reviews.values()) / len(reviews)
        negative_prior = 1 - positive_prior
        return np.log(positive_prior), np.log(negative_prior)

    def calculate_conditional_probabilities(self, word, label, alpha=1):
        vocab_size = len(self.vocabulary)

        num = self.pos_freq_dict[word] + alpha if label == 1 else self.neg_freq_dict[word] + alpha
        denom = len(self.pos_freq_dict) + alpha * vocab_size if label == 1 else len(self.neg_freq_dict) + alpha * vocab_size
        
        return np.log(num / denom)

    def estimate_label_posterior(self, review, label, alpha):
        tokens = tokenize(review)
    
        pos_prior, neg_prior = self.calculate_priors(train_reviews)
        prior = pos_prior if label == 1 else neg_prior

        result = prior
        for token in tokens:
            result += self.calculate_conditional_probabilities(token, label, alpha=alpha)
    
        return result

    def naive_bayes_prediction(self, review, alpha):
        pos_posterior = self.estimate_label_posterior(review, 1, alpha=alpha)
        neg_posterior = self.estimate_label_posterior(review, 0, alpha=alpha)
        result = np.argmax([neg_posterior, pos_posterior])
        return result, (neg_posterior, pos_posterior)
        
    def prediction_accuracy(self, predictions, actuals):
        accuracy = sum(1 for pred, actual in zip(predictions, actuals) if pred == actual) / len(actuals)
        return accuracy

    def evaluate_on_dataset(self, reviews, alpha):
        predictions = []
        for review in reviews.keys():
            pred_label, (neg_post, pos_post) = self.naive_bayes_prediction(review, alpha=alpha)
            predictions.append((int(pred_label), neg_post, pos_post))
        actuals = list(reviews.values())
        accuracy = self.prediction_accuracy([pred for pred, _, _ in predictions], actuals)
        return accuracy

In [8]:
alpha = 1
naive_bayes_classifier = NaiveBayesClassifier(train_data = train_reviews)

accuracy_on_train = naive_bayes_classifier.evaluate_on_dataset(train_reviews, alpha=alpha)
accuracy_on_dev = naive_bayes_classifier.evaluate_on_dataset(dev_reviews, alpha=alpha)

print(f"Naive Bayes Accuracy on Train: {accuracy_on_train:.4f}")
print(f"Naive Bayes Accuracy on Dev: {accuracy_on_dev:.4f}")

Naive Bayes Accuracy on Train: 0.9456
Naive Bayes Accuracy on Dev: 0.7498


## 3)

In [9]:
naive_bayes_classifier_train_dev = NaiveBayesClassifier(train_data = train_reviews | dev_reviews)
accuracy_on_train = naive_bayes_classifier_train_dev.evaluate_on_dataset(train_reviews | dev_reviews, alpha=alpha)
accuracy_on_test = naive_bayes_classifier_train_dev.evaluate_on_dataset(test_reviews, alpha=alpha)
print(f"Naive Bayes Accuracy on Train & Dev: {accuracy_on_train:.4f}")
print(f"Naive Bayes Accuracy on Test: {accuracy_on_test:.4f}")

Naive Bayes Accuracy on Train & Dev: 0.9385
Naive Bayes Accuracy on Test: 0.7788


## 4)

In [10]:
posterior_list = []
for test_review, test_label in test_reviews.items():
    pred_label, (neg_post, pos_post) = naive_bayes_classifier_train_dev.naive_bayes_prediction(test_review, alpha=alpha)
    posterior_list.append((test_review, test_label, pred_label, pos_post, neg_post))
posterior_df = pd.DataFrame(posterior_list, columns=["review", "actual_label", "predicted_label", "pos_posterior", "neg_posterior"])

In [11]:
# Certain
posterior_df["diff"] = posterior_df["pos_posterior"] - posterior_df["neg_posterior"]
certain_df = posterior_df.sort_values(by="diff", ascending=False)
certain_df.tail(10)

Unnamed: 0,review,actual_label,predicted_label,pos_posterior,neg_posterior,diff
225,the redeeming feature of chan's films has alwa...,0,0,-215.712143,-205.291209,-10.420934
331,"at every opportunity to do something clever , ...",0,0,-390.176554,-379.732368,-10.444186
319,though perry and hurley make inspiring efforts...,0,0,-314.030161,-303.548814,-10.481347
238,"cherry orchard is badly edited , often awkward...",0,0,-247.273699,-236.279844,-10.993855
1413,the plot is nothing but boilerplate clichés fr...,0,0,-401.303933,-389.960881,-11.343052
993,the nicest thing that can be said about steali...,0,0,-305.104829,-293.019525,-12.085304
1417,some movies can get by without being funny sim...,0,0,-344.195596,-331.26873,-12.926866
1579,"swimfan , like fatal attraction , eventually g...",0,0,-187.729984,-174.428432,-13.301552
926,it's just too bad the screenwriters eventually...,0,0,-273.238084,-259.536443,-13.701641
1106,in addition to sporting one of the worst title...,0,0,-296.355275,-277.063616,-19.291659


In [12]:
# Uncertain
pd.set_option("display.max_colwidth", None)
posterior_df["abs_diff"] = posterior_df["diff"].abs()
uncertain_df = posterior_df.sort_values(by="abs_diff", ascending=True)
uncertain_df.head(20)

Unnamed: 0,review,actual_label,predicted_label,pos_posterior,neg_posterior,diff,abs_diff
912,"wilco fans will have a great time , and the movie should win the band a few new converts , too .",1,0,-143.165207,-143.164496,-0.000711,0.000711
145,a workshop mentality prevails .,0,0,-31.057775,-31.051611,-0.006164,0.006164
754,topkapi this is not .,0,0,-43.026681,-43.020518,-0.006164,0.006164
1575,"but the power of these [subjects] is obscured by the majority of the film that shows a stationary camera on a subject that could be mistaken for giving a public oration , rather than contributing to a film's narrative .",0,0,-311.076654,-311.070217,-0.006437,0.006437
917,"a perceptive , good-natured movie .",1,1,-23.211576,-23.21811,0.006534,0.006534
1272,watching the film is like reading a times portrait of grief that keeps shifting focus to the journalist who wrote it .,0,0,-178.321244,-178.313374,-0.00787,0.00787
1268,"freundlich's made [crudup] a suburban architect , and a cipher .",0,0,-66.640636,-66.630957,-0.009679,0.009679
126,"despite the evocative aesthetics evincing the hollow state of modern love life , the film never percolates beyond a monotonous whine .",0,0,-165.925927,-165.914344,-0.011583,0.011583
1090,no film could possibly be more contemptuous of the single female population .,0,1,-114.143245,-114.160499,0.017254,0.017254
746,"expect to be reminded of other , better films , especially seven , which director william malone slavishly copies .",0,0,-141.476367,-141.459062,-0.017304,0.017304


## 5)

In [14]:
# # find features (words) with highest conditional probabilities for pos and neg classes
pos_words = list(naive_bayes_classifier_train_dev.pos_freq_dict.keys())
all_pos_conditional_probs = {word: naive_bayes_classifier_train_dev.calculate_conditional_probabilities(word, 1, alpha=alpha) for word in pos_words}
sorted_pos_conditional_probs = dict(sorted(all_pos_conditional_probs.items(), key=lambda item: item[1], reverse=True))
print("Top 10 words with highest conditional probabilities for positive class:")
for word, prob in list(sorted_pos_conditional_probs.items()):
    print(f"{word}: {prob:.6f}")

# neg_words = list(naive_bayes_classifier_train_dev.neg_freq_dict.keys())
# all_neg_conditional_probs = {word: naive_bayes_classifier_train_dev.calculate_conditional_probabilities(word, 0, alpha=alpha) for word in neg_words}
# sorted_neg_conditional_probs = dict(sorted(all_neg_conditional_probs.items(), key=lambda item: item[1], reverse=True))
# print("\nTop 10 words with highest conditional probabilities for negative class:")
# for word, prob in list(sorted_neg_conditional_probs.items()):
#     print(f"{word}: {prob:.6f}")

Top 10 words with highest conditional probabilities for positive class:
film: -4.022997
movie: -4.528173
one: -4.833219
: -4.940705
like: -5.106148
than: -5.135874
story: -5.175440
so: -5.406462
good: -5.464618
comedy: -5.476666
funny: -5.532756
their: -5.539187
films: -5.552174
out: -5.558731
can: -5.578666
even: -5.585400
some: -5.619767
way: -5.669957
us: -5.684772
up: -5.684772
best: -5.692263
make: -5.699810
just: -5.699810
characters: -5.699810
love: -5.722799
time: -5.730582
much: -5.730582
life: -5.738425
makes: -5.762330
may: -5.778591
work: -5.786821
movies: -5.803488
very: -5.803488
performances: -5.829022
your: -5.829022
while: -5.864113
little: -5.873082
enough: -5.882132
director: -5.919173
there: -5.938221
well: -5.957639
still: -5.977442
great: -6.007901
look: -6.007901
new: -6.028735
those: -6.039317
fun: -6.039317
drama: -6.039317
never: -6.050012
they: -6.093976
we: -6.093976
which: -6.105275
see: -6.163771
through: -6.163771
thats: -6.163771
performance: -6.188163
e