# 1: Introduction

In [4]:
days = [["ran", "was tired"], ["ran", "was not tired"], ["didn't run", "was tired"], ["ran", "was tired"], 
        ["ran", "was tired"], ["didn't run", "was not tired"], ["ran", "was not tired"], ["ran", "was tired"], 
        ["ran", "was tired"]]

In [6]:
# P(A)
prob_tired = len([d for d in days if d[1] == "was tired"]) / len(days)

In [7]:
# P(B)
prob_ran = len([d for d in days if d[0] == "ran"]) / len(days)

In [8]:
# P(B|A)
prob_ran_given_tired = len([d for d in days if d[0] == "ran" and d[1] == "was tired"]) / len([d for d in days if d[1] == "was tired"])

In [15]:
# Calculate P(A|B)= (P(B|A)*P(A))/P(B)
prob_tired_given_ran = (prob_ran_given_tired * prob_tired) / prob_ran
print("Probability of being tired given that you ran: {0}".format(prob_tired_given_ran))

Probability of being tired given that you ran: 0.5247813411078716


# 2: Overview Of Naive Bayes

In [18]:
days = [["ran", "was tired", "woke up early"], ["ran", "was not tired", "didn't wake up early"],
        ["didn't run", "was tired", "woke up early"], ["ran", "was tired", "didn't wake up early"], 
        ["ran", "was tired", "woke up early"], ["didn't run", "was not tired", "didn't wake up early"], 
        ["ran", "was not tired", "woke up early"], ["ran", "was tired", "woke up early",], 
        ["ran", "was tired", "woke up early"]]

In [19]:
# We are trying to predict whether one was tired on a new day...
new_day = ["ran", "didn't wake up early"]

In [24]:
def calc_y_probability(y_label, days):
    return len([d for d in days if d[1] == y_label]) / len(days)

def calc_ran_probability_given_y(ran_label, y_label, days):
    return len([d for d in days if d[1] == y_label and d[0] == ran_label]) / len(days)

def calc_woke_early_probability_given_y(woke_label, y_label, days):
    return len([d for d in days if d[1] == y_label and d[2] == woke_label]) / len(days)

denominator = len([d for d in days if d[0] == new_day[0] and d[2] == new_day[1]]) / len(days)

prob_tired = (calc_y_probability("was tired", days) * calc_ran_probability_given_y(new_day[0], "was tired", days) *
             calc_woke_early_probability_given_y(new_day[1], "was tired", days)) / denominator

prob_not_tired = (calc_y_probability("was not tired", days) *
                 calc_ran_probability_given_y(new_day[0], "was not tired", days) * 
                 calc_woke_early_probability_given_y(new_day[1], "was not tired", days)) / denominator

classification = "was tired"
if prob_not_tired > prob_tired:
    classification = "was not tired"

print("Final classification for new day: {0}. Tired probability: {1}. Not tired probability: {2}.".format(classification,
                                                                                                          prob_ran, prob_not_tired))               

Final classification for new day: was tired. Tired probability: 0.7777777777777778. Not tired probability: 0.07407407407407407.


# 3: Finding Word Counts

In [26]:
from collections import Counter
import csv
import re

with open("trainnaivebayes.csv", 'r') as file:
    reviews = list(csv.reader(file))
    
def get_text(reviews, score):
    return " ".join([r[0].lower() for r in reviews if r[1] == str(score)])

def count_text(text):
    words = re.split("\s+", text)
    
negative_text = get_text(reviews, -1)
positive_text = get_text(reviews, 1)

negative_counts = count_text(negative_text)
positive_counts = count_text(positive_text)

print("Negative text sample: {0}.".format(negative_text[:100]))
print("Positive text sample: {0}.".format(positive_text[:100]))

Negative text sample: plot : two teen couples go to a church party drink and then drive . they get into an accident . one .
Positive text sample: films adapted from comic books have had plenty of success whether they're about superheroes ( batman.


# 4: Making Predictions About Review Classifications

In [40]:
import re 
from collections import Counter

def get_y_count(score):
    return len([r for r in reviews if r[1] == str(score)])

positive_review_count = get_y_count(1)
negative_review_count = get_y_count(-1)

prob_positive = positive_review_count / len(reviews)
prob_negative = negative_review_count / len(reviews)

def make_class_prediction(text, counts, class_prob, class_count):
    prediction = 1
    text_counts = Counter(re.split("\s+", text))
    for word in text_counts:
        prediction *= text_counts.get(word) * ((counts.get(word, 0) + 1 ) / (sum(counts.values()) + class_count))
        return prediction * class_prob
    
print("Review: {0}".format(reviews[0][0]))
print("Negative prediction: {0}".format(make_class_prediction(reviews[0][0], negative_counts, prob_negative, negative_review_count)))
print("Positive prediction: {0}".format(make_class_prediction(reviews[0][0], positive_counts, prob_positive, positive_review_count)))       

Review: plot : two teen couples go to a church party drink and then drive . they get into an accident . one of the guys dies but his girlfriend continues to see him in her life and has nightmares . what's the deal ? watch the movie and " sorta " find out . . . critique : a mind-fuck movie for the teen generation that touches on a very cool idea but presents it in a very bad package . which is what makes this review an even harder one to write since i generally applaud films which attempt


AttributeError: 'NoneType' object has no attribute 'get'

# 5: Predicting The Test Set

In [36]:
import csv 

def make_decision(text, make_class_prediction):
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_review_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_review_count)
    
    if negative_prediction > positive_prediction:
        return -1
    return 1

with open("test.csv", 'r') as file:
    test = list(csv.reader(file))
    
pridictions = [make_decision(r[0], make_class_prediction) for r in test]

AttributeError: 'NoneType' object has no attribute 'get'

# 6: Computing Prediction Error

In [42]:
actual = [int(r[1]) for r in test]
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)
print("AUC of the predictions: {0}".format(metrics.auc(fpr, tpr)))

ValueError: invalid literal for int() with base 10: 'season'

# 7: A Faster Way To Make Predictions

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

vectorizer = CountVectorizer(stop_words='english', max_df=.05)
train_features = vectorizer.fit_transform([r[0] for r in reviews])
test_features = vectorizer.transform([r[0] for r in reviews])

nb = MultinomialNB()
nb.fit(train_features, [int(r[1]) for r in reviews])

predictions = nb.predict(test_features)

fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)

print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr)))

NameError: name 'actual' is not defined