In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
import string
import sys

In [2]:
movie_reviews_data_folder = './txt_sentoken/'
dataset = load_files(movie_reviews_data_folder, shuffle=False)

docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# What is the probability that a review is positive/negative, given that a word is in it?

P(positive | 'word' ) = ( P(positive) * P( 'word' | positive ) ) / P('word')

In [3]:
stop = list(ENGLISH_STOP_WORDS)

def tf(file):
    file = str(file)
    file = file.split()
    doc_dict = set()
    for word in file:
        word = word.lower()

        for c in string.punctuation:
            word = word.replace(c,"")
    
        if word not in stop:
            if word not in doc_dict:
                doc_dict.add(word)
    
    return doc_dict

In [4]:
# create dictionary that counds occurance of every word in all reviews
word_count = {}
for x in docs_train:
    dic = tf(x)
    for word in dic:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
            
        
# create list of positive and negative reviews        
pos_reviews = []
neg_reviews = []
for review_idx in range(len(docs_train)):
    if y_train[review_idx] == 1:
        pos_reviews.append(docs_train[review_idx])
    else:
        neg_reviews.append(docs_train[review_idx])
     
    
    
# create dictionary that counds occurance of every word in all positive reviews
pos_word_count = {}
for x in pos_reviews:
    dic = tf(x)
    for word in dic:
        if word in pos_word_count:
            pos_word_count[word] += 1
        else:
            pos_word_count[word] = 1
        
# create dictionary that counds occurance of every word in all negative reviews
neg_word_count = {}
for x in neg_reviews:
    dic = tf(x)
    for word in dic:
        if word in neg_word_count:
            neg_word_count[word] += 1
        else:
            neg_word_count[word] = 1
            
# find total amount of words in all reviews
word_total = 0
for word in word_count:
    word_total += word_count[word]
    
# find total amount of words in all positive reviews
pos_word_total = 0
for word in pos_word_count:
    pos_word_total += pos_word_count[word]
    
# find total amount of words in all negative reviews
neg_word_total = 0
for word in neg_word_count:
    neg_word_total += neg_word_count[word]

In [5]:
def bayes(word):
    p_pos_review = 768/1500
    p_neg_review = 732/1500
    
    if word in word_count:
        p_word = word_count[word]/word_total
    else:
        print("Word not used in reviews, can not define probabilities")
        return(None,None)
    
    p_word_pos = pos_word_count[word]/pos_word_total
    p_word_neg = neg_word_count[word]/neg_word_total
    
    p_pos_word = (p_pos_review*p_word_pos)/p_word
    p_neg_word = (p_neg_review*p_word_neg)/p_word
    
    print("P(Positive | '"+word+"') = ", p_pos_word)
    print("P(Negative | '"+word+"') = ", p_neg_word)
    print()
    
    

In [6]:
bayes("awesome")
bayes("fun")
bayes("boring")

P(Positive | 'awesome') =  0.6811412615393257
P(Negative | 'awesome') =  0.3185139258469155

P(Positive | 'fun') =  0.5126484231585451
P(Negative | 'fun') =  0.487350254961258

P(Positive | 'boring') =  0.237568781463716
P(Negative | 'boring') =  0.7629906760270188



# What is the probability of a review being positive/negative?

In [7]:
def bayes2(word):
    p_pos_review = 768/1500
    p_neg_review = 732/1500
    
    if word in word_count:
        p_word = word_count[word]/word_total
    else:
        return(None,None)
    
    p_word_pos = pos_word_count[word]/pos_word_total
    p_word_neg = neg_word_count[word]/neg_word_total
    
    p_pos_word = (p_pos_review*p_word_pos)/p_word
    p_neg_word = (p_neg_review*p_word_neg)/p_word
    
    return(p_pos_word,p_neg_word)

In [8]:
def find_type_probability(review):
    review = str(review)
    review = review.split()
    p_pos = []
    p_neg = []
    for word in review:
        pos, neg = bayes2(word)
        if pos != None:
            p_pos.append(pos)
            p_neg.append(neg)
        
    p = 1
    pnot = 1
    for x in p_pos:
        p = p*x
        pnot = pnot*(1-x)
    
    prob_pos = p/(p+pnot)
    
    
    p = 1
    pnot = 1
    for x in p_neg:
        p = p*x
        pnot = pnot*(1-x)
    
    prob_neg = p/(p+pnot)
    
    print("P(Positive) = ",prob_pos)
    print("P(Negative) = ",prob_neg)
    print()

    
    

In [9]:
find_type_probability("That was a really great movie. I throughly enjoyed watching that movie")
find_type_probability("That was a really bad movie. I throughly hated watching that movie")

P(Positive) =  0.6109560334184939
P(Negative) =  0.3889213167109818

P(Positive) =  0.28501707183563396
P(Negative) =  0.7154992430474458

