In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read CSV
fresh_rotten_data_set = pd.read_csv('./rt_reviews.csv',names=['Label', 'Review'], encoding="windows_1258", header=0, nrows = 40000)
print("Examples of the data samples \n", fresh_rotten_data_set.head(3), "\n")
print("Dimension of the data set:\n", fresh_rotten_data_set.shape, "\n")
print("Distribution of the data set:\n", fresh_rotten_data_set['Label'].value_counts(normalize=True), "\n")

Examples of the data samples 
     Label                                             Review
0   fresh   Manakamana doesn't answer any questions, yet ...
1   fresh   Wilfully offensive and powered by a chest-thu...
2  rotten   It would be difficult to imagine material mor... 

Dimension of the data set:
 (40000, 2) 

Distribution of the data set:
 Label
fresh     0.500175
rotten    0.499825
Name: proportion, dtype: float64 



In [3]:
# Perform train/test split
review_texts, labels = fresh_rotten_data_set.Review, fresh_rotten_data_set.Label
from sklearn.model_selection import train_test_split
review_texts_train, review_texts_test, labels_train, labels_test = train_test_split(review_texts, labels, test_size=0.2, random_state=123)

review_texts_train = review_texts_train.reset_index(drop=True)
labels_train = labels_train.reset_index(drop=True)

review_texts_test = review_texts_test.reset_index(drop=True)
labels_test = labels_test.reset_index(drop=True)

print("Distribtuion of the training data set:\n", labels_train.value_counts(normalize=True),  labels_train.shape[0], "\n")

print("Distribtuion of the testing data set:\n", labels_test.value_counts(normalize=True), labels_test.shape[0], "\n")


Distribtuion of the training data set:
 Label
fresh     0.500062
rotten    0.499937
Name: proportion, dtype: float64 32000 

Distribtuion of the testing data set:
 Label
fresh     0.500625
rotten    0.499375
Name: proportion, dtype: float64 8000 



In [4]:
def reviewTextToVectors(review_texts):
    review_texts = review_texts.str.replace('\W', '')
    review_texts = review_texts.str.lower()
    review_texts = review_texts.str.split()

    vocabulary = []
    for review in review_texts:
        for word in review:
            vocabulary.append(word)
    
    # list -> set for get only unique value
    vocabulary = list(set(vocabulary))

    
    word_counts_per_review = {}
    # i = 0
    for unique_word in vocabulary:
        word_counts_per_review[unique_word] = [0] * len(review_texts)
        # i += 1
        # if i % 1000 == 0:
        #     print(i)
    print("finish----------------------------------------------------------------", len(review_texts) * len(review_texts[0]))
    
    for index, review in enumerate(review_texts):
        for word in review:
            word_counts_per_review[word][index] += 1

    return word_counts_per_review, vocabulary


    # word_counts_per_review = {unique_word: [0] * len(review_texts) for unique_word in vocabulary}
    # print(type(word_counts_per_review))

    # print(word_counts_per_review)

    # return word_counts_per_review



word_counts_per_review, vocabulary = reviewTextToVectors(review_texts_train)
x_train = pd.DataFrame(word_counts_per_review)
print("Features (the number of all possible words in the trainning data):\n", len(vocabulary), "\n")


training_data_set = pd.concat([labels_train, review_texts_train, x_train], axis=1)
print("Examples of the training data \n", training_data_set.head(3), "\n")

finish---------------------------------------------------------------- 352000
Features (the number of all possible words in the trainning data):
 66084 

Examples of the training data 
     Label                                             Review  sea"  \
0  rotten   This interminable farrago feels like swimming...     0   
1  rotten   [Ferrell's] humor is cut off at the knees by ...     0   
2   fresh   The IRA-flavored Death Wish remake we didn't ...     0   

   underplays  depp  preferable.  lows;  infinite,  lady  slums,  ...  \
0           0     0            0      0          0     0       0  ...   
1           0     0            0      0          0     0       0  ...   
2           0     0            0      0          0     0       0  ...   

   english,  off-camera.  arouses  plotnick's  strangeness,  hurry,  relieved  \
0         0            0        0           0             0       0         0   
1         0            0        0           0             0       0         0 

In [5]:
print("Features (the number of all possible words in the trainning data):\n", len(vocabulary), "\n")


# training_data_set = pd.concat([labels_train, review_texts_train, x_train], axis=1)
print("Examples of the training data \n", training_data_set.head(3), "\n")

Features (the number of all possible words in the trainning data):
 66084 

Examples of the training data 
     Label                                             Review  sea"  \
0  rotten   This interminable farrago feels like swimming...     0   
1  rotten   [Ferrell's] humor is cut off at the knees by ...     0   
2   fresh   The IRA-flavored Death Wish remake we didn't ...     0   

   underplays  depp  preferable.  lows;  infinite,  lady  slums,  ...  \
0           0     0            0      0          0     0       0  ...   
1           0     0            0      0          0     0       0  ...   
2           0     0            0      0          0     0       0  ...   

   english,  off-camera.  arouses  plotnick's  strangeness,  hurry,  relieved  \
0         0            0        0           0             0       0         0   
1         0            0        0           0             0       0         0   
2         0            0        0           0             0       0        

In [6]:
x_train_rotten = x_train[labels_train == 'rotten']
x_train_fresh = x_train[labels_train == 'fresh']

#Estimate P(y=rotten) and P(y=fresh)
p_rotten = len(x_train_rotten)/len(x_train)
print("Our estimate of P(y=rotten) is ", p_rotten)

p_fresh = len(x_train_fresh)/len(x_train)
print("Our estimate of P(y=fresh) is ", p_fresh)


Our estimate of P(y=rotten) is  0.4999375
Our estimate of P(y=fresh) is  0.5000625


In [7]:
#Initiate parameters
theta_rotten = {unique_word:0 for unique_word in vocabulary}
theta_fresh =  {unique_word:0 for unique_word in vocabulary}

#Estimate the probability distribution of selecting each word
rotten_word_count = np.sum(x_train_rotten.values)
fresh_word_count = np.sum(x_train_fresh.values)
for word in vocabulary:
    theta_rotten[word] = (sum(x_train_rotten[word]) + 1) / (rotten_word_count +1)
    theta_fresh[word] = (sum(x_train_fresh[word]) +1) / (fresh_word_count +1)

In [8]:
import re, math

In [9]:
def textToVector(message):
    message = re.sub('\W', ' ', message) #Remove punctuation (comment: re.sub() is like a str.replace())
    message = message.lower().split()

    vocabulary = []
    for word in message:
        vocabulary.append(word)
    vocabulary = list(set(vocabulary))

    word_counts = {unique_word: 0 for unique_word in vocabulary}

    for word in message:
        word_counts[word] += 1

    return word_counts, vocabulary

In [10]:
import math

In [22]:
def naive_bayes_classify(review_texts):
    x_test, vocabulary = textToVector(review_texts)

    p_rotten_given_review = p_rotten
    p_fresh_given_review = p_fresh

    for word in x_test:
        if word in theta_rotten:
            p_rotten_given_review *= theta_rotten[word]
        if word in theta_fresh:
            p_fresh_given_review *= theta_fresh[word]


    p_rotten_given_review = math.log(p_rotten_given_review)
    p_fresh_given_review =  math.log(p_fresh_given_review)
    # print('Estimate of log(P(rotten|message=',  review_texts, ')) =', p_rotten_given_review)
    # print('Estimate of log(P(fresh|message=',  review_texts, ')) =', p_fresh_given_review)

    isRotten = True
    if(p_rotten_given_review > p_fresh_given_review):
        isRotten = True
    else:
        isRotten = False
    return isRotten

In [23]:
def score(review_texts, labels):
    mistakes = 0
    for i, message in enumerate(review_texts):
        isRotten = naive_bayes_classify(message)
        if isRotten and labels[i] != "rotten":
            mistakes += 1
        elif not isRotten and labels[i] == "rotten":
            mistakes += 1
    return (len(review_texts)-mistakes)/len(review_texts)

In [24]:
#Calculate loss on training data
print("Training accuracy:", score(review_texts_train, labels_train))
#Calculate generalization loss
print("Generalization accuracy:", score(review_texts_test, labels_test))

Training accuracy: 0.85703125
Generalization accuracy: 0.758625


In [25]:
def predict(review):
    if naive_bayes_classify(review) == True:
        print("rotten: ",review)
    else:
        print("fresh: ",review)
    return

In [26]:
predict("Godzilla X Kong: The New Empire is a fun, albeit flawed, addition to the Monsterverse.")
predict("Dumbed-down and stripped of the symbolic subtext of the earlier movies, the picture is not without seat-shuddering thrills, but it’s like a tag-team wrestling bout for monsters rather than a picture with meaning and even a modicum of thought.")


fresh:  Godzilla X Kong: The New Empire is a fun, albeit flawed, addition to the Monsterverse.
rotten:  Dumbed-down and stripped of the symbolic subtext of the earlier movies, the picture is not without seat-shuddering thrills, but it’s like a tag-team wrestling bout for monsters rather than a picture with meaning and even a modicum of thought.
