In [70]:
import pandas as pd
import numpy as np
import string
import re
from collections import defaultdict

In [71]:
reviews = pd.read_csv('rt_reviews.csv', encoding='latin1')
print(reviews)

       Freshness                                             Review
0          fresh   Manakamana doesn't answer any questions, yet ...
1          fresh   Wilfully offensive and powered by a chest-thu...
2         rotten   It would be difficult to imagine material mor...
3         rotten   Despite the gusto its star brings to the role...
4         rotten   If there was a good idea at the core of this ...
...          ...                                                ...
479995    rotten   Zemeckis seems unable to admit that the motio...
479996     fresh   Movies like The Kids Are All Right -- beautif...
479997    rotten   Film-savvy audiences soon will catch onto Win...
479998     fresh                        An odd yet enjoyable film. 
479999     fresh   No other animation studio, even our beloved P...

[480000 rows x 2 columns]


In [99]:
train_size = int(len(reviews) * 0.6)
dev_size = int(len(reviews) * 0.2)
test_size = len(reviews) - train_size - dev_size

train_data = reviews[:train_size]
dev_data = reviews[train_size:train_size+dev_size]
test_data = reviews[train_size+dev_size:]
print(train_data)
print(dev_data)
print(test_data)

       Freshness                                             Review
0          fresh   Manakamana doesn't answer any questions, yet ...
1          fresh   Wilfully offensive and powered by a chest-thu...
2         rotten   It would be difficult to imagine material mor...
3         rotten   Despite the gusto its star brings to the role...
4         rotten   If there was a good idea at the core of this ...
...          ...                                                ...
287995     fresh   [T]here's some remarkable psychological insig...
287996     fresh   It's a sports film nonsports fans can love; i...
287997     fresh                          A solid, well-acted tale.
287998     fresh   Not a film for cynics, but a creation of shin...
287999     fresh   Simply put, if you like films about boxing, w...

[288000 rows x 2 columns]
       Freshness                                             Review
288000    rotten   A tepid attempt at making an alien abduction ...
288001     fresh   a 

In [100]:
print(train_data.columns)
print(dev_data.columns)
print(test_data.columns)

Index(['Freshness', 'Review'], dtype='object')
Index(['Freshness', 'Review'], dtype='object')
Index(['Freshness', 'Review'], dtype='object')


In [74]:
print(train_data.head())
print(dev_data.head())
print(test_data.head())

  Freshness                                             Review
0     fresh   Manakamana doesn't answer any questions, yet ...
1     fresh   Wilfully offensive and powered by a chest-thu...
2    rotten   It would be difficult to imagine material mor...
3    rotten   Despite the gusto its star brings to the role...
4    rotten   If there was a good idea at the core of this ...
       Freshness                                             Review
336000     fresh   Ultimate X is a ride, basically the kind of g...
336001    rotten   Viewers will be mystified by the existence of...
336002     fresh   The story might be familiar but the setting i...
336003    rotten   A pleasant time-waster with non-abysmal perfo...
336004     fresh   The difficulty and the key lies in taking a l...
       Freshness                                             Review
408000    rotten   It's execrable; a fist-chewing embarrassment ...
408001     fresh   This deceptively simple documentary by Jean-F...
408002    

In [75]:
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', str(text).lower())
    text = ' '.join([word for word in text.split()])
    return text

train_data['text'] = train_data['Review'].apply(preprocess)
dev_data['text'] = dev_data['Review'].apply(preprocess)
test_data['text'] = test_data['Review'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['text'] = train_data['Review'].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_data['text'] = dev_data['Review'].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['text'] = test_data['Review'].apply(preprocess)


In [76]:
train_data_copy = train_data.copy()
train_data_copy['text'] = train_data_copy['Review'].apply(preprocess)
print(train_data_copy['text']);

dev_data_copy = dev_data.copy()
dev_data_copy['text'] = dev_data_copy['Review'].apply(preprocess)

test_data_copy = test_data.copy()
test_data_copy['text'] = test_data_copy['Review'].apply(preprocess)


0         manakamana doesnt answer any questions yet mak...
1         wilfully offensive and powered by a chestthump...
2         it would be difficult to imagine material more...
3         despite the gusto its star brings to the role ...
4         if there was a good idea at the core of this f...
                                ...                        
335995    the main problem is that this bostonset flick ...
335996    all in all the ï½ï½ï½150minute mr turner is an...
335997    the family stone introduces a fun and agreeabl...
335998    the paintbynumbers portrayals of 1960s iconic ...
335999    as aroonpheng slowly but surely breaks our hea...
Name: text, Length: 336000, dtype: object


In [77]:
vocabulary = []
for text in train_data['text']:
    for word in text.split():
        if word not in vocabulary:
            vocabulary.append(word)

In [78]:
word_index = {}
for i, word in enumerate(vocabulary):
    word_index[word] = i

In [79]:
doc_count = defaultdict(int)
sent_count = defaultdict(lambda: defaultdict(int))

for i, row in train_data.iterrows():
    sentiment = row['Freshness'].strip().lower()  # normalize the sentiment value
    doc_count_total = 0
    for word in row['text'].split():
        doc_count[word] += 1
        sent_count[sentiment][word] += 1
        doc_count_total += 1
    # update the sentiment count
    sent_count[sentiment]['__total__'] += 1

# calculate the probability of each word
prob_word = {}
for word, count in doc_count.items():
    if count >= 5:
        prob_word[word] = count / len(train_data)


In [18]:
# calculate the conditional probability of each word given each sentiment
cond_prob = {}
for word in prob_word.keys():
    cond_prob[word] = {}
    for sentiment in sent_count.keys():
        num = sent_count[sentiment][word] + 1
        denom = sent_count[sentiment]['__total__'] + len(prob_word)
        cond_prob[word][sentiment] = num / denom


In [101]:
sentiment = row['Freshness']

print('Number of all the documents containing THE', cond_prob['the'][sentiment])

print('Number of documents containing word THE in positive documents', cond_prob['the']['fresh'])

Number of all the documents containing THE 0.9294209566802674
Number of documents containing word THE in positive documents 0.9294209566802674


In [96]:
def classify(text):
    # calculate the log probability of the text being fresh and rotten
    log_prob_fresh = np.log(sent_count['Fresh']['__total__'] / len(train_data))
    log_prob_rotten = np.log(sent_count['Rotten']['__total__'] / len(train_data))
    
    for word in text.split():
        if word in cond_prob:
            log_prob_fresh += np.log(cond_prob[word]['Fresh'])
            log_prob_rotten += np.log(cond_prob[word]['Rotten'])
    
    if log_prob_fresh > log_prob_rotten:
        return 'Fresh'
    else:
        return 'Rotten'


In [97]:
# count the number of documents containing each word
doc_count = defaultdict(int)
# count the number of documents containing each word for each sentiment
sent_count = {'fresh': defaultdict(int), 'rotten': defaultdict(int)}

for i, row in train_data.iterrows():
    sentiment = row['Freshness'].lower()
    doc_count_total = 0
    for word in row['text'].split():
        doc_count[word] += 1
        sent_count[sentiment][word] += 1
        doc_count_total += 1
    # update the sentiment count
    sent_count[sentiment]['__total__'] += 1

# calculate the probability of each word
prob_word = {}
for word, count in doc_count.items():
    if count >= 5:
        prob_word[word] = count / len(train_data)

# calculate the conditional probability of each word given each sentiment
cond_prob = {}
for word, count in prob_word.items():
    cond_prob[word] = {'fresh': 0, 'rotten': 0}
    for sentiment in sent_count:
        cond_prob[word][sentiment] = (sent_count[sentiment][word] + 1) / (sent_count[sentiment]['__total__'] + len(prob_word))

def classify(text, cond_prob):
    # calculate the log probability of the text being fresh and rotten
    log_prob_fresh = np.log(sent_count['fresh']['__total__'] / len(train_data))
    log_prob_rotten = np.log(sent_count['rotten']['__total__'] / len(train_data))
    
    for word in text.split():
        if word in cond_prob:
            log_prob_fresh += np.log(cond_prob[word]['fresh'])
            log_prob_rotten += np.log(cond_prob[word]['rotten'])
    
    if log_prob_fresh > log_prob_rotten:
        return 'fresh'
    else:
        return 'rotten'

correct_count = 0
total_count = len(dev_data)
for i, row in dev_data.iterrows():
    text = row['text']
    sentiment = row['Freshness'].lower() # convert to lower case
    predicted_sentiment = classify(text, cond_prob)
    if sentiment == predicted_sentiment:
        correct_count += 1

accuracy = correct_count / total_count
print('Accuracy:', accuracy)


Accuracy: 0.7942222222222223


In [90]:
def calculate_cond_prob(alpha):
    # calculate the conditional probability of each word given each sentiment
    cond_prob = {}
    for word in prob_word.keys():
        cond_prob[word] = {}
        for sentiment in sent_count.keys():
            num = sent_count[sentiment][word] + alpha
            denom = sent_count[sentiment]['__total__'] + alpha*len(prob_word)
            cond_prob[word][sentiment] = num / denom
    return cond_prob

#smoothing
alpha_list = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    cond_prob = calculate_cond_prob(alpha)
    correct_count = 0
    total_count = len(dev_data)
    for i, row in dev_data.iterrows():
        text = row['text']
        sentiment = row['Freshness']
        predicted_sentiment = classify(text, cond_prob)
        if sentiment == predicted_sentiment:
            correct_count += 1

    accuracy = correct_count / total_count
    print('Accuracy (alpha={}): {:.3f}'.format(alpha, accuracy))

Accuracy (alpha=0.0001): 0.793
Accuracy (alpha=0.001): 0.793
Accuracy (alpha=0.01): 0.794
Accuracy (alpha=0.1): 0.794
Accuracy (alpha=1): 0.794
Accuracy (alpha=10): 0.789
Accuracy (alpha=100): 0.766


In [93]:
def top_words(sentiment, num_words=10):
    words = []
    for word in cond_prob.keys():
        if cond_prob[word]['rotten'] > cond_prob[word][sentiment]:
            words.append((word, cond_prob[word][sentiment]))
        elif cond_prob[word]['fresh'] > cond_prob[word][sentiment]:
            words.append((word, cond_prob[word][sentiment]))
    words.sort(key=lambda x: -x[1])
    return words[:num_words]

top_fresh_words = top_words('fresh')
top_rotten_words = top_words('rotten')

print('Top 10 words that predict "fresh":')
for word, prob in top_fresh_words:
    print('{} ({:.3f})'.format(word, prob))
print('\nTop 10 words that predict "rotten":')
for word, prob in top_rotten_words:
    print('{} ({:.3f})'.format(word, prob))

Top 10 words that predict "fresh":
to (0.018)
that (0.011)
it (0.011)
but (0.008)
for (0.007)
this (0.007)
on (0.004)
movie (0.004)
be (0.004)
not (0.004)

Top 10 words that predict "rotten":
the (0.046)
a (0.031)
of (0.025)
and (0.024)
is (0.016)
in (0.012)
its (0.011)
as (0.008)
with (0.007)
film (0.006)
