In [2]:
import pandas as pd
import numpy as np

# We are using the BernoulliNB version of Naive Bayes, which assumes predictors are binary encoded.
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Location of the Rotten Tomatoes critics CSV. Kept in one named constant so the
# notebook can be pointed at a different copy of the file without touching the
# loading code.
RT_CRITICS_CSV = 'd:/data/rt_critics.csv'

# Raw critic reviews; later cells read the `fresh`, `quote` and `title` columns.
rotten = pd.read_csv(RT_CRITICS_CSV)

The column `fresh` contains three classes, namely "fresh", "rotten" and "none". Rows in the third class need to be removed, which can be done using the pandas method `isin()`: called on a column, it returns a boolean Series showing whether each element is contained in the given values. That Series is then used as a mask to keep only the matching rows.

In [4]:
# Keep only reviews with a definite verdict; rows labelled "none" are dropped.
# .copy() makes the result an independent DataFrame rather than a slice of the
# original, so assigning to its columns afterwards does not raise
# SettingWithCopyWarning.
rotten = rotten.loc[rotten['fresh'].isin(['fresh', 'rotten'])].copy()

We now turn the fresh column into 0s and 1s using .map( ).

In [5]:
# Encode the target: 1 for "fresh", 0 for anything else.
# .assign() returns a new DataFrame instead of writing into `rotten` in place;
# `rotten` is the result of a boolean-mask selection, and assigning a column
# directly on it would raise SettingWithCopyWarning.
rotten = rotten.assign(
    fresh=rotten['fresh'].map(lambda label: 1 if label == 'fresh' else 0)
)

We have to choose a value for the `ngram_range` parameter of `CountVectorizer`. The scikit-learn documentation describes it as:

The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

In [6]:
# Vectorizer settings (CountVectorizer is imported in the first cell).
ngram_range = (1,2)      # extract single words (n=1) and two-word sequences (n=2)
max_features = 2000      # cap on the size of the vocabulary

# binary=True records presence/absence of a term instead of its count, which is
# the encoding BernoulliNB expects; stop_words='english' uses scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(
    ngram_range=ngram_range,
    max_features=max_features,
    binary=True,
    stop_words='english',
)

The next step is to "learn the vocabulary dictionary and return term-document matrix" using cv.fit_transform

In [7]:
# Learn the vocabulary from the review quotes and encode every quote against it.
# The result is a term-document matrix with one row per quote.
words = cv.fit_transform(rotten['quote'])

The dataframe corresponding to this term-document matrix will be called `df_words`. This is our predictor matrix.

The method todense() returns a dense matrix representation of the matrix words.

In [8]:
# Column labels for the term-document matrix. Newer scikit-learn releases expose
# the vocabulary through get_feature_names_out(); older ones only provide
# get_feature_names(). Use whichever this installation has.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

df_words = pd.DataFrame(words.todense(), columns=feature_names)

# In this dataframe:

# Rows are reviews (one row per quote passed to cv.fit_transform).
# Columns are features (the terms kept in the vectorizer's vocabulary).



In [9]:
# Hold out 25% of the reviews for evaluation.
# random_state pins the shuffle so the split, and every number derived from it
# (fitted probabilities, test accuracy), is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df_words.values,
    rotten.fresh.values,
    test_size=0.25,
    random_state=42,
)

We will now use BernoulliNB() on the training data to build a model to predict if the class is "fresh" or "rotten" based on the word appearances:

In [10]:
# Bernoulli Naive Bayes with default settings, trained on the training split only.
nb = BernoulliNB()
nb.fit(X=X_train, y=y_train)

BernoulliNB()

We will now obtain the probability of each word given the "fresh" and the "rotten" classification. The log probability of each feature given a class is available as `nb.feature_log_prob_` (one row per class). We exponentiate it to get the actual probabilities. To organize our results we build a DataFrame which includes a new column showing the difference between the two probabilities:

In [11]:
# feature_log_prob_ holds log P(feature | class), one row per class.
# The target was encoded as 0 = rotten, 1 = fresh, so row 0 is taken as "rotten"
# and row 1 as "fresh" (rows follow the order of nb.classes_).
feat_lp = nb.feature_log_prob_
rotten_p, fresh_p = np.exp(feat_lp[0]), np.exp(feat_lp[1])

# One row per feature: both class-conditional probabilities and their difference.
# A positive difference means the term is more probable in "fresh" reviews.
df_new = pd.DataFrame({
    'fresh_probability': fresh_p,
    'rotten_probability': rotten_p,
    'feature': df_words.columns.values,
}).assign(
    probability_diff=lambda probs: probs['fresh_probability'] - probs['rotten_probability']
)

In [12]:
# Mean accuracy of the fitted classifier on the held-out split.
nb.score(X=X_test, y=y_test)

0.7295758610873897

We now investigate which words are more likely to be found in "fresh" and "rotten" reviews

In [16]:
# Rank the features by probability_diff in both directions:
# largest first for "fresh"-leaning terms, smallest first for "rotten"-leaning terms.
df_fresh = df_new.sort_values(by='probability_diff', ascending=False)
df_rotten = df_new.sort_values(by='probability_diff')
print('Words are more likely to be found in "fresh"')
df_fresh['feature'].head(5).tolist()

Words are more likely to be found in "fresh"


['film', 'best', 'entertaining', 'great', 'performance']

In [18]:
# Five terms at the top of the ascending probability_diff ranking.
print('Words are more likely to be found in "rotten"')
df_rotten['feature'].iloc[:5].tolist()

Words are more likely to be found in "rotten"


['like', 'bad', 'really', 'little', 'isn']

We conclude by finding which movies have the highest probability of being "fresh" or "rotten".

We need to use the other columns of the original table for that. Defining the target and predictors, fitting the model to all data:

In [19]:
# Predictors and target for the whole data set (no train/test split here).
X = df_words.values
y = rotten['fresh']

# Refit on every review, then score those same reviews.
model = BernoulliNB()
model.fit(X, y)

# Column 1 of predict_proba is the probability of class 1 ("fresh").
# The probabilities are a plain array and are placed positionally; `movie` and
# `quote` are Series from `rotten`, so the resulting frame carries rotten's index.
df_full = pd.DataFrame({
    'probability_fresh': model.predict_proba(X)[:, 1],
    'movie': rotten['title'],
    'quote': rotten['quote'],
})

# Two orderings of the same table: most likely fresh first, most likely rotten first.
df_fresh = df_full.sort_values(by='probability_fresh', ascending=False)
df_rotten = df_full.sort_values(by='probability_fresh')

In [21]:
# First five rows of the descending probability_fresh ranking.
print('5 Movies most likely to be fresh:')
df_fresh.head(n=5)

5 Movies most likely to be fresh:


Unnamed: 0,probability_fresh,movie,quote
7549,0.99999,Kundun,"Stunning, odd, glorious, calm and sensationall..."
7352,0.999989,Witness,"Powerful, assured, full of beautiful imagery a..."
7188,0.999986,Mrs Brown,Centering on a lesser-known chapter in the rei...
5610,0.999978,Diva,"The most exciting debut in years, it is unifie..."
4735,0.999977,Sophie's Choice,"Though it's far from a flawless movie, Sophie'..."


In [22]:
# First five rows of the ascending probability_fresh ranking.
print('5 Movies most likely to be rotten:')
df_rotten.head(n=5)

5 Movies most likely to be rotten:


Unnamed: 0,probability_fresh,movie,quote
12567,1.2e-05,Pokémon: The First Movie,"With intentionally stilted animation, uninspir..."
3546,1.3e-05,Joe's Apartment,There's not enough story here for something ha...
2112,6.2e-05,The Beverly Hillbillies,Imagine the dumbest half-hour sitcom you've ev...
3521,9.7e-05,Kazaam,"As fairy tale, buddy comedy, family drama, thr..."
6837,0.000138,Batman & Robin,"Pointless, plodding plotting; asinine action; ..."
