In [148]:
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Read data into DataFrame

In [33]:
# Read in data from csv.
# The location can be overridden with the YELP_CSV_PATH environment variable so the
# notebook can run on other machines; when the variable is unset, the original
# author's local path is used unchanged.
yelp_csv_path = os.environ.get(
    'YELP_CSV_PATH',
    '/Users/swagbird/Desktop/Data Science/Learning/Tutorials/Machine Learning in Python with scikit-learn/pycon-2016-tutorial-master/data/yelp.csv',
)
data = pd.read_csv(yelp_csv_path)

In [34]:
# Examine first 5 rows to check the loaded columns and a few sample values
data.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [35]:
# Examine shape of the dataset as a (rows, columns) tuple
data.shape

(10000, 10)

In [36]:
# Check for empty cells: info() lists the non-null count and dtype of every column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB


# Create DataFrame with only 5-star and 1-star reviews

Everything looks good to go: no column has missing values. Let's create a new DataFrame that contains only the 5-star and 1-star reviews:

In [37]:
# Keep only the rows whose star rating is either 1 or 5
filtered_data = data[data.stars.isin([1, 5])]

In [38]:
# Check shape: how many reviews remain after keeping only the 1-star and 5-star ratings
filtered_data.shape

(4086, 10)

In [None]:
# Check value counts for 5-star and 1-star reviews to see how balanced the two classes are
# NOTE(review): this cell has no execution count or saved output in this copy of the
# notebook -- re-run it so the class balance is recorded with the other results.
filtered_data.stars.value_counts()

# Define X and y and split into training/testing sets

In [40]:
# The review text is the feature (X); the star rating is the response variable (y)
X, y = filtered_data.text, filtered_data.stars
print(X.shape, y.shape, sep='\n')

(4086,)
(4086,)


In [129]:
# Hold out part of the data for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(3064,)
(1022,)
(3064,)
(1022,)


# Create document-term matrices

In [130]:
# Instantiate the CountVectorizer with its default settings; it is fitted in a later cell
vect = CountVectorizer()

In [131]:
# Learn the vocabulary from the training text only and, in the same call, encode the
# training reviews as a document-term matrix (one row per review, one column per token)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<3064x16800 sparse matrix of type '<class 'numpy.int64'>'
	with 239821 stored elements in Compressed Sparse Row format>

In [132]:
# Transform testing data into a document-term matrix using the already-fitted vectorizer
# (transform only, no re-fitting), so it has the same columns as the training matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1022x16800 sparse matrix of type '<class 'numpy.int64'>'
	with 74872 stored elements in Compressed Sparse Row format>

# Use Naive Bayes to predict star rating

Now that we have our training and testing document-term matrices, we can use the multinomial Naive Bayes model to predict the star rating for the reviews in the testing set:

In [133]:
# Instantiate the Multinomial Naive Bayes model with default hyperparameters
nb = MultinomialNB()

In [134]:
# Train the model on the training document-term matrix and the matching star ratings
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [135]:
# Make class predictions (one star rating per review) for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [136]:
# Share of test reviews whose predicted class matches the true rating
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9090019569471625

In [137]:
# Confusion matrix of true ratings versus predicted ratings on the test set
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[119,  76],
       [ 17, 810]])

# Calculate null accuracy

In [138]:
# Null accuracy: the accuracy obtained by always predicting the most frequent class.
# Show the raw class counts first, then each class's share of the test set.
test_class_counts = y_test.value_counts()
print(test_class_counts)
test_class_counts.head() / len(y_test)

5    827
1    195
Name: stars, dtype: int64


5    0.809198
1    0.190802
Name: stars, dtype: float64

# Evaluate incorrect classifications

So our model improved on the null accuracy by about 10 percentage points, from 80.9% to 90.9%. This is pretty good, but can we do better? Let's take a look at the false positives and false negatives to try to find out why the model is incorrectly classifying these reviews.

In [139]:
# False positives: reviews whose predicted rating is higher than the true rating
is_false_positive = y_pred_class > y_test
X_test[is_false_positive].head(10)

5237    My love affair with Fate ended when Johnny lef...
6229    Forget the yogurt and the berry berry bad serv...
5854    Not recommended if you're not white. For me, i...
6921    I never knew green chilies were more valuable ...
9640    Went here one night with a group of friends. T...
3469    "The office space, layout, presentation gets 5...
136     Absolutely horrendous.  This post office will ...
6656    I took my kitty to the Beat the Heat low cost ...
4630    I used to always go here for tires until my me...
8943    Went to Sol for restaurant week.  Was less tha...
Name: text, dtype: object

In [141]:
# False negatives: reviews whose predicted rating is lower than the true rating
is_false_negative = y_pred_class < y_test
X_test[is_false_negative].head(10)

7903    First, I'm sorry this review is lengthy, but i...
3075                       Unfortunately Out of Business.
5223    Brought my car here b/c of the reviews I read ...
3149    I was told to see Greg after a local shop diag...
5332    I had a great experience.  Nice people.   My m...
8434    a weekend of debauchery was spent here when i ...
5736    Thank goodness for Sue at Mill Avenue Travel. ...
1477    I' m psycho for this place.  The sell cupcakes...
241     I was sad to come back to lai lai's and they n...
750     This store has the most pleasant employees of ...
Name: text, dtype: object

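Here the 5-star rating is treated as the "positive class", so a false positive is a 1-star review predicted as 5-star, and a false negative is a 5-star review predicted as 1-star. There are positive words in negative reviews and negative words in positive reviews that the model is reacting to.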

In [145]:
# Calculate predicted probabilities for X_test_dtm, keeping only the second column
# presumably column 1 is the 5-star class (labels ordered [1, 5]) -- verify with nb.classes_
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.99999998, 1.        , 1.        , ..., 0.99863113, 1.        ,
       0.99998711])

In [155]:
# Area under the ROC curve for the Naive Bayes predicted probabilities
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.9696152295910457

# Compare model with Logistic Regression

In [159]:
# Fit a default logistic regression on the same training matrix, then predict the test set
logreg = LogisticRegression().fit(X_train_dtm, y_train)
y_lr_pred_class = logreg.predict(X_test_dtm)



In [160]:
# Share of test reviews that logistic regression classifies correctly
metrics.accuracy_score(y_true=y_test, y_pred=y_lr_pred_class)

0.9354207436399217

In [165]:
# Calculate predicted probabilities from the logistic regression, keeping only the second column
# presumably column 1 is the 5-star class (labels ordered [1, 5]) -- verify with logreg.classes_
y_lr_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_lr_pred_prob

array([0.99603375, 0.9982664 , 0.99126462, ..., 0.98996551, 1.        ,
       0.9972302 ])

In [166]:
# Calculate AUC for the logistic regression probabilities
# NOTE(review): the value saved below this cell is digit-for-digit the same as the Naive
# Bayes AUC recorded earlier, although the two probability arrays differ -- the saved
# output may be stale; re-run this cell to confirm.
metrics.roc_auc_score(y_test, y_lr_pred_prob)

0.9696152295910457

# Calculate most predictive tokens

In [171]:
# Collect the vocabulary learned by the fitted vectorizer as a plain list of strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so prefer the new method and fall back on older releases.
# .tolist() turns the returned array back into a list of str, matching the old return type.
if hasattr(vect, 'get_feature_names_out'):
    X_train_tokens = vect.get_feature_names_out().tolist()
else:
    X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

16800

In [174]:
# Peek at the first 50 and the last 50 tokens of the vocabulary
print(X_train_tokens[:50])
print(X_train_tokens[-50:])

['00', '000', '00am', '00pm', '01', '02', '03', '03342', '05', '06', '07', '09', '0l', '10', '100', '1000', '1000x', '1001', '100th', '101', '102', '1030', '105', '1070', '108', '10am', '10min', '10mins', '10minutes', '10pm', '10th', '10x', '10yo', '11', '110', '1100', '111', '111th', '112', '115', '115th', '116', '118', '11a', '11am', '11pm', '12', '120', '128i', '129']
['yuzu', 'yyyeeaahhhh', 'z11', 'za', 'zach', 'zam', 'zanella', 'zankou', 'zappos', 'zatsiki', 'zen', 'zero', 'zest', 'zesty', 'zexperience', 'zha', 'zhou', 'zia', 'zichini', 'zihuatenejo', 'zilch', 'zillion', 'zin', 'zinburger', 'zinburgergeist', 'zinc', 'zinfandel', 'zing', 'zip', 'zipcar', 'ziploc', 'zipper', 'zipps', 'zoe', 'zombies', 'zone', 'zones', 'zoo', 'zucca', 'zucchini', 'zuccini', 'zuchinni', 'zumba', 'zupa', 'zupas', 'zuzu', 'zuzus', 'zwiebel', 'zzed', 'ém']


In [180]:
# First row of feature_count_: per-token occurrence totals across the 1-star training reviews
one_token_count = nb.feature_count_[0]
one_token_count

array([36.,  4.,  3., ...,  0.,  0.,  0.])

In [181]:
# Second row of feature_count_: per-token occurrence totals across the 5-star training reviews
five_token_count = nb.feature_count_[1]
five_token_count

array([39., 10.,  0., ...,  1.,  1.,  1.])

In [184]:
# Assemble the per-class counts into one table with the token itself as the row index
tokens = pd.DataFrame(
    {'1-star': one_token_count, '5-star': five_token_count},
    index=pd.Index(X_train_tokens, name='token'),
)
tokens.head()

Unnamed: 0_level_0,1-star,5-star
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,36.0,39.0
000,4.0,10.0
00am,3.0,0.0
00pm,1.0,6.0
01,0.0,1.0


In [189]:
# Examine 5 random DataFrame rows; the fixed random_state keeps the sample repeatable
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,1-star,5-star
token,Unnamed: 1_level_1,Unnamed: 2_level_1
trigger,0.0,1.0
tweeter,1.0,0.0
sophie,0.0,1.0
average,14.0,27.0
inspirational,0.0,1.0


In [190]:
# Count number of training observations (reviews) the model saw in each class
nb.class_count_

array([ 554., 2510.])

In [191]:
# Add 1 to both counts to avoid dividing by 0.
# The columns are rebuilt from the raw count arrays rather than incremented in place,
# so re-running this cell always yields count + 1 instead of adding another 1 each time.
tokens['1-star'] = one_token_count + 1
tokens['5-star'] = five_token_count + 1
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,1-star,5-star
token,Unnamed: 1_level_1,Unnamed: 2_level_1
trigger,1.0,2.0
tweeter,2.0,1.0
sophie,1.0,2.0
average,15.0,28.0
inspirational,1.0,2.0


In [192]:
# Convert the 1-star and 5-star counts into frequencies: (count + 1) divided by the
# number of training reviews in that class.
# The values are recomputed from the raw count arrays rather than dividing the columns
# in place, so re-running this cell does not divide by the class size a second time.
tokens['1-star'] = (one_token_count + 1) / nb.class_count_[0]
tokens['5-star'] = (five_token_count + 1) / nb.class_count_[1]
tokens.sample(5, random_state=3)

Unnamed: 0_level_0,1-star,5-star
token,Unnamed: 1_level_1,Unnamed: 2_level_1
trigger,0.001805,0.000797
tweeter,0.00361,0.000398
sophie,0.001805,0.000797
average,0.027076,0.011155
inspirational,0.001805,0.000797


In [203]:
# Calculate the ratio of 5-star frequency to 1-star frequency for each token
# (values above 1 mean the token is relatively more common in 5-star reviews)
tokens['5-to-1_ratio'] = tokens['5-star'].div(tokens['1-star'])
tokens.sample(n=5, random_state=3)

Unnamed: 0_level_0,1-star,5-star,5_to_1_ratio,5-to-1_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
trigger,0.001805,0.000797,0.441434,0.441434
tweeter,0.00361,0.000398,0.110359,0.110359
sophie,0.001805,0.000797,0.441434,0.441434
average,0.027076,0.011155,0.412005,0.412005
inspirational,0.001805,0.000797,0.441434,0.441434


In [210]:
# Find top 10 5-star tokens: sort by the 5-star/1-star ratio, largest first, and keep 10 rows
tokens.sort_values('5-to-1_ratio', ascending = False).head(10)

Unnamed: 0_level_0,1-star,5-star,5-to-1_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fantastic,0.00361,0.082072,22.733865
perfect,0.00722,0.093625,12.967131
flavors,0.00361,0.042231,11.698008
outstanding,0.001805,0.021116,11.698008
yum,0.001805,0.020717,11.477291
favorite,0.012635,0.128287,10.152988
ribs,0.001805,0.017928,9.932271
gluten,0.001805,0.01753,9.711554
mozzarella,0.001805,0.016733,9.27012
bianco,0.001805,0.016733,9.27012


In [212]:
# Find top 10 1-star tokens: sort by the 5-star/1-star ratio, smallest first, and keep 10 rows
tokens.sort_values('5-to-1_ratio', ascending = True).head(10)

Unnamed: 0_level_0,1-star,5-star,5-to-1_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
disgusting,0.054152,0.000398,0.007357
remove,0.019856,0.000398,0.020065
unprofessional,0.018051,0.000398,0.022072
rude,0.104693,0.00239,0.022833
inedible,0.016245,0.000398,0.024524
flag,0.016245,0.000398,0.024524
pointing,0.016245,0.000398,0.024524
horrible,0.128159,0.003187,0.02487
refused,0.030686,0.000797,0.025967
fuse,0.01444,0.000398,0.02759


# Repeat as a 5-class classification problem

In [217]:
# Check how the full dataset is distributed across the star ratings
data.stars.value_counts()

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [220]:
# Rebind X and y to the full dataset: review text as the feature, star rating as the response
X, y = data.text, data.stars
print(X.shape, y.shape, sep='\n')

(10000,)
(10000,)


In [221]:
# Hold out part of the full dataset for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(7500,)
(2500,)
(7500,)
(2500,)


In [222]:
# Instantiate CountVectorizer again; this rebinds `vect`, replacing the vectorizer that was
# fitted for the 1-star/5-star problem
vect = CountVectorizer()

In [224]:
# Create document-term matrices: learn the vocabulary from the new training text and encode
# it in one call (rebinds X_train_dtm from the earlier two-class section)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<7500x25731 sparse matrix of type '<class 'numpy.int64'>'
	with 621641 stored elements in Compressed Sparse Row format>

In [225]:
# Transform testing data into a document-term matrix using the already-fitted vectorizer
# (transform only, no re-fitting), so it has the same columns as the training matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<2500x25731 sparse matrix of type '<class 'numpy.int64'>'
	with 201708 stored elements in Compressed Sparse Row format>

In [227]:
# Instantiate Multinomial Naive Bayes model with default hyperparameters; this rebinds `nb`,
# so the two-class model fitted earlier is no longer reachable under that name
nb = MultinomialNB()

In [228]:
# Train model using X_train_dtm and the matching star ratings from the full dataset
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [231]:
# Make class predictions (one star rating per review) for X_test_dtm and display them
y_pred_class = nb.predict(X_test_dtm)
y_pred_class

array([4, 5, 4, ..., 5, 4, 4])

In [232]:
# Share of test reviews whose predicted star rating matches the true rating
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.4828

In [244]:
# Null accuracy: share of the test set that belongs to the single most frequent class.
# Show the raw class counts first, then the top class's share.
test_class_counts = y_test.value_counts()
print(test_class_counts)
test_class_counts.head(1) / len(y_test)

4    890
5    820
3    379
2    220
1    191
Name: stars, dtype: int64


4    0.356
Name: stars, dtype: float64

Looks like our model improved on the null accuracy score by approximately 13 percentage points (from 35.6% to 48.3%). This may seem pretty good, but the model still predicts the correct review score less than half the time.

In [233]:
# Confusion matrix of true star ratings versus predicted star ratings on the test set
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[ 60,  27,  16,  67,  21],
       [ 18,  19,  42, 126,  15],
       [  3,  10,  37, 297,  32],
       [  4,   3,  21, 668, 194],
       [  4,   1,   7, 385, 423]])

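None of the five classes is predicted anywhere close to 100% correctly. The model leans heavily towards 4 stars: in every row except the 4-star row itself, the largest off-diagonal count is in the 4-star column.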

In [251]:
# Per-class precision, recall and F1-score for the five-class predictions
report = metrics.classification_report(y_true=y_test, y_pred=y_pred_class)
print(report)

              precision    recall  f1-score   support

           1       0.67      0.31      0.43       191
           2       0.32      0.09      0.14       220
           3       0.30      0.10      0.15       379
           4       0.43      0.75      0.55       890
           5       0.62      0.52      0.56       820

   micro avg       0.48      0.48      0.48      2500
   macro avg       0.47      0.35      0.36      2500
weighted avg       0.48      0.48      0.45      2500

