# Yelp reviews classification with NLP 

## 1: Reading in the Yelp Reviews

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [3]:
yelp = pd.read_csv('yelp.csv')

In [4]:
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [5]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

In [6]:
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars

In [7]:
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
X_train

6841    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
1728    My husband and I absolutely LOVE this restaura...
3853    We went today after lunch. I got my usual of l...
671     Totally dissapointed.  I had purchased a coupo...
4920    Costco Travel - My husband and I recently retu...
                              ...                        
9396    Pros: \n-No breed restrictions on dogs\n-Washe...
2661    Sorry Banana Leaf... I'm usually not picky at ...
9756    Alright this is the deal of deals, 2.75 for st...
554     Hands down a great lil joint! Gotta get the gu...
2575    Absolutely disgusting.  I had enchiladas and a...
Name: text, Length: 3064, dtype: object

## 2: Tokenization

- **What:** Separate text into units such as sentences or words
- **Why:** Gives structure to previously unstructured text
- **Notes:** Relatively easy with English language text, not easy with some languages

In [9]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer()

In [10]:
X_train_dtm = vect.fit_transform(X_train)

In [11]:
X_test_dtm = vect.transform(X_test)

In [12]:
# rows are documents, columns are terms (aka "tokens" or "features")
X_train_dtm.shape

(3064, 16825)

In [13]:
X_test_dtm.shape

(1022, 16825)

In [14]:
# last 50 features
print (vect.get_feature_names()[-100:])

['yorker', 'yorkie', 'you', 'youki', 'young', 'younger', 'youngest', 'your', 'youre', 'yours', 'yourself', 'yourselves', 'youth', 'youthful', 'youtube', 'yow', 'yowza', 'yr', 'yragui', 'yrs', 'yu', 'yuck', 'yucky', 'yuk', 'yukgejang', 'yukon', 'yum', 'yuma', 'yumm', 'yummie', 'yummier', 'yumminess', 'yummm', 'yummmm', 'yummmmmm', 'yummmmmmers', 'yummmmy', 'yummy', 'yumness', 'yung', 'yup', 'yupha', 'yuppies', 'yusefs', 'yuukk', 'yuuuummmmae', 'yuuuuummmmmyyy', 'yuuuuuuum', 'yuyuyummy', 'yuzu', 'yyyyy', 'z11', 'za', 'zabba', 'zach', 'zam', 'zanella', 'zankou', 'zappos', 'zatsiki', 'zen', 'zero', 'zest', 'zexperience', 'zha', 'zhou', 'zia', 'zihuatenejo', 'zilch', 'zin', 'zinburger', 'zinburgergeist', 'zinc', 'zinfandel', 'zing', 'zip', 'zipcar', 'zipper', 'zippers', 'zipps', 'ziti', 'zoe', 'zombi', 'zombies', 'zone', 'zones', 'zoning', 'zoo', 'zoyo', 'zucca', 'zucchini', 'zuchinni', 'zumba', 'zupa', 'zuzu', 'zwiebel', 'zzed', 'éclairs', 'école', 'ém']


In [15]:
X_train

6841    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
1728    My husband and I absolutely LOVE this restaura...
3853    We went today after lunch. I got my usual of l...
671     Totally dissapointed.  I had purchased a coupo...
4920    Costco Travel - My husband and I recently retu...
                              ...                        
9396    Pros: \n-No breed restrictions on dogs\n-Washe...
2661    Sorry Banana Leaf... I'm usually not picky at ...
9756    Alright this is the deal of deals, 2.75 for st...
554     Hands down a great lil joint! Gotta get the gu...
2575    Absolutely disgusting.  I had enchiladas and a...
Name: text, Length: 3064, dtype: object

In [16]:
demo = pd.DataFrame(X_train_dtm.toarray())
demo.columns = vect.get_feature_names()
demo

Unnamed: 0,00,000,00a,00am,00pm,01,02,03,03342,04,...,zucchini,zuchinni,zumba,zupa,zuzu,zwiebel,zzed,éclairs,école,ém
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
demotest = pd.DataFrame(X_test_dtm.toarray())
demotest.columns = vect.get_feature_names()
demotest

Unnamed: 0,00,000,00a,00am,00pm,01,02,03,03342,04,...,zucchini,zuchinni,zumba,zupa,zuzu,zwiebel,zzed,éclairs,école,ém
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- **lowercase:** boolean, True by default
- Convert all characters to lowercase before tokenizing.

In [18]:
# don't convert to lowercase
vect = CountVectorizer(lowercase=True)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(3064, 16825)

- **ngram_range:** tuple (min_n, max_n)
- The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

In [19]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 3))
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(3064, 456398)

In [20]:
# last 50 features
print (vect.get_feature_names()[-50:])

['zucchini carrots and', 'zucchini fries', 'zucchini fries are', 'zucchini pieces', 'zucchini pieces amongst', 'zucchini strips', 'zucchini strips appetizer', 'zucchini veal', 'zucchini veal demi', 'zucchini very', 'zucchini very good', 'zucchini with', 'zucchini with some', 'zuchinni', 'zuchinni again', 'zuchinni again on', 'zuchinni the', 'zuchinni the sampler', 'zumba', 'zumba class', 'zumba class and', 'zumba or', 'zumba or cycling', 'zumba yogalates', 'zumba yogalates boot', 'zupa', 'zupa flavors', 'zupa flavors are', 'zuzu', 'zuzu in', 'zuzu in downtown', 'zuzu is', 'zuzu is at', 'zuzu the', 'zuzu the ultimate', 'zwiebel', 'zwiebel kräuter', 'zwiebel kräuter salat', 'zzed', 'zzed in', 'zzed in my', 'éclairs', 'éclairs napoleons', 'éclairs napoleons and', 'école', 'école lenôtre', 'école lenôtre trained', 'ém', 'ém all', 'ém all they']


**Predicting the star rating:**

In [21]:
# use default options for CountVectorizer
#vect = CountVectorizer()
vect = CountVectorizer(ngram_range=(1,1))
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use Naive Bayes to predict the star rating
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
print("Training Accuracy")
print(nb.score(X_train_dtm,y_train))
print("Testing Accuracy")
print(nb.score(X_test_dtm,y_test))

Training Accuracy
0.972911227154047
Testing Accuracy
0.9187866927592955


In [23]:
#With Pipeline and PowerTransformer
from sklearn.pipeline import Pipeline
pipe = Pipeline((
("cv",CountVectorizer(ngram_range=(1,1))),
("mnb", MultinomialNB()),
))
pipe.fit(X_train,y_train)
print("Training Accuracy")
print(pipe.score(X_train,y_train))
print("Testing Accuracy")
print(pipe.score(X_test,y_test))

Training Accuracy
0.972911227154047
Testing Accuracy
0.9187866927592955


In [119]:
X_test

3922    Looking a cutting edge, wanting the best for e...
8379    Greatness in the form of food, just like the o...
4266    The Flower Studio far exceeded my expectations...
5577        So yummy! Strange combination but great place
537     I've been hearing about these cheesecakes from...
                              ...                        
6846    Honey jalapeño chicken lollipops and sweet pot...
1588                   probably my favorite restaurant :)
8451    A philosophical elder of my profession commonl...
7903    First, I'm sorry this review is lengthy, but i...
8255    You speak Italian to me and provide mouth wate...
Name: text, Length: 1022, dtype: object

In [120]:
nb.predict_proba(X_test_dtm)

array([[6.21708191e-13, 1.00000000e+00],
       [2.40125920e-04, 9.99759874e-01],
       [2.04567380e-10, 1.00000000e+00],
       ...,
       [2.86600345e-13, 1.00000000e+00],
       [1.00000000e+00, 1.05200807e-19],
       [8.46839316e-05, 9.99915316e-01]])

In [121]:
nb.predict(X_test_dtm)

array([5, 5, 5, ..., 5, 1, 5], dtype=int64)

In [122]:
predicted = nb.predict(X_test_dtm)
from sklearn.metrics import confusion_matrix,classification_report,f1_score
print(confusion_matrix(y_test,predicted))
print(classification_report(y_test,predicted))

[[126  58]
 [ 25 813]]
              precision    recall  f1-score   support

           1       0.83      0.68      0.75       184
           5       0.93      0.97      0.95       838

    accuracy                           0.92      1022
   macro avg       0.88      0.83      0.85      1022
weighted avg       0.92      0.92      0.92      1022



In [123]:
print(f1_score(y_test,predicted))

0.7522388059701492


In [124]:
f1_score(y_test,predicted,average=None)

array([0.75223881, 0.95143359])

In [37]:
# calculate null accuracy
y_test_binary = np.where(y_test==5, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

0.8199608610567515

In [38]:
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print("Training Accuracy")
    print(nb.score(X_train_dtm,y_train))
    print("Testing Accuracy")
    print(nb.score(X_test_dtm,y_test))

In [39]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 1))
tokenize_test(vect)

Features:  16825
Training Accuracy
0.972911227154047
Testing Accuracy
0.9187866927592955


## 3: Stopword Removal

- **What:** Remove common words that will likely appear in any text
- **Why:** They don't tell you much about your text

In [37]:
# show vectorizer options
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

- **stop_words:** string {'english'}, list, or None (default)
- If 'english', a built-in stop word list for English is used.
- If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
- If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

In [38]:
vect = CountVectorizer(stop_words='english')

In [39]:
# set of stop words
print(vect.get_stop_words())

frozenset({'formerly', 'done', 'whoever', 'toward', 'more', 'much', 'and', 'nothing', 'yet', 'he', 'on', 'only', 'now', 'beside', 'against', 'has', 'the', 'many', 'found', 'bill', 'ourselves', 'yourself', 'over', 'that', 'is', 'almost', 'whence', 'another', 'eleven', 'alone', 'ltd', 'even', 'thereafter', 'give', 'hereby', 'rather', 'should', 'whenever', 'who', 'our', 'six', 'behind', 'being', 'full', 'must', 'nevertheless', 'still', 'three', 'front', 'below', 'hence', 'been', 'fill', 'any', 'can', 'seeming', 'then', 'etc', 'eight', 'empty', 'except', 'beyond', 'anyone', 'first', 'somehow', 'me', 'whereas', 'noone', 'own', 'two', 'their', 'him', 'do', 'however', 'by', 'sixty', 'few', 'mostly', 'else', 'per', 'whereby', 'whole', 'whatever', 'hasnt', 'whom', 'these', 'those', 'thereby', 'con', 'never', 'top', 'keep', 'yourselves', 'could', 'well', 'same', 'into', 'thru', 'call', 'therein', 'this', 'we', 'my', 'anywhere', 'interest', 'couldnt', 'thereupon', 'put', 'out', 'hereupon', 'for',

In [40]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  16528
Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555


In [41]:
# without stopwords, dtm size
vect = CountVectorizer()
vect.fit_transform(X_train)

<3064x16825 sparse matrix of type '<class 'numpy.int64'>'
	with 237720 stored elements in Compressed Sparse Row format>

In [42]:
# with stopwords, dtm size
vect = CountVectorizer(stop_words='english')
vect.fit_transform(X_train)

<3064x16528 sparse matrix of type '<class 'numpy.int64'>'
	with 143743 stored elements in Compressed Sparse Row format>

In [43]:
my_additional_stop_words = ['abcd']

from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

In [44]:
# with stopwords, dtm size
vect = CountVectorizer(stop_words=stop_words)
vect.fit_transform(X_train)

<3064x16528 sparse matrix of type '<class 'numpy.int64'>'
	with 143743 stored elements in Compressed Sparse Row format>

In [45]:
# remove updated stop words 
vect = CountVectorizer(stop_words=stop_words)
tokenize_test(vect)

Features:  16528
Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555


## 4: Other CountVectorizer Options

- **max_features:** int or None, default=None
- If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

In [46]:
# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=1)
tokenize_test(vect)

Features:  1
Training Accuracy
0.8156005221932114
Testing Accuracy
0.8199608610567515


In [47]:
# all 100 features
print (vect.get_feature_names())

['place']


In [48]:
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 5), max_features=70000)
tokenize_test(vect)

Features:  70000
Training Accuracy
0.9918407310704961
Testing Accuracy
0.9246575342465754


- **min_df:** float in range [0.0, 1.0] or int, default=1
- When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts.

In [49]:
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)

Features:  43957
Training Accuracy
0.9895561357702349
Testing Accuracy
0.9324853228962818


In [50]:
vect = CountVectorizer(ngram_range=(1, 5), min_df=2)
tokenize_test(vect)

Features:  76347
Training Accuracy
0.9924934725848564
Testing Accuracy
0.9246575342465754
