<a href="https://colab.research.google.com/github/Nhung-Nguyen86/DataScience/blob/main/Yelp_Review_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Use Case: Yelp Review Analysis

In [None]:
!pip install textblob

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline



In [None]:
# Load the review data. The path is relative, so yelp.csv must be in the working directory.
yelp = pd.read_csv(r"yelp.csv")
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,1/26/2011,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,7/27/2011,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,6/14/2012,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,5/27/2010,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,1/5/2012,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [None]:
yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  10000 non-null  object
 1   date         10000 non-null  object
 2   review_id    10000 non-null  object
 3   stars        10000 non-null  int64 
 4   text         10000 non-null  object
 5   type         10000 non-null  object
 6   user_id      10000 non-null  object
 7   cool         10000 non-null  int64 
 8   useful       10000 non-null  int64 
 9   funny        10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [None]:
yelp.describe()

Unnamed: 0,stars,cool,useful,funny
count,10000.0,10000.0,10000.0,10000.0
mean,3.7775,0.8768,1.4093,0.7013
std,1.214636,2.067861,2.336647,1.907942
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,1.0,0.0
75%,5.0,1.0,2.0,1.0
max,5.0,77.0,76.0,57.0


In [None]:
# Average vote counts (cool / useful / funny) per star rating.
# numeric_only=True restricts the mean to numeric columns; without it, pandas 2.x raises a
# TypeError when it reaches the string columns (business_id, date, text, ...).
mean_by_group = yelp.groupby('stars').mean(numeric_only=True)
mean_by_group

Unnamed: 0_level_0,cool,useful,funny
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.576769,1.604806,1.056075
2,0.719525,1.563107,0.875944
3,0.788501,1.306639,0.69473
4,0.954623,1.395916,0.670448
5,0.944261,1.38178,0.608631


In [None]:
# Correlate the per-rating averages with each other. Only five rows (one per star rating)
# feed this matrix, so read the coefficients as a rough indication, not a robust estimate.
mean_by_group.corr()

Unnamed: 0,cool,useful,funny
cool,1.0,-0.743329,-0.944939
useful,-0.743329,1.0,0.894506
funny,-0.944939,0.894506,1.0


In [None]:
# Keep only the 5-star and 1-star reviews so the task becomes a two-class problem.
# reset_index(drop=True) is chained rather than applied in place: it returns a fresh frame
# with a 0-based index instead of mutating a filtered slice of `yelp`, which can trigger
# SettingWithCopyWarning.
yelp_best_worst = yelp[(yelp.stars == 5) | (yelp.stars == 1)].reset_index(drop=True)

x = yelp_best_worst.text   # review text (model input)
y = yelp_best_worst.stars  # star rating (label)
print(x.shape)

# Split into training and test sets; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

(4086,)


In [None]:
print (x)

In [None]:
x[0]

### Tokenization

In [None]:
# Use CountVectorizer to create document-term matrices from x_train and x_test.
vect = CountVectorizer()
# Tokenize the documents, count the occurrences of each token, and
# return the counts as a sparse matrix.
x_train_dtm = vect.fit_transform(x_train) # learn the vocabulary and create the document-term matrix
print (x_train_dtm)
# The test set is only transformed: it reuses the vocabulary learned from the training set.
x_test_dtm= vect.transform(x_test)

  (0, 5773)	1
  (0, 10362)	2
  (0, 12465)	1
  (0, 10069)	1
  (0, 10180)	1
  (0, 16612)	2
  (0, 4631)	1
  (0, 9578)	1
  (0, 15093)	1
  (0, 11186)	1
  (0, 136)	1
  (0, 4809)	1
  (0, 15136)	1
  (0, 10413)	2
  (0, 16195)	1
  (0, 15834)	1
  (0, 12514)	2
  (0, 2789)	1
  (0, 14838)	1
  (0, 10286)	2
  (0, 3679)	1
  (0, 15032)	2
  (0, 1018)	1
  (0, 2286)	2
  (0, 1003)	1
  :	:
  (3063, 2312)	1
  (3063, 9318)	1
  (3063, 879)	1
  (3063, 10352)	2
  (3063, 15968)	1
  (3063, 7181)	1
  (3063, 15042)	1
  (3063, 5333)	1
  (3063, 8189)	2
  (3063, 1548)	1
  (3063, 9807)	1
  (3063, 2818)	1
  (3063, 2735)	1
  (3063, 14836)	1
  (3063, 6718)	1
  (3063, 16599)	1
  (3063, 6974)	1
  (3063, 14137)	1
  (3063, 5139)	1
  (3063, 4538)	1
  (3063, 10805)	1
  (3063, 14994)	1
  (3063, 9438)	1
  (3063, 16162)	1
  (3063, 6616)	1


In [None]:
print (x_test)

1607    Looking a cutting edge, wanting the best for e...
3409    Greatness in the form of food, just like the o...
1751    The Flower Studio far exceeded my expectations...
2275        So yummy! Strange combination but great place
230     I've been hearing about these cheesecakes from...
                              ...                        
2793    Honey jalapeño chicken lollipops and sweet pot...
671                    probably my favorite restaurant :)
3441    A philosophical elder of my profession commonl...
3224    First, I'm sorry this review is lengthy, but i...
3362    You speak Italian to me and provide mouth wate...
Name: text, Length: 1022, dtype: object


In [None]:
# Dense view of the training document-term matrix, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
tf = pd.DataFrame(x_train_dtm.toarray(), columns=vect.get_feature_names_out())
tf.head()



Unnamed: 0,00,000,00a,00am,00pm,01,02,03,03342,04,...,zucchini,zuchinni,zumba,zupa,zuzu,zwiebel,zzed,éclairs,école,ém
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
x_train.head()

2790    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
725     My husband and I absolutely LOVE this restaura...
1578    We went today after lunch. I got my usual of l...
282     Totally dissapointed.  I had purchased a coupo...
2024    Costco Travel - My husband and I recently retu...
Name: text, dtype: object

In [None]:
#don't lowercase
vect = CountVectorizer(lowercase=False)
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm.shape

(3064, 20838)

In [None]:
# Include 1-grams and 2-grams (an n-gram is a sequence of n adjacent words
# or letters found in the source text).
vect = CountVectorizer(ngram_range=(1,2))
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm.shape

(3064, 169847)

In [None]:
# Show the last 50 vocabulary entries. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps the plain-list output.
print(vect.get_feature_names_out()[-50:].tolist())

['zone out', 'zone when', 'zones', 'zones dolls', 'zoning', 'zoning issues', 'zoo', 'zoo and', 'zoo is', 'zoo not', 'zoo the', 'zoo ve', 'zoyo', 'zoyo for', 'zucca', 'zucca appetizer', 'zucchini', 'zucchini and', 'zucchini bread', 'zucchini broccoli', 'zucchini carrots', 'zucchini fries', 'zucchini pieces', 'zucchini strips', 'zucchini veal', 'zucchini very', 'zucchini with', 'zuchinni', 'zuchinni again', 'zuchinni the', 'zumba', 'zumba class', 'zumba or', 'zumba yogalates', 'zupa', 'zupa flavors', 'zuzu', 'zuzu in', 'zuzu is', 'zuzu the', 'zwiebel', 'zwiebel kräuter', 'zzed', 'zzed in', 'éclairs', 'éclairs napoleons', 'école', 'école lenôtre', 'ém', 'ém all']




### Predict the star rating

In [None]:
vect = CountVectorizer()

x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

# Question: what is the difference between fit(), transform() and fit_transform()?
# fit()           : learns the model parameters (here, the vocabulary) from the training data.
# transform()     : applies the parameters learned by fit() to produce the
#                   transformed data set.
# fit_transform() : runs fit() and then transform() on the same data set.

# Naive Bayes: train on the training matrix, predict on the test matrix.
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)
y_pred_class = nb.predict(x_test_dtm)

print (metrics.accuracy_score(y_test, y_pred_class))

0.9187866927592955


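### Calculate null accuracy

Null accuracy is the accuracy obtained by always predicting the most frequent class; it is the baseline a useful model has to beat.

```python
y_test_binary = np.where(y_test==5, 1, 0)
max(y_test_binary.mean(), 1-y_test_binary.mean())
```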

In [None]:
# Helper: evaluate a vectorizer by the test accuracy of a Naive Bayes model built on it.
def tokenize_test(vect):
    """Fit ``vect`` on the training reviews and report Naive Bayes test accuracy.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train`` and ``y_test``.
    Prints the number of features the vectorizer produced and the accuracy of a
    MultinomialNB classifier on the test split. Returns nothing.
    """
    train_matrix = vect.fit_transform(x_train)
    print ('Features: ', train_matrix.shape[1])
    classifier = MultinomialNB().fit(train_matrix, y_train)
    test_matrix = vect.transform(x_test)
    predictions = classifier.predict(test_matrix)
    print ('Accuracy: ', metrics.accuracy_score(y_test, predictions))

In [None]:
vect = CountVectorizer()
tokenize_test(vect)

Features:  16825
Accuracy:  0.9187866927592955


### Stopword removal
- Stopwords are very common words (such as "the", "is" and "and") that appear throughout a language and carry little meaning on their own.
- To find the list of stopwords in a given language, one can easily type
    - import nltk
    - from nltk.corpus import stopwords
    - set(stopwords.words('english'))  # pass another language name, e.g. 'french', for its list

In [None]:
#remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

In [None]:
# set of stop words
print (vect.get_stop_words())

In [None]:
#max_features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

In [None]:
# List the features kept by the current vectorizer. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() returns an ndarray, so .tolist() keeps list output.
print(vect.get_feature_names_out().tolist())

In [None]:
# Use 1-grams and 2-grams, with the vocabulary capped at 100,000 features.
vect = CountVectorizer(ngram_range=(1,2), max_features=100000)
tokenize_test(vect)

In [None]:
#min_df sets the minimum document frequency allowed when creating vocab
vect = CountVectorizer(ngram_range=(1,2), min_df=2)
tokenize_test(vect)

### TextBlob

In [None]:
print (yelp_best_worst.text[0])

In [None]:
review = TextBlob(yelp_best_worst.text[0])

In [None]:
review.words

In [None]:
review.sentences

In [None]:
review.lower()

### Stemming and lemmatization

In [None]:
stemmer = SnowballStemmer('english')
print ([stemmer.stem(word) for word in review.words])

In [None]:
print ([word.lemmatize() for word in review.words])

In [None]:
#assume every word is a verb
print ([word.lemmatize(pos='v') for word in review.words])

In [None]:
def split_into_lemmas(text):
    """Lower-case ``text``, split it into words with TextBlob, and stem each word.

    Despite the name, the returned tokens are stems produced by the notebook-level
    ``stemmer``, not lemmas. To use lemmas instead, call ``word.lemmatize()`` on
    each word.
    """
    tokens = TextBlob(text.lower()).words
    return list(map(stemmer.stem, tokens))

In [None]:
#split review text into lemmas rather than into words (default)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)

In [None]:
# Show the last 50 vocabulary entries. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps the plain-list output.
print(vect.get_feature_names_out()[-50:].tolist())

### Sentiment Analysis

- Aims to sense people's mood based on the text they write.
- Can be done when the text is quantifiable.
- Sentiment can be positive or negative.

In [None]:
print (review)

In [None]:
# Score every review once, then pick out the two extremes. idxmax()/idxmin() return the
# first occurrence on ties, which matches a top-to-bottom scan using strict comparisons.
polarities = yelp_best_worst.text.apply(
    lambda text: TextBlob(str(text)).sentiment.polarity
)

# Index label and score of the most positive review.
max_i = int(polarities.idxmax())
max_polarity = float(polarities[max_i])

# Index label and score of the most negative review.
min_i = int(polarities.idxmin())
min_polarity = float(polarities[min_i])

print (TextBlob(yelp_best_worst.text[max_i]))
print (TextBlob(yelp_best_worst.text[min_i]))

In [None]:
#polarity ranges from -1 (most negative) to 1 (most positive)
print(review.sentiment.polarity)
print(max_polarity)
print(min_polarity)

In [None]:
#understanding the apply method
yelp['length'] = yelp.text.apply(len)

In [None]:
yelp.head(10)

In [None]:
# Helper used with Series.apply to score each review.
def detect_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [None]:
#create a new DataFrame column for sentiment
yelp['sentiment'] = yelp.text.apply(detect_sentiment)

In [None]:
yelp.boxplot(column='sentiment', by='stars')