## Yelp star rating prediction:

In [86]:
# importing necessary libraries...
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# for splitting the data...
from sklearn.model_selection import train_test_split 

# for modelling...
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# for evaluate the model...
from sklearn.metrics import accuracy_score

# for ignoring warnings...
import warnings 
warnings.filterwarnings('ignore')

In [14]:
# load the data yelp reviews...
yelp = pd.read_csv('yelp.csv')
yelp.head(2)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0


In [6]:
# extracting the best review and worst review...
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

In [7]:
# defining the x and y
x = yelp_best_worst.text
y = yelp_best_worst.stars

In [17]:
# spliting the data for modelling purpose...
trainx , testx , trainy , testy = train_test_split(x,y,random_state=10)

In [18]:
# checking shape of the splitted data...
trainx.shape, testx.shape , trainy.shape , testy.shape

((3064,), (1022,), (3064,), (1022,))

In [22]:
# use count vectoriser technique to create the dtm for train and test data...
vect = CountVectorizer()
trainx_dtm = vect.fit_transform(trainx)
testx_dtm = vect.transform(testx)

In [24]:
# checking the last 50 features names...
vect.get_feature_names_out()[-50:]

array(['yung', 'yup', 'yur', 'yuri', 'yusefs', 'yuukk', 'yuuuummmmae',
       'yuuuuuuum', 'yyyyy', 'z11', 'za', 'zabba', 'zach', 'zam',
       'zatsiki', 'zen', 'zero', 'zesty', 'zha', 'zia', 'zichini',
       'zihuatenejo', 'zilch', 'zin', 'zinburger', 'zinburgergeist',
       'zinc', 'zinfandel', 'zing', 'zip', 'zipcar', 'zippers', 'zipps',
       'zoe', 'zombies', 'zone', 'zones', 'zoning', 'zoo', 'zoom',
       'zucca', 'zucchini', 'zuccini', 'zuchinni', 'zumba', 'zupa',
       'zupas', 'zuzu', 'zuzus', 'zzed'], dtype=object)

In [25]:
trainx_dtm.shape

(3064, 16757)

In [26]:
# use the same technique but lets take one hyperparameter lowercase = false to not to convert to lowercase...
vect = CountVectorizer(lowercase=False)
trainx_dtm_NL = vect.fit_transform(trainx)
testx_dtm_NL = vect.transform(testx)

In [27]:
trainx_dtm_NL.shape

(3064, 20717)

In [30]:
# lets include 1-ngrams and 2 ngrams...
vect = CountVectorizer(ngram_range=(1,2))
trainx_dtm_ngm = vect.fit_transform(trainx)
testx_dtm_ngm = vect.transform(testx)

In [31]:
trainx_dtm_ngm.shape

(3064, 169121)

In [32]:
vect.get_feature_names_out()[-50:]

array(['zone of', 'zone out', 'zones', 'zones dolls', 'zoning',
       'zoning issues', 'zoo', 'zoo and', 'zoo if', 'zoo is', 'zoo the',
       'zoo tour', 'zoo ve', 'zoom', 'zoom in', 'zucca',
       'zucca appetizer', 'zucchini', 'zucchini and', 'zucchini bread',
       'zucchini broccoli', 'zucchini carrots', 'zucchini fires',
       'zucchini fries', 'zucchini pieces', 'zucchini strips',
       'zucchini veal', 'zucchini very', 'zucchini we', 'zucchini with',
       'zuccini', 'zuccini italian', 'zuchinni', 'zuchinni the', 'zumba',
       'zumba or', 'zumba yogalates', 'zupa', 'zupa flavors', 'zupas',
       'zupas cater', 'zuzu', 'zuzu in', 'zuzu is', 'zuzu the',
       'zuzu was', 'zuzus', 'zuzus room', 'zzed', 'zzed in'], dtype=object)

In [35]:
# use naive bayes to predict the star rating and evaluate the models accuracy...
model = MultinomialNB()
model.fit(trainx_dtm_ngm,trainy)
pred_class = model.predict(testx_dtm_ngm)
accuracy_score(testy,pred_class)

0.8434442270058709

In [36]:
# predicted stars...
pred_class

array([5, 5, 5, ..., 5, 5, 5], dtype=int64)

In [38]:
# calculate null accuracy...
y_test_binary = np.where(testy==5, 1, 0)
max(y_test_binary.mean(), 1 - y_test_binary.mean())

0.8140900195694716

In [39]:
# lets create a function which transform the data into dtm and do modelling and give the accuracy...
def tokenise_test(vect):
    trainx_dtm = vect.fit_transform(trainx)
    print('Features:',trainx_dtm.shape[1])
    testx_dtm = vect.transform(testx)
    nb = MultinomialNB()
    nb.fit(trainx_dtm,trainy)
    pred_class = nb.predict(testx_dtm)
    print('Accuracy:',accuracy_score(testy,pred_class))

In [40]:
# include 1-grams and 1-grams
tokenise_test(CountVectorizer(ngram_range=(1,1)))

Features: 16757
Accuracy: 0.9324853228962818


In [41]:
# include 1-grams and 2-grams
tokenise_test(CountVectorizer(ngram_range=(1,2)))

Features: 169121
Accuracy: 0.8434442270058709


In [42]:
# include 1-grams and 1-grams and lower = false...
tokenise_test(CountVectorizer(ngram_range=(1,1),lowercase=False))

Features: 20717
Accuracy: 0.9197651663405088


In [43]:
# remove stopwords...
vect = CountVectorizer(stop_words='english')

In [47]:
# set of stop words...
print(vect.get_stop_words())

frozenset({'none', 'them', 'ever', 'twelve', 'very', 'someone', 'become', 'almost', 'without', 'am', 'any', 'take', 'same', 'de', 'must', 'keep', 'amount', 'beforehand', 'against', 'thence', 'former', 'no', 'everyone', 'own', 'done', 'may', 'already', 'ours', 'in', 'down', 'there', 'thick', 'sixty', 'whither', 'many', 'nothing', 'seem', 'those', 'whereby', 'because', 'ten', 'whereas', 'since', 'were', 'enough', 'has', 'hereby', 'towards', 'me', 'with', 'who', 'formerly', 'beside', 'is', 'go', 'only', 'much', 'front', 'his', 'do', 'noone', 'anyone', 'whether', 'anyway', 'through', 'yourself', 'if', 'thereupon', 'yet', 'nine', 'i', 'hasnt', 'either', 'whence', 'across', 'hers', 'indeed', 'hundred', 'over', 'about', 'was', 'had', 'thereafter', 'too', 'elsewhere', 'hereafter', 'give', 'nevertheless', 'a', 'whole', 'even', 'but', 'fill', 'yours', 'anything', 'its', 'onto', 'five', 'forty', 'everything', 'upon', 'besides', 'interest', 'wherein', 'get', 'herein', 'seeming', 'on', 'here', 'min

In [48]:
# use hyperparameter stop_words='english'
tokenise_test(CountVectorizer(stop_words='english'))

Features: 16460
Accuracy: 0.9217221135029354


In [50]:
# without hyperparameter stop_words...
tokenise_test(CountVectorizer())

Features: 16757
Accuracy: 0.9324853228962818


In [51]:
# use all three hyperparameter which we did earlier...
tokenise_test(CountVectorizer(stop_words='english',ngram_range=(1,2),lowercase=False))

Features: 165232
Accuracy: 0.8405088062622309


In [52]:
# lets increase the set of stop words...
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS.union(['abcd'])

In [53]:
# check the model at updated stopwords...
tokenise_test(CountVectorizer(stop_words=list(stop_words)))

Features: 16460
Accuracy: 0.9217221135029354


In [54]:
# remove English stop words and only keep 100 features(max_features=100)
tokenise_test(CountVectorizer(stop_words=list(stop_words),max_features=100))

Features: 100
Accuracy: 0.87279843444227


In [57]:
tokenise_test(CountVectorizer(stop_words=list(stop_words),max_features=1000))

Features: 1000
Accuracy: 0.9119373776908023


In [59]:
tokenise_test(CountVectorizer(stop_words='english',max_features=500))

Features: 500
Accuracy: 0.8913894324853229


In [60]:
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
tokenise_test(CountVectorizer(stop_words=list(stop_words),max_df=2))

Features: 10394
Accuracy: 0.8003913894324853


In [61]:
tokenise_test(CountVectorizer(stop_words=list(stop_words),min_df=4))

Features: 4896
Accuracy: 0.9266144814090019


In [67]:
tokenise_test(CountVectorizer(stop_words=list(stop_words),min_df=6))

Features: 3554
Accuracy: 0.9217221135029354


In [68]:
### Let's use tf-idf vectoriser...

In [69]:
# use tf-idf with stop words...
tokenise_test(TfidfVectorizer(stop_words='english'))

Features: 16460
Accuracy: 0.8140900195694716


In [70]:
# lets use the updated stopwords which we created earlier...
tokenise_test(TfidfVectorizer(stop_words=list(stop_words)))

Features: 16460
Accuracy: 0.8140900195694716


### Adding Features to a Document-Term Matrix

In [74]:
# defining x and y and split the data into train and test...
x = yelp_best_worst[['text','cool','useful','funny']]
y = yelp_best_worst['stars']

xtrain , xtest , ytrain , ytest = train_test_split(x,y,random_state=1)

In [75]:
# check the shape of the train and test data....
xtrain.shape,xtest.shape,ytrain.shape,ytest.shape

((3064, 4), (1022, 4), (3064,), (1022,))

In [76]:
import scipy as sp

In [80]:
# use countvectoriser for text column only...
vect = CountVectorizer()
xtrain_dtm = vect.fit_transform(xtrain.text)
xtest_dtm = vect.transform(xtest.text)
print(xtrain_dtm.shape)
print(xtest_dtm.shape)

(3064, 16825)
(1022, 16825)


In [79]:
# cast other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(xtrain.drop('text',axis=1)).astype(float)
extra.shape

(3064, 3)

In [82]:
# combine sparse matrices
xtrain_dtm_extra = sp.sparse.hstack((xtrain_dtm,extra))
xtrain_dtm_extra.shape

(3064, 16828)

In [85]:
# repeat for testing set
extra_test = sp.sparse.csr_matrix(xtest.drop('text',axis=1)).astype(float)
xtest_dtm_extra_test = sp.sparse.hstack((xtest_dtm,extra_test))
xtest_dtm_extra_test.shape

(1022, 16828)

In [87]:
# use logistic regression with text column only

logreg = LogisticRegression()
logreg.fit(xtrain_dtm,ytrain)
pred_class_lg = logreg.predict(xtest_dtm)
accuracy_score(ytest,pred_class_lg)

0.9256360078277887

In [92]:
# use logistic regression with all features

logreg = LogisticRegression()
logreg.fit(xtrain_dtm_extra, ytrain)
pred_class_all_lg = logreg.predict(xtest_dtm_extra_test)
print(accuracy_score(ytest, pred_class_all_lg))

0.9305283757338552
