# Steps
1. Text Preprocessing
2. Feature Engineering-BOW & TFIDF
3. Model Building
4. Evaluatation of model

In [1]:
# import the library
import pandas as pd
import numpy as np
import re

In [2]:
# load the dataset
reviews=pd.read_csv('Zomato_reviews.csv',encoding='latin1')
print('Data loaded')

Data loaded


In [3]:
reviews.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [4]:
reviews.shape

(27762, 2)

In [5]:
reviews.describe(include='all')

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


In [6]:
reviews.isnull().sum()

rating          0
review_text    14
dtype: int64

In [7]:
reviews.dropna(inplace=True)

In [8]:
reviews.isnull().sum()

rating         0
review_text    0
dtype: int64

In [9]:
reviews.reset_index(inplace=True,drop=True)

In [10]:
reviews

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....
...,...,...
27743,4.0,Food quality 4.5/5\r\nHospitality 4/5\r\nManag...
27744,4.0,Taste of the food is good and the ambience as ...
27745,5.0,Pizza is really thin crust and made from fresh...
27746,5.0,"Visited last Saturday with my kids ,\r\nIt was..."


In [11]:
reviews.shape

(27748, 2)

In [12]:
# Convert to list for easy manipulation
reviews_list=reviews.review_text.values

In [13]:
len(reviews_list)

27748

In [14]:
type(reviews_list)

numpy.ndarray

In [15]:
reviews_list[:5]

array(['Their service is worst, pricing in menu is different from bill. They can give you a bill with increased pricing. Even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
       "really appreciate their quality and timing . I have tried the thattil kutti dosa I've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
       'Went there on a Friday night, the place was surprisingly empty. Interesting menu which is almost fully made of dosas. I had bullseye dosa and cheese masala dosa. The bullseye Dosa was really good, with the egg perfectly cooked to a half boiled state. The masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. The chutney was good, the sambar was average. The dishes are reasonably priced.',
       'A very decent place serving good food.\r\nOrdered Chilli fish, Chicken & Pork sizzler.\r\nEverything tasted good but Pork could

# 1.Text Clean up
1. Normalize the case
2. Remove the stop words
     1. remove the words from stop words list- not, no
3. Remove punctuation


In [16]:
reviews_lower=[txt.lower() for txt in reviews_list]

In [17]:
reviews_lower[:3]

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.']

In [18]:
reviews_lower[:500]

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly bett

In [19]:
# Remove extra line breaks
reviews_lower=[" ".join(txt.split()) for txt in reviews_lower]

In [20]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

In [21]:
txt="hello we are \n \n learning NLP \n"
txt.split()

['hello', 'we', 'are', 'learning', 'NLP']

In [22]:
" ".join(txt.split())

'hello we are learning NLP'

Tokenize 


In [23]:
from nltk.tokenize import word_tokenize


In [24]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/labsuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
reviews_tokens=[word_tokenize(txt) for txt in reviews_lower]

In [26]:
print(reviews_tokens[0])

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [27]:
print(reviews_tokens[2])

['went', 'there', 'on', 'a', 'friday', 'night', ',', 'the', 'place', 'was', 'surprisingly', 'empty', '.', 'interesting', 'menu', 'which', 'is', 'almost', 'fully', 'made', 'of', 'dosas', '.', 'i', 'had', 'bullseye', 'dosa', 'and', 'cheese', 'masala', 'dosa', '.', 'the', 'bullseye', 'dosa', 'was', 'really', 'good', ',', 'with', 'the', 'egg', 'perfectly', 'cooked', 'to', 'a', 'half', 'boiled', 'state', '.', 'the', 'masala', 'in', 'the', 'cheese', 'masala', 'was', 'good', ',', 'but', 'the', 'cheese', 'was', 'a', 'bit', 'too', 'chewy', 'for', 'my', 'liking', '.', 'the', 'chutney', 'was', 'good', ',', 'the', 'sambar', 'was', 'average', '.', 'the', 'dishes', 'are', 'reasonably', 'priced', '.']


In [30]:
print(reviews_tokens[:3])

[['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.'], ['really', 'appreciate', 'their', 'quality', 'and', 'timing', '.', 'i', 'have', 'tried', 'the', 'thattil', 'kutti', 'dosa', 'i', "'ve", 'been', 'addicted', 'to', 'the', 'dosa', 'really', 'and', 'the', 'chutney', '...', 'really', 'good', 'and', 'money', 'worth', 'much', 'better', 'than', 'a', 'thattukada', 'must', 'try', 'it'], ['went', 'there', 'on', 'a', 'friday', 'night', ',', 'the', 'place', 'was', 'surprisingly', 'empty', '.', 'interesting', 'menu', 'which', 'is', 'almost', 'fully', 'made', 'of', 'dosas', '.', 'i', 'had', 'bullseye', 'dosa', 'and', 'cheese', 'masala', 'dosa', '.', 'the', 'bullseye', 'dosa', 'was', 'really', 'good', ',', 'w

In [31]:
# Remove stopwords
# punctuations
from nltk.corpus import stopwords
from string import punctuation

In [32]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/labsuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [34]:
stop_nltk=stopwords.words('english')
stop_punct=list(punctuation)

In [35]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [36]:
print(stop_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [37]:
len(stop_punct)

32

In [38]:
stop_nltk.remove('no')
stop_nltk.remove('not')
stop_nltk.remove('don')
stop_nltk.remove('won')

In [39]:
"no" in stop_nltk

False

In [40]:
# final list for removal
stop_final=stop_nltk+stop_punct+['...',"''","====",'must']

In [41]:
# remove the stop final
def del_stop(txt):
    return [term for term in txt if term not in stop_final]

In [42]:
del_stop(reviews_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [43]:
reviews_clean=[del_stop(txt) for txt in reviews_tokens]

In [44]:
print(reviews_clean[:3])

[['service', 'worst', 'pricing', 'menu', 'different', 'bill', 'give', 'bill', 'increased', 'pricing', 'even', 'serving', 'water', 'menu', 'order', 'need', 'call', '3-4', 'times', 'even', 'non', 'busy', 'day'], ['really', 'appreciate', 'quality', 'timing', 'tried', 'thattil', 'kutti', 'dosa', "'ve", 'addicted', 'dosa', 'really', 'chutney', 'really', 'good', 'money', 'worth', 'much', 'better', 'thattukada', 'try'], ['went', 'friday', 'night', 'place', 'surprisingly', 'empty', 'interesting', 'menu', 'almost', 'fully', 'made', 'dosas', 'bullseye', 'dosa', 'cheese', 'masala', 'dosa', 'bullseye', 'dosa', 'really', 'good', 'egg', 'perfectly', 'cooked', 'half', 'boiled', 'state', 'masala', 'cheese', 'masala', 'good', 'cheese', 'bit', 'chewy', 'liking', 'chutney', 'good', 'sambar', 'average', 'dishes', 'reasonably', 'priced']]


In [45]:
print(reviews_clean[:10])

[['service', 'worst', 'pricing', 'menu', 'different', 'bill', 'give', 'bill', 'increased', 'pricing', 'even', 'serving', 'water', 'menu', 'order', 'need', 'call', '3-4', 'times', 'even', 'non', 'busy', 'day'], ['really', 'appreciate', 'quality', 'timing', 'tried', 'thattil', 'kutti', 'dosa', "'ve", 'addicted', 'dosa', 'really', 'chutney', 'really', 'good', 'money', 'worth', 'much', 'better', 'thattukada', 'try'], ['went', 'friday', 'night', 'place', 'surprisingly', 'empty', 'interesting', 'menu', 'almost', 'fully', 'made', 'dosas', 'bullseye', 'dosa', 'cheese', 'masala', 'dosa', 'bullseye', 'dosa', 'really', 'good', 'egg', 'perfectly', 'cooked', 'half', 'boiled', 'state', 'masala', 'cheese', 'masala', 'good', 'cheese', 'bit', 'chewy', 'liking', 'chutney', 'good', 'sambar', 'average', 'dishes', 'reasonably', 'priced'], ['decent', 'place', 'serving', 'good', 'food', 'ordered', 'chilli', 'fish', 'chicken', 'pork', 'sizzler', 'everything', 'tasted', 'good', 'pork', 'could', 'slightly', 'be

In [46]:
reviews_clean=[" ".join(txt) for txt in reviews_clean]
reviews_clean[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

In [48]:
len(reviews_clean)

27748

In [49]:
reviews_clean1=reviews_clean[:5000]

In [50]:
len(reviews_clean1)

5000

In [54]:
# create Indep & dep set
X=reviews_clean1
y=reviews.rating[:5000]

In [55]:
len(X)

5000

In [56]:
len(y)

5000

In [57]:
# split the data into trainset & test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [58]:
X_train

['quantity less',
 'lassi weeks back within hours strong case throat infection would avoid possible may one case cant sure given area',
 'taste good quantity mutton less',
 'dint like food taste neither kebab not pizza biryani better food-1/5 ambiance-4/5 cost-2/5 food taste pathetic service-0/5 person serving got angry asked food not good',
 'came across restaurant wandering across street food joints pubs place definitely draws attention food joint quite well lit beautiful chinese lamps show glass facade quite comfy place sit regular meal friends ordered lohan veg dim sums tasted fine schezwan noodles red chilly ã\x83â\x83â\x83ã\x83â\x82â\x83ã\x83â\x83â\x82ã\x83â\x82â\x82ã\x83â\x83â\x83ã\x83â\x82â\x82ã\x83â\x83â\x82ã\x83â\x82â\x97basil noodles tasted pretty much expected better place staff courteous took care order well ambience 4/5 staff 5/5 food 3/5 variety 4/5',
 "awesome food epic burgers especially loved service owner takes every attention detail every customer comes 'll give pro

In [59]:
# apply tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(max_features=5000)

In [60]:
X_train_features=vectorizer.fit_transform(X_train)

In [61]:
X_test_features=vectorizer.transform(X_test)

In [65]:
len(X_train)

3500

In [64]:
len(X_test)

1500

In [66]:
X_train_features.shape

(3500, 5000)

In [67]:
X_test_features.shape

(1500, 5000)

In [68]:
# Model Building
from sklearn.ensemble import RandomForestRegressor
reg=RandomForestRegressor()

In [69]:
%%time
reg.fit(X_train_features,y_train)

CPU times: user 27.1 s, sys: 12 ms, total: 27.1 s
Wall time: 27.1 s


RandomForestRegressor()

In [70]:
y_train_pred=reg.predict(X_train_features)

In [72]:
y_test_pred=reg.predict(X_test_features)

In [74]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_train_pred,y_train)
mse

0.07766932477706141

In [75]:
mse1=mean_squared_error(y_test_pred,y_test)
mse1

0.30244040150563767

In [76]:
# identify performance
res_df=pd.DataFrame({'review':X_test,'rating':y_test,'rating_pred':y_test_pred})

In [77]:
res_df.tail()

Unnamed: 0,review,rating,rating_pred
3563,good food service reasonable rate.manily 's ty...,4.0,4.34
1538,ordered dhoosa puri chai mid night mind blowin...,5.0,4.855
1837,extremely happy order food also good tasty,5.0,5.0
2380,ambience decent service little slow asked wate...,3.0,3.0
1912,best place quick bites pure south indian affor...,5.0,4.965


In [78]:
res_df.head()

Unnamed: 0,review,rating,rating_pred
1501,100 ft boutique restaurant one favourite place...,5.0,4.725
2586,went place today lunch located strategically p...,3.5,3.505
2653,sad people sad people ãââãââãââãââ...,2.0,3.44
1055,super testy food awesome experience fast suppl...,4.5,4.5
705,place great food music ambience fine brewery w...,5.0,4.7


In [79]:
# pickle
# joblib
import joblib
joblib.dump(reg,'Rating_pred.pkl')
print('model saved')

model saved


In [80]:
# load the model 
model=joblib.load('Rating_pred.pkl')
print('model loaded')

model loaded


In [81]:
# Check model on real time data
review='This is one of the best place to vist, very good food & overall service'

In [82]:
vector_review=vectorizer.transform([review])

In [83]:
vector_review

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [84]:
rating_review=model.predict(vector_review)

In [85]:
print('Rating predicted for Given review is--',rating_review)

Rating predicted for Given review is-- [4.445]
