In [1]:
import pandas as pd

# 1. Load the data using read_csv function from pandas package 

In [2]:
data = pd.read_csv("Zomato_reviews.csv")

In [3]:
data

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....
...,...,...
27757,4.0,Food quality 4.5/5\r\nHospitality 4/5\r\nManag...
27758,4.0,Taste of the food is good and the ambience as ...
27759,5.0,Pizza is really thin crust and made from fresh...
27760,5.0,"Visited last Saturday with my kids ,\r\nIt was..."


In [4]:
data.shape

(27762, 2)

# 2. Null values in the review text? 

a. Remove the records where the review text is null 


In [5]:
data1 = data[~data.review_text.isnull()].copy()

In [6]:
data1.reset_index(inplace=True, drop=True)

In [7]:
data_list = data1.review_text.values
len(data_list)

27748

In [8]:
data1.shape

(27748, 2)

# 3. Perform cleanup on the data 

a. Normalize the casing

In [9]:
data_lower = [txt.lower() for txt in data_list]
data_lower

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly bett

b. Remove extra line breaks from the text 

In [10]:
data_lower = [" ".join(txt.split()) for txt in data_lower]
data_lower

['their service is worst, pricing in menu is different from bill. they can give you a bill with increased pricing. even for serving water,menu, order you need to call them 3-4 times even on a non busy day.',
 "really appreciate their quality and timing . i have tried the thattil kutti dosa i've been addicted to the dosa really and the chutney... really good and money worth much better than a thattukada must try it",
 'went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better coo

c. Remove stop words 

i. Note: Terms like ‘no’, ‘not’, ‘don’, ‘won’ are important, don’t remove

d. Remove punctuation 


In [11]:
import nltk

In [12]:
from nltk.corpus import stopwords

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
from string import punctuation

In [15]:
print(word_tokenize(data_lower[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [16]:
data_tokens = [word_tokenize(sent) for sent in data_lower]
print(data_tokens[0])

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [17]:
data_nltk = stopwords.words("english")

In [18]:
data_punct = list(punctuation)

In [19]:
print(data_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
data_nltk.remove("no")
data_nltk.remove("not")
data_nltk.remove("don")
data_nltk.remove("won")

In [21]:
data_final = data_nltk + data_punct + ["...", "``","''", "====", "must"]

In [22]:
def del_stop(sent):
    return [term for term in sent if term not in data_final]

In [23]:
del_stop(data_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [24]:
data_clean = [del_stop(sent) for sent in data_tokens]

In [25]:
data_clean = [" ".join(sent) for sent in data_clean]
data_clean[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

# 4. Separation into train and test sets 

a. Use train-test method to divide your data into 2 sets: train and test 
    
b. Use a 70-30 split 

In [26]:
x = data_clean
y = data1.rating

In [27]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.30, random_state=42)

# 5. Use TF-IDF values for the terms as features to get into a vector space model 


a. Import TF-IDF vectorizer from sklearn 

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

b. Instantiate with a maximum of 5000 terms in your vocabulary

In [29]:
vectorizer = TfidfVectorizer(max_features = 5000)

c. Fit and apply on the train set 

In [30]:
xtrain_bow = vectorizer.fit_transform(xtrain)

d. Apply on the test set 

In [31]:
xtest_bow = vectorizer.transform(xtest)

In [32]:
xtrain_bow.shape, xtest_bow.shape

((19423, 5000), (8325, 5000))

# 6. Model building: Random Forest Regressor
    
a. Instantiate RandomForestRegressor from sklearn (set random seed) 

b. Fit on the train data 

c. Make predictions for the train set 

In [33]:
from sklearn.ensemble import RandomForestRegressor

In [34]:
data_rf = RandomForestRegressor(random_state=42)

In [35]:
%%time
data_rf.fit(xtrain_bow, ytrain)

CPU times: total: 8min 13s
Wall time: 8min 15s


RandomForestRegressor(random_state=42)

In [36]:
ytrain_preds = data_rf.predict(xtrain_bow)

# 7. Model evaluation 

a. Report the root mean square error

In [37]:
from sklearn.metrics import mean_squared_error

In [38]:
mean_squared_error(ytrain, ytrain_preds)**0.5

0.2375118642849858

# 8. Hyperparameter tuning 

a. Import GridSearch 

b. Provide the parameter grid to choose: 
    
i. max_features – ‘auto’, ‘sqrt’, ‘log2’ 

ii. max_depth – 10, 15, 20, 25 

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
df_rf = RandomForestRegressor(random_state=42)

In [41]:
param_grid = {
    'max_features': [500, "auto", "sqrt", "log2"],
    'max_depth': [10, 15, 20, 25]
}

# 9. Find the parameters with the best mean square error in cross-validation 

a. Choose the appropriate scoring as the metric for scoring 

b. Choose stratified 5 fold cross-validation scheme

c. Fit on the train set

In [42]:
grid_search = GridSearchCV(estimator = data_rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [43]:
grid_search.fit(xtrain_bow, ytrain)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
                         'max_features': [500, 'auto', 'sqrt', 'log2']},
             scoring='neg_mean_squared_error', verbose=1)

In [44]:
grid_search.cv_results_

{'mean_fit_time': array([ 11.15459623,  91.3312346 ,   1.71292291,   0.77941642,
         23.1795073 , 153.10891075,   3.19543405,   1.24179993,
         34.61757097, 225.60964522,   5.83114238,   1.69084177,
         50.51506863, 232.6390995 ,   7.17068853,   2.09438381]),
 'std_fit_time': array([1.76473488e-01, 8.33019975e-01, 1.34045070e-02, 1.26292281e-02,
        5.89617117e-01, 2.03520979e+00, 2.34160610e-02, 6.55298965e-02,
        5.73035975e-01, 3.19049189e+00, 5.71815255e-01, 1.43318182e-01,
        9.24160475e-01, 1.36316041e+01, 1.26341950e-01, 8.83484026e-02]),
 'mean_score_time': array([0.10531793, 0.1104938 , 0.10026288, 0.0989192 , 0.13397279,
        0.13816075, 0.12679229, 0.12644897, 0.13942623, 0.1423492 ,
        0.16568685, 0.14604101, 0.17226734, 0.11509905, 0.14912829,
        0.14204211]),
 'std_score_time': array([0.00241038, 0.00582454, 0.0039174 , 0.01079986, 0.01329211,
        0.01881378, 0.00778018, 0.01017825, 0.00203464, 0.00625847,
        0.02675152, 

In [45]:
grid_search.cv_results_['params']

[{'max_depth': 10, 'max_features': 500},
 {'max_depth': 10, 'max_features': 'auto'},
 {'max_depth': 10, 'max_features': 'sqrt'},
 {'max_depth': 10, 'max_features': 'log2'},
 {'max_depth': 15, 'max_features': 500},
 {'max_depth': 15, 'max_features': 'auto'},
 {'max_depth': 15, 'max_features': 'sqrt'},
 {'max_depth': 15, 'max_features': 'log2'},
 {'max_depth': 20, 'max_features': 500},
 {'max_depth': 20, 'max_features': 'auto'},
 {'max_depth': 20, 'max_features': 'sqrt'},
 {'max_depth': 20, 'max_features': 'log2'},
 {'max_depth': 25, 'max_features': 500},
 {'max_depth': 25, 'max_features': 'auto'},
 {'max_depth': 25, 'max_features': 'sqrt'},
 {'max_depth': 25, 'max_features': 'log2'}]

In [46]:
best_clif = grid_search.best_estimator_

In [47]:
best_clif

RandomForestRegressor(max_depth=25, max_features=500, random_state=42)

# 11. Predict and evaluate using the best estimator 

a. Use the best estimator from the grid search to make predictions on the test set

b. What is the root mean squared error on the test set? 

In [48]:
ytrain_pred = best_clif.predict(xtrain_bow)

In [49]:
ytest_pred = best_clif.predict(xtest_bow)

In [50]:
mean_squared_error(ytrain, ytrain_pred)**0.5

0.5843599493512933

In [51]:
mean_squared_error(ytest, ytest_pred)**0.5

0.6687389895139219

# 12. Can you identify mismatch cases? 

a. Make a rule based on the predicted value and the actual value that identifies mismatch 
cases (e.g. difference in actual and predicted being more than a cutoff) 

b. How many such cases do you see? 

c. Are all these mismatch cases genuine?

In [52]:
data_res = pd.DataFrame({'review':xtest, 'rating':ytest, 'rating_pred':ytest_pred})

In [53]:
data_res[(data_res.rating - data_res.rating_pred)>=2]

Unnamed: 0,review,rating,rating_pred
7277,life saviours serving excellent food worst tim...,5.0,2.02692
1818,value money ordered second time,5.0,2.865918
4771,not good,5.0,2.004206
16510,may not polished serving packaging etc never b...,5.0,1.787331
14845,oh memories place first drink bangalore almost...,5.0,2.409124
15201,sauce not included,4.0,1.835716
27705,options would improvement long quality not com...,4.0,1.977477
16515,may not polished serving packaging etc never b...,5.0,1.787331
