In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import linear_kernel
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import re
%matplotlib inline

In [2]:
df=pd.read_csv('zomato.csv',  encoding = 'ISO-8859-1',  low_memory = False, nrows = 10000)
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          10000 non-null  object
 1   address                      10000 non-null  object
 2   name                         10000 non-null  object
 3   online_order                 10000 non-null  object
 4   book_table                   10000 non-null  object
 5   rate                         8628 non-null   object
 6   votes                        10000 non-null  int64 
 7   phone                        9820 non-null   object
 8   location                     9998 non-null   object
 9   rest_type                    9949 non-null   object
 10  dish_liked                   4450 non-null   object
 11  cuisines                     9990 non-null   object
 12  approx_cost(for two people)  9980 non-null   object
 13  reviews_list                 100

In [4]:
#Removing url and phone column and also the column of dish_liked as we can't afford these many values gone
df.drop(columns=['url','phone','dish_liked'],axis=1,inplace=True)

In [5]:
#Now removing any of the null if possible any
df.dropna(axis=0,inplace=True)

In [6]:
#It's a resturant case so we might wanna start removing any duplicates if possible
df.duplicated().sum()

10

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df['approx_cost(for two people)'] = pd.to_numeric(df['approx_cost(for two people)'],errors = 'coerce')

In [9]:
#Changing the names to ease
df=df.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [10]:
df.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [11]:
#Removing 5 from rate
df['rate']=df['rate'].apply(lambda x:x.replace('/5',''))

In [12]:
df['rate'] = pd.to_numeric(df['rate'],errors = 'coerce')

In [13]:
restaurants = list(df['name'].unique())
df['Mean Rating'] = 0

for i in range(len(restaurants)):
    df['Mean Rating'][df['name'] == restaurants[i]] = df['rate'][df['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
df[['Mean Rating']] = scaler.fit_transform(df[['Mean Rating']]).round(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Mean Rating'][df['name'] == restaurants[i]] = df['rate'][df['name'] == restaurants[i]].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Mean Rating'][df['name'] == restaurants[i]] = df['rate'][df['name'] == restaurants[i]].mean()


In [14]:
df['menu_item'].unique()

array(['[]',
       "['Chocolate Fantasy (Pack Of 5)', 'Pan Cake (Pack Of 6)', 'Gulab Jamun (Pack Of 10)', 'Gulkand Shot (Pack Of 5)', 'Chocolate Decadence (Pack of 2)', 'CheeseCake (Pack Of 2)', 'Red Velvet Slice Cake (Pack of 2)', 'Red Velvet Slice Cake & Cheese Cake (Pack of 2)', 'Red Velvet Slice Cake & Chocolate Decadence Cake (Pack of 2)', 'Hazelnut Brownie (Pack of 2)', 'Moments', 'Red Velvet Cake With Butter Cream Frosting (750 Gm)', 'Red Velvet Slice Cake (Pack of 2)', 'Red Velvet Slice Cake & Cheese Cake (Pack of 2)', 'Red Velvet Slice Cake & Chocolate Decadence Cake (Pack of 2)', 'Red Velvet Slice Cake (Pack of 1)', 'Valentine Red Velvet Jar', 'Valentine Chocolate Jar', 'Valentines Jar Combo', 'Pink Guava 500 ML', 'Oreo Vanilla 500 ML', 'Cookie Crumble 500 ML', 'Chocolate Fantasy', 'Gulkand-E-Bahar', 'Pan Cake', 'Hazelnut Brownie (Pack Of 1)', 'Gulab Jamun (Pack Of 2)', 'Plum Cake', 'Red Velvet Cake With Butter Cream Frosting (750 Gm)', 'Chocolate Mud Cake (700 Gms)', 'Chees

In [15]:
df['type'].unique()

array(['Buffet', 'Cafes', 'Delivery', 'Desserts', 'Dine-out',
       'Drinks & nightlife', 'Pubs and bars'], dtype=object)

In [16]:
df['city'].unique()

array(['Banashankari', 'Bannerghatta Road', 'Basavanagudi', 'Bellandur',
       'Brigade Road', 'Brookefield', 'BTM'], dtype=object)

In [17]:
df['location'].unique()

array(['Banashankari', 'Basavanagudi', 'Mysore Road', 'Jayanagar',
       'Kumaraswamy Layout', 'Rajarajeshwari Nagar', 'Vijay Nagar',
       'Uttarahalli', 'JP Nagar', 'South Bangalore', 'City Market',
       'Bannerghatta Road', 'BTM', 'Kanakapura Road', 'Bommanahalli',
       'CV Raman Nagar', 'Electronic City', 'Wilson Garden',
       'Shanti Nagar', 'Koramangala 5th Block', 'Richmond Road', 'HSR',
       'Marathahalli', 'Koramangala 7th Block', 'Bellandur',
       'Sarjapur Road', 'Whitefield', 'East Bangalore',
       'Old Airport Road', 'Indiranagar', 'Koramangala 1st Block',
       'Frazer Town', 'MG Road', 'Brigade Road', 'Lavelle Road',
       'Church Street', 'Ulsoor', 'Residency Road', 'Shivajinagar',
       'Infantry Road', 'St. Marks Road', 'Cunningham Road',
       'Race Course Road', 'Commercial Street', 'Vasanth Nagar', 'Domlur',
       'Koramangala 8th Block', 'Ejipura', 'Jeevan Bhima Nagar',
       'Old Madras Road', 'Seshadripuram', 'Kammanahalli',
       'Koramanga

In [18]:
df['online_order'].unique()

array(['Yes', 'No'], dtype=object)

In [19]:
df['online_order']=df['online_order'].replace(('Yes','No'),(1,0))
df['book_table']=df['book_table'].replace(('Yes','No'),(1,0))

In [20]:
df.drop('address',axis=1,inplace=True)

In [21]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,Jalsa,1,1,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,3.99
1,Spice Elephant,1,0,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,3.97
2,San Churro Cafe,1,0,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,3.58
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,3.45
4,Grand Village,0,0,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,3.58


In [22]:
df['reviews_list'][0]

'[(\'Rated 4.0\', \'RATED\\n  A beautiful place to dine in.The interiors take you back to the Mughal era. The lightings are just perfect.We went there on the occasion of Christmas and so they had only limited items available. But the taste and service was not compromised at all.The only complaint is that the breads could have been better.Would surely like to come here again.\'), (\'Rated 4.0\', \'RATED\\n  I was here for dinner with my family on a weekday. The restaurant was completely empty. Ambience is good with some good old hindi music. Seating arrangement are good too. We ordered masala papad, panner and baby corn starters, lemon and corrionder soup, butter roti, olive and chilli paratha. Food was fresh and good, service is good too. Good for family hangout.\\nCheers\'), (\'Rated 2.0\', \'RATED\\n  Its a restaurant near to Banashankari BDA. Me along with few of my office friends visited to have buffet but unfortunately they only provide veg buffet. On inquiring they said this plac

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8568 entries, 0 to 9997
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          8568 non-null   object 
 1   online_order  8568 non-null   int64  
 2   book_table    8568 non-null   int64  
 3   rate          8015 non-null   float64
 4   votes         8568 non-null   int64  
 5   location      8568 non-null   object 
 6   rest_type     8568 non-null   object 
 7   cuisines      8568 non-null   object 
 8   cost          7477 non-null   float64
 9   reviews_list  8568 non-null   object 
 10  menu_item     8568 non-null   object 
 11  type          8568 non-null   object 
 12  city          8568 non-null   object 
 13  Mean Rating   8137 non-null   float64
dtypes: float64(3), int64(3), object(8)
memory usage: 992.3+ KB


In [24]:
df.dropna(axis=0,inplace=True)

In [25]:
df['reviews_list']=df['reviews_list'].apply(lambda x:x.replace('\\n',''))
df['menu_item']=df['menu_item'].apply(lambda x:x.replace('\\n',''))

In [26]:
df['reviews_list']=df['reviews_list'].str.lower().str.replace('[^\w\s]','')
df['cuisines']=df['cuisines'].str.lower().str.replace('[^\w\s]','')
df['menu_item']=df['menu_item'].str.lower().str.replace('[^\w\s]','')

In [27]:
#Removing stopwords
from nltk.corpus import stopwords

In [28]:
stop=set(stopwords.words('english'))
def remove_stop(text):
    return " ".join([i for i in str(text).split() if i not in stop])

In [29]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
df['reviews_list']=df['reviews_list'].apply(lambda text:remove_stop(text))
df['cuisines']=df['cuisines'].apply(lambda text:remove_stop(text))
df['menu_item']=df['menu_item'].apply(lambda text:remove_stop(text))

In [31]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,Jalsa,1,1,4.1,775,Banashankari,Casual Dining,north indian mughlai chinese,800.0,rated 40 rated beautiful place dine inthe inte...,,Buffet,Banashankari,3.99
1,Spice Elephant,1,0,4.1,787,Banashankari,Casual Dining,chinese north indian thai,800.0,rated 40 rated dinner family turned good choos...,,Buffet,Banashankari,3.97
2,San Churro Cafe,1,0,3.8,918,Banashankari,"Cafe, Casual Dining",cafe mexican italian,800.0,rated 30 rated ambience good enough pocket fri...,,Buffet,Banashankari,3.58
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,Quick Bites,south indian north indian,300.0,rated 40 rated great food proper karnataka sty...,,Buffet,Banashankari,3.45
4,Grand Village,0,0,3.8,166,Basavanagudi,Casual Dining,north indian rajasthani,600.0,rated 40 rated good restaurant neighbourhood b...,,Buffet,Banashankari,3.58


In [32]:
df['reviews_list'][1]

'rated 40 rated dinner family turned good choose suitable ages people try place liked starters service good prices affordable recommend restaurant early dinner place little noisy rated 30 rated ambience really nice staff courteous price pretty high quantity overall experience fine quality food nice nothing extraordinary also buffetonly veg rated 30 rated felt good little expensive quantity serve terms taste decent nothing much talk ambience regular casual dining restaurant take family dinner lunch improve quantity may reduce price bit may improve presentation food might manage get repeat customers rated 40 rated looking quite place spend time family well wanted try new place since banashankari thought trying place place good rating part zomato gold decided try place delite see friendly staff food ordered tasty wellfood 45ambience 35friendly staff 45pocket friendly 45will definitely visit rated 40 rated nice place dine good ambiance food good serving time also goodneat restrooms well ar

In [33]:
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [34]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,Jalsa,1,1,4.1,775,Banashankari,Casual Dining,north indian mughlai chinese,800.0,rated 40 rated beautiful place dine inthe inte...,,Buffet,Banashankari,3.99
1,Spice Elephant,1,0,4.1,787,Banashankari,Casual Dining,chinese north indian thai,800.0,rated 40 rated dinner family turned good choos...,,Buffet,Banashankari,3.97
2,San Churro Cafe,1,0,3.8,918,Banashankari,"Cafe, Casual Dining",cafe mexican italian,800.0,rated 30 rated ambience good enough pocket fri...,,Buffet,Banashankari,3.58
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,Quick Bites,south indian north indian,300.0,rated 40 rated great food proper karnataka sty...,,Buffet,Banashankari,3.45
4,Grand Village,0,0,3.8,166,Basavanagudi,Casual Dining,north indian rajasthani,600.0,rated 40 rated good restaurant neighbourhood b...,,Buffet,Banashankari,3.58


In [35]:
df=df.drop(['menu_item', 'votes'],axis=1)

# Randomly sample 60% of your dataframe
df_percent = df.sample(frac=0.5)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [37]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    df_new.index = df_new.index.str.lower()
    return df_new
recommend('The Chariot')

TOP 10 RESTAURANTS LIKE The Chariot WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
shree thali,north indian,3.84,150.0
sree krishna kafe,south indian,3.74,200.0
prasiddhi food corner,fast food north indian south indian,3.45,200.0
swad restaurant,chinese north indian,3.45,550.0
mayura sagar,chinese north indian south indian,3.32,250.0
shanthi sagar,south indian north indian chinese juices,3.32,250.0
vinaya cafe,south indian street food juices,3.19,200.0
funjabi,north indian,3.19,600.0
wazir's,north indian chinese,2.94,500.0
new shanthi sagar,south indian north indian chinese street food,2.87,300.0


In [38]:
df.tail()

Unnamed: 0,name,online_order,book_table,rate,location,rest_type,cuisines,cost,reviews_list,type,city,Mean Rating
9989,Ambur Biryani,0,0,3.5,Koramangala 3rd Block,Quick Bites,biryani chinese,250.0,rated 35 rated hifirst thank served prepared f...,Delivery,BTM,3.19
9990,Panjabi Dhaba,0,0,3.2,Bommanahalli,Quick Bites,north indian chinese,300.0,rated 40 rated food good resonable cost ordere...,Delivery,BTM,2.81
9993,Star Kitchen,0,0,3.3,Bommanahalli,Quick Bites,chinese north indian biryani,150.0,,Delivery,BTM,2.94
9994,Flavorsome Bakes,0,0,4.0,Bannerghatta Road,"Takeaway, Delivery",bakery desserts,800.0,rated 50 rated ordered chocolate ganache drip ...,Delivery,BTM,3.84
9997,Cafe Arabica,0,0,3.8,Bannerghatta Road,"Cafe, Bakery",cafe bakery,700.0,rated 30 rated went dessertits amazing pocket ...,Delivery,BTM,3.58


In [39]:
recommend('hyderabad cuisine multicuisine restaurant')

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
recommend('paakam')

In [None]:
recommend('fast and fresh')