In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the data set
data_df = pd.read_csv("data/cleaned_data.csv")

In [3]:
data_df.shape

(9373, 15)

In [4]:
data_df.head(5)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_reviewed,cuisines,cost_for_two,reviews_list,menu_item,listed_as,listed_in_city
0,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,lunch curry momos biryani nirvana buffet thai ...,chinese north indian thai,800.0,dinner family turned good choose suitable ages...,gobi stir lahori chilli firdausi mushroom bro...,delivery buffet,banashankari
1,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,dosa masala,north indian south indian,300.0,great proper karnataka style full meals twice ...,masala dosa,dine-out buffet,banashankari
2,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,gol gappe panipuri,north indian rajasthani,600.0,good restaurant neighbourhood buffet system pr...,gol gappe panipuri,buffet,banashankari
3,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,,north indian chinese andhra south indian,800.0,awesome great servicefriendly staffsgood quali...,,buffet,banashankari
4,"12,29 Near PES University Back Gate, D'Souza N...",Caf Down The Alley,Yes,No,4.1,402,Banashankari,Cafe,chilli honey pasta crispy sandwich chicken cre...,cafe,500.0,ended saturday afternoon hectic day good ambie...,chilli mushroom brownie fries pakoda fresh pe...,cafes dine-out,banashankari


In [5]:
data_df.isna().sum()

address              0
name                 0
online_order         0
book_table           0
rate                 0
votes                0
location             0
rest_type            0
dish_reviewed     4761
cuisines             4
cost_for_two         0
reviews_list       294
menu_item         3021
listed_as            0
listed_in_city       0
dtype: int64

In [6]:
data_df['dish_reviewed'] = data_df['dish_reviewed'].fillna('')
data_df['cuisines'] = data_df['cuisines'].fillna('')
data_df['reviews_list'] = data_df['reviews_list'].fillna('')
data_df['menu_item'] = data_df['menu_item'].fillna('')

Creating new columns for online_order and book_table to make more sense while calculating similarity.

In [7]:
import numpy as np
data_df['online'] = np.where(data_df['online_order'] == 'Yes', 'Online', '')
data_df['table_booking'] = np.where(data_df['book_table'] == 'Yes', 'TableBooking', '')

We are calculating wighted rating, to balance the difference in number of votes with respect to rating.

\begin{equation} Weighted Rating =(\frac{v}{v+m} * {R}) + (\frac{m}{v+m} * {C}) \end{equation}

v --> the number of votes 
m --> the minimum votes required to be listed in the chart 
R --> the average rating of the restaurent 
C --> the mean vote across the whole

In [8]:
def weighted_rating(data, m, c):
    v = data['votes']
    R = data['rate']
    wr = (v/(v+m) * R) + (m/(m+v) * C)
    return round(wr,1)

In [9]:
# this is V
vote_counts = data_df[data_df['votes'].notnull()]['votes'].astype('int')

# this is R
vote_averages = data_df[data_df['rate'].notnull()]['rate'].astype('int')

# this is C
C = vote_averages.mean()
m = vote_counts.quantile(0.50)
print(C)
print(m)

3.163341512856076
53.0


In [10]:
data_df['weighted_rating'] = data_df.apply(lambda x: weighted_rating(x, m, C), axis =1)

In [11]:
cols = ['name', 'location', 'rest_type', 'rest_type', 'rest_type', 'cuisines', 'dish_reviewed', 'menu_item',
        'online', 'table_booking','listed_as', 'listed_in_city']
data_df['combined'] = data_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

Vectorizing the words using CountVectorizer

In [12]:
documents = data_df['combined']
count_vectorizer = CountVectorizer(stop_words='english') # convert all words to lowercase and remove stop words
sparse_matrix = count_vectorizer.fit_transform(documents)

Calculate consine similarity of vectors

In [13]:
similarity_scores = cosine_similarity(sparse_matrix, sparse_matrix)

Creating dataframe for similarity scores

In [14]:
scores_df = pd.DataFrame(similarity_scores )
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9363,9364,9365,9366,9367,9368,9369,9370,9371,9372
0,1.000000,0.094141,0.213352,0.240834,0.181956,0.202981,0.091049,0.135173,0.254513,0.345770,...,0.131942,0.016912,0.167137,0.213352,0.038158,0.307613,0.335609,0.239236,0.031068,0.187084
1,0.094141,1.000000,0.152008,0.237584,0.086426,0.082639,0.074138,0.064205,0.069080,0.075801,...,0.018801,0.024098,0.095265,0.025335,0.598100,0.047818,0.012925,0.113633,0.649309,0.123040
2,0.213352,0.152008,1.000000,0.630670,0.025649,0.024526,0.038504,0.022230,0.023918,0.022496,...,0.351525,0.025031,0.494771,0.473684,0.056478,0.298020,0.000000,0.432789,0.061314,0.447315
3,0.240834,0.237584,0.630670,1.000000,0.026726,0.038333,0.040121,0.034745,0.037383,0.046881,...,0.386630,0.130410,0.644424,0.548408,0.147122,0.362284,0.041967,0.471454,0.159719,0.688040
4,0.181956,0.086426,0.025649,0.026726,1.000000,0.490044,0.431587,0.530855,0.454596,0.422084,...,0.085656,0.036596,0.012056,0.012825,0.013762,0.080687,0.157027,0.047935,0.014940,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,0.307613,0.047818,0.298020,0.362284,0.080687,0.100297,0.000000,0.048952,0.097814,0.183995,...,0.319458,0.157485,0.404672,0.430473,0.142134,1.000000,0.160488,0.358928,0.057864,0.455648
9369,0.335609,0.012925,0.000000,0.041967,0.157027,0.206452,0.019644,0.102072,0.323363,0.372997,...,0.099632,0.166013,0.138830,0.281936,0.187288,0.160488,1.000000,0.070253,0.078201,0.065202
9370,0.239236,0.113633,0.432789,0.471454,0.047935,0.064169,0.014392,0.000000,0.026820,0.092493,...,0.379572,0.187120,0.462329,0.413117,0.105550,0.358928,0.070253,1.000000,0.091670,0.429928
9371,0.031068,0.649309,0.061314,0.159719,0.014940,0.028571,0.000000,0.000000,0.013932,0.039311,...,0.068252,0.116642,0.144098,0.122628,0.789542,0.057864,0.078201,0.091670,1.000000,0.148888


In [15]:
def get_recommendations(name,scores_df):
    """ Function to recommed top 10 title based on cosine similarity.
        It takes top 30 matching restaurents and sorts them based on wighted rating in descending.
        We take top 10 unique restaurents from the sorted list to recommend.
    """
    global recommended
    top30 = []
    recommended = []
    name = name.lower()
    index = data_df[data_df['name'].str.lower()==name].index[0]
    cost = min(data_df[data_df['name'].str.lower()==name].cost_for_two)
    top30_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:31].index)
    
    for each in top30_list:
        if (data_df.iloc[each]['cost_for_two'] >= cost-400) & (data_df.iloc[each]['cost_for_two'] <= cost+400 ):
            top30.append(data_df.iloc[each]['name'])
        
    a = [x.lower() for x in top30]
    filtered = data_df[data_df['name'].str.lower().isin(a)]
    filtered_sorted = filtered.sort_values("weighted_rating",ascending=False)
    
    for i in filtered_sorted['name']:
        if recommended.count(i) <= 0:
            recommended.append(i)
        if len(recommended) == 10:
            break

    return recommended

get_recommendations('Spice Elephant',scores_df)

['Eshanya',
 'The Purple Pan',
 'Golden Rice',
 'Delicacy',
 'Chaarcoal',
 'Chullah Bhatti',
 'Chowmein',
 'Rivayat',
 'Al Khansah',
 'Tamarind - Tamarind Hospitality']

In [16]:
get_recommendations('Pai Vihar',scores_df)

['Sri Udupi Park',
 'Bengaluru Coffee House',
 'Shanthi Sagar',
 'Kalpavriksha Upahara',
 'The Krishna Grand Xpress',
 'Sagar Fast Food',
 'Srinidhi Sagar',
 'Hotel Chandrika',
 'Udupi Aatithya',
 'The Rasaganga']

In [17]:
a = [x.lower() for x in recommended]
a.append('pai vihar')
filtered = data_df[data_df['name'].str.lower().isin(a)]
filtered

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_reviewed,cuisines,cost_for_two,reviews_list,menu_item,listed_as,listed_in_city,online,table_booking,weighted_rating,combined
79,"4001/4002, Annapoorneshwari Plaza, Near Seetha...",Bengaluru Coffee House,Yes,No,4.1,201,Banashankari,Quick Bites,pongal kali kharabath rava dosa filter vada id...,north indian street food chinese south indian,300.0,best pongal banshankri pongal called tirupathi...,gobi do mushroom raita dinger fries pakoda po...,delivery dine-out,banashankari,Online,,3.9,Bengaluru Coffee House Banashankari Quick Bite...
172,"24, Subramaniapura Main Road, Uttarahalli, Ban...",Shanthi Sagar,Yes,No,3.1,28,Uttarahalli,Casual Dining,,north indian chinese south indian,400.0,slow service give preference service section p...,tea parotta bajji milk bournvita rice tattle ...,delivery,banashankari,Online,,3.1,Shanthi Sagar Uttarahalli Casual Dining Casual...
430,"77/1, 24th Main, 2nd Phase, JP Nagar, Bangalore",Shanthi Sagar,Yes,No,3.6,518,JP Nagar,Casual Dining,mushroom dosa paneer masala coffee palak,north indian chinese south indian,400.0,yummy south indian breakfast shanti sagar qual...,gobi do peking mushroom raita watermelon nort...,delivery,bannerghatta road,Online,,3.6,Shanthi Sagar JP Nagar Casual Dining Casual Di...
1536,"57, Opposite Galaxy Mall, Residency Road, Bang...",Sri Udupi Park,Yes,No,3.8,106,Residency Road,Quick Bites,noodles filter vada idli coffee kharabath,chinese south indian,200.0,finding good south indian restaurant difficult...,gobi bisi mushroom muskmelon ragi pakoda poor...,delivery dine-out,"residency road, brigade road",Online,,3.6,Sri Udupi Park Residency Road Quick Bites Quic...
1859,"611, AECS Layout C Block, Kundanahalli, Brooke...",Sri Udupi Park,No,No,3.4,13,Brookefield,Quick Bites,,north indian chinese south indian,300.0,great breakfast udapi masala dosa vada idali g...,,delivery,brookefield,,,3.2,Sri Udupi Park Brookefield Quick Bites Quick B...
2095,"611, 60Ft Main Road, AECS Layout, C Block Broo...",Sri Udupi Park,Yes,No,3.9,265,Brookefield,Quick Bites,tea naan bhaji dosa masala coffee pav,north indian chinese south indian,400.0,good veg restaurant brookefield best udupi res...,tea bhaji pav dosa masala coffee naan,dine-out,brookefield,Online,,3.8,Sri Udupi Park Brookefield Quick Bites Quick B...
2165,"314/B, 20th Main, 80 Feet Road, 8th Block, Opp...",Sagar Fast Food,Yes,No,4.0,137,Koramangala 8th Block,Quick Bites,neer dosa,north indian street food chinese south indian,250.0,sagar fast one best mangalorean restaurant ban...,gobi do mushroom ragi poori fresh watermelon ...,delivery,btm,Online,,3.8,Sagar Fast Food Koramangala 8th Block Quick Bi...
2199,"57, 12th Main Road, 6th Sector, Behind BDA Com...",The Rasaganga,Yes,No,3.9,163,HSR,Quick Bites,vada dahi chaat coffee,north indian street food chinese south indian,300.0,one finest veg restaurant chain city bangalore...,gobi do chilli mushroom raita muskmelon ragi ...,delivery,btm,Online,,3.7,The Rasaganga HSR Quick Bites Quick Bites Quic...
2282,"702, 6th Cross, 3rd Block, Behind BDA Complex,...",Sagar Fast Food,Yes,No,4.0,76,Koramangala 3rd Block,Quick Bites,tea thali cheese ka gajar halwa dosa coffee ne...,north indian fast food south indian,300.0,really delicious worth super foods quick respo...,gobi do peking chilli mushroom raita ragi pak...,delivery,btm,Online,,3.7,Sagar Fast Food Koramangala 3rd Block Quick Bi...
2601,"148/2, 20th Main Road, 2nd Cross, 1st Stage",Sagar Fast Food,Yes,No,3.7,66,BTM,Quick Bites,dosa idli masala lassi sweet,chinese south indian north indian,300.0,one best eat breakfast masala dosa idli awesom...,dosa idli masala lassi sweet,delivery,btm,Online,,3.5,Sagar Fast Food BTM Quick Bites Quick Bites Qu...


Creating vectors using TfIdf Vectorizer and calculate the similarity of vectors

In [18]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(data_df['combined'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

Creating dataframe of similarity vectors

In [19]:
scores_df = pd.DataFrame(cosine_similarities)
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9363,9364,9365,9366,9367,9368,9369,9370,9371,9372
0,1.000000,0.032067,0.040423,0.056655,0.023804,0.027243,0.013263,0.016890,0.041361,0.066299,...,0.017518,0.004738,0.024693,0.037875,0.005900,0.038774,0.076001,0.036411,0.003004,0.028146
1,0.032067,1.000000,0.076868,0.122563,0.021862,0.021010,0.027959,0.017044,0.019062,0.019974,...,0.001692,0.002285,0.012329,0.003016,0.120256,0.005241,0.001350,0.025557,0.130471,0.021660
2,0.040423,0.076868,1.000000,0.182780,0.006517,0.006263,0.010129,0.005362,0.005997,0.005954,...,0.055803,0.011939,0.087227,0.099464,0.006657,0.051142,0.000000,0.077207,0.007222,0.078256
3,0.056655,0.122563,0.182780,1.000000,0.008925,0.009871,0.013872,0.008341,0.009452,0.010614,...,0.078212,0.041134,0.160367,0.142948,0.019846,0.082231,0.004602,0.107053,0.042381,0.223865
4,0.023804,0.021862,0.006517,0.008925,1.000000,0.130117,0.112894,0.150077,0.123495,0.105777,...,0.013033,0.004776,0.000995,0.001033,0.001247,0.007435,0.023008,0.003106,0.001353,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,0.038774,0.005241,0.051142,0.082231,0.007435,0.009337,0.000000,0.004550,0.010736,0.021062,...,0.097061,0.082746,0.148930,0.173002,0.120951,1.000000,0.055291,0.059438,0.026585,0.109996
9369,0.076001,0.001350,0.000000,0.004602,0.023008,0.035984,0.002647,0.017493,0.066195,0.099858,...,0.052332,0.075395,0.088305,0.138733,0.127308,0.055291,1.000000,0.013752,0.026929,0.015838
9370,0.036411,0.025557,0.077207,0.107053,0.003106,0.012529,0.002968,0.000000,0.002895,0.013285,...,0.082679,0.077725,0.126030,0.100084,0.034301,0.059438,0.013752,1.000000,0.030149,0.075815
9371,0.003004,0.130471,0.007222,0.042381,0.001353,0.002840,0.000000,0.000000,0.001474,0.004164,...,0.029008,0.047902,0.058482,0.055920,0.239386,0.026585,0.026929,0.030149,1.000000,0.032979


In [20]:
def get_recommendations(name,scores_df = scores_df):
    """ Function to recommed top 10 title based on cosine similarity.
        It takes top 30 matching restaurents and sorts them based on wighted rating in descending.
        We take top 10 unique restaurents from the sorted list to recommend.
    """
    global recommended
    recommended = []
    top30 = []
    name = name.lower()
    index = data_df[data_df['name'].str.lower()==name].index[0]
    cost = min(data_df[data_df['name'].str.lower()==name].cost_for_two)
    top30_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:31].index)
    
    for each in top30_list:
        if (data_df.iloc[each]['cost_for_two'] >= cost-400) & (data_df.iloc[each]['cost_for_two'] <= cost+400 ):
            top30.append(data_df.iloc[each]['name'])
        
    a = [x.lower() for x in top30]
    filtered = data_df[data_df['name'].str.lower().isin(a)]
    filtered_sorted = filtered.sort_values("weighted_rating", ascending=False)
    
    for i in filtered_sorted['name']:
        if recommended.count(i) <= 0:
            recommended.append(i)
        if len(recommended) == 10:
            break

    return recommended

get_recommendations('Spice Elephant',scores_df)

['Eshanya',
 'Shanghai Court',
 "Chung's Chinese Corner",
 'Delicacy',
 'The Purple Pan',
 'Beijing Bites',
 'East Delicious',
 'Chowmein',
 'Popsies',
 'Hongkong Garden']

In [21]:
get_recommendations('Pai Vihar',scores_df)

['Sri Udupi Park',
 'Bengaluru Coffee House',
 'Sagar Fast Food',
 'Kalpavriksha Upahara',
 'The Krishna Grand Xpress',
 'Sri Lakshmi Vaibhav',
 'Shanthi Sagar',
 'Sukh Sagar',
 'Hotel Chandrika',
 'Srinidhi Sagar']

In [22]:
get_recommendations('Rosewood International Hotel - Bar & Restaurant',scores_df)

['Bridgeway',
 'Abhiruchi Restaurant',
 'Utsav Restaurant',
 'Nisarga',
 'Andhra Bhojanam',
 'Blue Wings Bar & Restaurant',
 'Cafe Monarch Luxur',
 'Dine One One Restaurant',
 'Chillies Restaurant',
 'Triveni']

In [23]:
get_recommendations('Fujian Express',scores_df)

['Szechuan Dragon',
 'Delicacy',
 'Beijing Bites',
 "Chung's Chinese Corner",
 'Popsies',
 'Mountain Spice',
 'Fujian Express',
 "Kenny's Restaurant",
 'Anu Momos Chinese Kitchen',
 'Mr Manchurian']

In [24]:
get_recommendations('San Churro Cafe',scores_df)

['Onesta',
 'Tea Villa Cafe',
 'Eurasia Pasta and Barbeque by Little Italy',
 'Grubhouse',
 '@Italy',
 'Cafe Cassia& Deli',
 'Burp',
 'THE BEL ST.',
 'Sunny Cafe & Kitchen',
 'Crisp Cafe']

In [25]:
get_recommendations('Woodee Pizza',scores_df)

["Domino's Pizza",
 'Ovenstory Pizza',
 'Pizza Hut',
 'Cafe Coffee Day',
 'Cuppa',
 'Hide Out Cafe',
 'Cafe Hush',
 'The Studio Cafe',
 'Butterly Cafe',
 'CAFE NOVA']

In [26]:
get_recommendations('Behrouz Biryani',scores_df)

["Mani's Dum Biryani",
 'Behrouz Biryani',
 "Paul's Dum Biryani",
 'The Good Bowl',
 'Fat Chef Biryani Wale',
 'Biryani Treats',
 'S P Dum Biryani',
 'LSD Cafe']