In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the data set
data_df = pd.read_csv("data/cleaned_data.csv")

In [3]:
data_df.shape

(9373, 15)

In [4]:
data_df.head(5)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost_for_two,reviews_list,menu_item,listed_as,listed_in_city
0,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,thai fish tikka buffet mutton fried lassi curr...,thai chinese north indian,800.0,dinner family turned good choose suitable ages...,seven karepak ki fish safed batti masala and ...,delivery buffet,banashankari
1,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,vegetarian mango masala rice roti garlic dosa ...,north indian south indian,300.0,great proper karnataka style full meals twice ...,vegetarian mango dosa masala rice garlic roti ...,dine-out buffet,banashankari
2,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,gol paratha kulcha panipuri gappe,north indian rajasthani,600.0,good restaurant neighbourhood buffet system pr...,gol paratha kulcha panipuri gappe,buffet,banashankari
3,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,honey chicken,andhra chinese south indian north indian,800.0,awesome great servicefriendly staffsgood quali...,honey chicken,buffet,banashankari
4,"12,29 Near PES University Back Gate, D'Souza N...",Caf Down The Alley,Yes,No,4.1,402,Banashankari,Cafe,mango chicken pizza crispy honey sandwich past...,cafe,500.0,ended saturday afternoon hectic day good ambie...,ki crush masala onion blackcurrant hulk alfre...,cafes dine-out,banashankari


In [5]:
data_df.isna().sum()

address             0
name                0
online_order        0
book_table          0
rate                0
votes               0
location            0
rest_type           0
dish_liked        888
cuisines            4
cost_for_two        0
reviews_list      294
menu_item         644
listed_as           0
listed_in_city      0
dtype: int64

In [6]:
data_df['dish_liked'] = data_df['dish_liked'].fillna('')
data_df['cuisines'] = data_df['cuisines'].fillna('')
data_df['reviews_list'] = data_df['reviews_list'].fillna('')
data_df['menu_item'] = data_df['menu_item'].fillna('')

Creating new columns for online_order and book_table to make more sense while calculating similarity.

In [7]:
import numpy as np
data_df['online'] = np.where(data_df['online_order'] == 'Yes', 'Online', '')
data_df['table_booking'] = np.where(data_df['book_table'] == 'Yes', 'TableBooking', '')

We are calculating wighted rating, to balance the difference in number of votes with respect to rating.

\begin{equation} Weighted Rating =(\frac{v}{v+m} * {R}) + (\frac{m}{v+m} * {C}) \end{equation}

v --> the number of votes 
m --> the minimum votes required to be listed in the chart 
R --> the average rating of the restaurent 
C --> the mean vote across the whole

In [8]:
def weighted_rating(data, m, c):
    v = data['votes']
    R = data['rate']
    wr = (v/(v+m) * R) + (m/(m+v) * C)
    return round(wr,1)

In [9]:
# this is V
vote_counts = data_df[data_df['votes'].notnull()]['votes'].astype('int')

# this is R
vote_averages = data_df[data_df['rate'].notnull()]['rate'].astype('int')

# this is C
C = vote_averages.mean()
m = vote_counts.quantile(0.50)
print(C)
print(m)

3.163341512856076
53.0


In [10]:
data_df['weighted_rating'] = data_df.apply(lambda x: weighted_rating(x, m, C), axis =1)

In [11]:
cols = ['location', 'rest_type', 'cuisines', 'dish_liked', 'menu_item', 'cuisines',
        'online', 'table_booking','listed_as', 'listed_in_city']
data_df['combined'] = data_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

Vectorizing the words using CountVectorizer

In [12]:
documents = data_df['combined']
count_vectorizer = CountVectorizer(stop_words='english') # convert all words to lowercase and remove stop words
sparse_matrix = count_vectorizer.fit_transform(documents)

Calculate consine similarity of vectors

In [13]:
similarity_scores = cosine_similarity(sparse_matrix, sparse_matrix)

Creating dataframe for similarity scores

In [14]:
scores_df = pd.DataFrame(similarity_scores )
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9363,9364,9365,9366,9367,9368,9369,9370,9371,9372
0,1.000000,0.212598,0.157279,0.219421,0.205028,0.242351,0.158491,0.148848,0.270929,0.375895,...,0.118934,0.018230,0.089839,0.252755,0.223323,0.302329,0.419937,0.196168,0.241415,0.186620
1,0.212598,1.000000,0.308248,0.497617,0.144659,0.133588,0.043853,0.087518,0.107694,0.130008,...,0.053791,0.021437,0.274675,0.070767,0.246200,0.081594,0.109737,0.230680,0.385276,0.351123
2,0.157279,0.308248,1.000000,0.387829,0.025367,0.023426,0.028837,0.023020,0.023606,0.045596,...,0.028298,0.000000,0.277885,0.037229,0.172693,0.030661,0.000000,0.191614,0.213352,0.269380
3,0.219421,0.497617,0.387829,1.000000,0.113754,0.105048,0.077589,0.051615,0.105857,0.122679,...,0.088828,0.025286,0.373834,0.250418,0.406562,0.178737,0.129439,0.286417,0.478365,0.724785
4,0.205028,0.144659,0.025367,0.113754,1.000000,0.478224,0.385699,0.518563,0.437594,0.406562,...,0.205845,0.066157,0.013041,0.148507,0.091175,0.122305,0.190879,0.127392,0.087610,0.036120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,0.302329,0.081594,0.030661,0.178737,0.122305,0.152808,0.098143,0.065288,0.140595,0.232767,...,0.176566,0.127939,0.157622,0.337873,0.269377,1.000000,0.260477,0.208318,0.226908,0.327424
9369,0.419937,0.109737,0.000000,0.129439,0.190879,0.250190,0.111995,0.111755,0.383905,0.420564,...,0.178590,0.136871,0.134901,0.596413,0.440135,0.260477,1.000000,0.131779,0.297774,0.130773
9370,0.196168,0.230680,0.191614,0.286417,0.127392,0.193764,0.170375,0.081604,0.111575,0.168366,...,0.317661,0.199889,0.279101,0.142969,0.242319,0.208318,0.131779,1.000000,0.220589,0.272833
9371,0.241415,0.385276,0.213352,0.478365,0.087610,0.150251,0.113822,0.022716,0.116468,0.146224,...,0.097733,0.083462,0.301625,0.348991,0.532518,0.226908,0.297774,0.220589,1.000000,0.417705


In [15]:
def recommend_by_name(name,scores_df):
    """ Function to recommed top 10 title based on cosine similarity.
        It takes top 30 matching restaurents and sorts them based on wighted rating in descending.
        We take top 10 unique restaurents from the sorted list to recommend.
    """
    global recommended
    top30 = []
    recommended = []
    name = name.lower()
    index = data_df[data_df['name'].str.lower()==name].index[0]
    cost = min(data_df[data_df['name'].str.lower()==name].cost_for_two)
    top30_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:31].index)
    
    for each in top30_list:
        if (data_df.iloc[each]['cost_for_two'] >= cost-200) & (data_df.iloc[each]['cost_for_two'] <= cost+200 ):
            top30.append(data_df.iloc[each]['name'])
        
    a = [x.lower() for x in top30]
    filtered = data_df[data_df['name'].str.lower().isin(a)]
    filtered_sorted = filtered.sort_values("weighted_rating",ascending=False)
    
    for i in filtered_sorted['name']:
        if recommended.count(i) <= 0:
            recommended.append(i)
        if len(recommended) == 10:
            break

    return recommended

recommend_by_name('Spice Elephant',scores_df)

['The Purple Pan',
 'Golden Rice',
 'Chullah Bhatti',
 'Chowmein',
 'Al Khansah',
 'Tamarind - Tamarind Hospitality',
 'Rustic Stove',
 'Rannaghar',
 'Central Jail Restaurant',
 'Spice Up']

In [16]:
recommend_by_name('Pai Vihar',scores_df)

['Sri Udupi Park',
 'Savi Sagar',
 'Bengaluru Coffee House',
 'Rajathadri Food Fort',
 'Sagar Fast Food',
 'Shanthi Sagar',
 'The Krishna Grand Xpress',
 'Udupi Aatithya',
 'Hotel Chandrika',
 'Sukh Sagar']

In [17]:
a = [x.lower() for x in recommended]
a.append('pai vihar')
filtered = data_df[data_df['name'].str.lower().isin(a)]
filtered

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost_for_two,reviews_list,menu_item,listed_as,listed_in_city,online,table_booking,weighted_rating,combined
79,"4001/4002, Annapoorneshwari Plaza, Near Seetha...",Bengaluru Coffee House,Yes,No,4.1,201,Banashankari,Quick Bites,rava fried masala pongal idli tirupathi kharab...,north indian chinese street food south indian,300.0,best pongal banshankri pongal called tirupathi...,ki kadai masala and onion puri hakka khees fr...,delivery dine-out,banashankari,Online,,3.9,Banashankari Quick Bites north indian chinese ...
172,"24, Subramaniapura Main Road, Uttarahalli, Ban...",Shanthi Sagar,Yes,No,3.1,28,Uttarahalli,Casual Dining,manchurian roti dal,north indian chinese south indian,400.0,slow service give preference service section p...,rava soup butter masala open onion badam chow...,delivery,banashankari,Online,,3.1,Uttarahalli Casual Dining north indian chinese...
290,"30th Cross, 8th Main, Near Jain Temple, 4th Bl...",Sukh Sagar,Yes,No,3.8,155,Jayanagar,Casual Dining,dosa chaat masala noodles rice bhaji roti sand...,north indian chinese street food south indian,450.0,quality quantity good service quick hygenic re...,plate masala and onion puri ganga cold moon t...,delivery,banashankari,Online,,3.6,Jayanagar Casual Dining north indian chinese s...
430,"77/1, 24th Main, 2nd Phase, JP Nagar, Bangalore",Shanthi Sagar,Yes,No,3.6,518,JP Nagar,Casual Dining,paneer masala mushroom dosa coffee palak,north indian chinese south indian,400.0,yummy south indian breakfast shanti sagar qual...,masala and onion puri hakka fried bhara ginge...,delivery,bannerghatta road,Online,,3.6,JP Nagar Casual Dining north indian chinese so...
1352,"Foodcourt, Soul Space Spirit, 5th Floor, Bella...",Savi Sagar,No,No,3.2,6,Bellandur,"Food Court, Quick Bites",noodles roti,chinese south indian north indian,400.0,bad taste waste money triple schezwan noodles ...,noodles roti,delivery,bellandur,,,3.2,"Bellandur Food Court, Quick Bites chinese sout..."
1536,"57, Opposite Galaxy Mall, Residency Road, Bang...",Sri Udupi Park,Yes,No,3.8,106,Residency Road,Quick Bites,idli chaai noodles tea onion kharabath filter...,chinese south indian,200.0,finding good south indian restaurant difficult...,kadai masala and onion hakka fried orange ric...,delivery dine-out,"brigade road, residency road",Online,,3.6,Residency Road Quick Bites chinese south india...
1859,"611, AECS Layout C Block, Kundanahalli, Brooke...",Sri Udupi Park,No,No,3.4,13,Brookefield,Quick Bites,lime butter,north indian chinese south indian,300.0,great breakfast udapi masala dosa vada idali g...,lime butter,delivery,brookefield,,,3.2,Brookefield Quick Bites north indian chinese s...
2095,"611, 60Ft Main Road, AECS Layout, C Block Broo...",Sri Udupi Park,Yes,No,3.9,265,Brookefield,Quick Bites,vegetarian dosa tea masala idli noodles punjab...,north indian chinese south indian,400.0,good veg restaurant brookefield best udupi res...,vegetarian pav tea idli masala noodles punjabi...,dine-out,brookefield,Online,,3.8,Brookefield Quick Bites north indian chinese s...
2165,"314/B, 20th Main, 80 Feet Road, 8th Block, Opp...",Sagar Fast Food,Yes,No,4.0,137,Koramangala 8th Block,Quick Bites,dosa idli neer,north indian street food south indian chinese,250.0,sagar fast one best mangalorean restaurant ban...,kadai masala and onion puri cold fried ginger...,delivery,btm,Online,,3.8,Koramangala 8th Block Quick Bites north indian...
2282,"702, 6th Cross, 3rd Block, Behind BDA Complex,...",Sagar Fast Food,Yes,No,4.0,76,Koramangala 3rd Block,Quick Bites,vegetarian chaat butter tea gajar halwa roti k...,fast food north indian south indian,300.0,really delicious worth super foods quick respo...,masala and onion puri fried fast ginger spani...,delivery,btm,Online,,3.7,Koramangala 3rd Block Quick Bites fast food no...


Creating vectors using TfIdf Vectorizer and calculate the similarity of vectors

In [18]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(data_df['combined'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

Creating dataframe of similarity vectors

In [19]:
scores_df = pd.DataFrame(cosine_similarities)
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9363,9364,9365,9366,9367,9368,9369,9370,9371,9372
0,1.000000,0.041146,0.026008,0.043650,0.028799,0.027865,0.019111,0.016967,0.040324,0.074944,...,0.011633,0.004796,0.009740,0.029067,0.026119,0.032559,0.093217,0.028536,0.027761,0.016461
1,0.041146,1.000000,0.076325,0.168319,0.030422,0.025675,0.016888,0.017928,0.022353,0.025045,...,0.006138,0.002063,0.036792,0.007037,0.032705,0.008124,0.016336,0.038331,0.063396,0.062232
2,0.026008,0.076325,1.000000,0.092582,0.005922,0.005374,0.006769,0.004829,0.005324,0.008833,...,0.003361,0.000000,0.030418,0.005454,0.015814,0.004106,0.000000,0.021947,0.021438,0.025506
3,0.043650,0.168319,0.092582,1.000000,0.023346,0.015469,0.013394,0.012966,0.019278,0.021888,...,0.014608,0.002639,0.058140,0.052233,0.050051,0.066585,0.014177,0.040339,0.064450,0.238747
4,0.028799,0.030422,0.005922,0.023346,1.000000,0.111462,0.088388,0.157764,0.102683,0.112651,...,0.042404,0.008945,0.001153,0.016267,0.011186,0.010339,0.023378,0.015519,0.006610,0.002020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9368,0.032559,0.008124,0.004106,0.066585,0.010339,0.012908,0.008044,0.005524,0.011899,0.023046,...,0.052957,0.083467,0.113155,0.094700,0.112154,1.000000,0.064550,0.023442,0.047026,0.120115
9369,0.093217,0.016336,0.000000,0.014177,0.023378,0.051992,0.013809,0.016609,0.088430,0.105416,...,0.049801,0.080353,0.102879,0.242113,0.177476,0.064550,1.000000,0.019311,0.046480,0.021344
9370,0.028536,0.038331,0.021947,0.040339,0.015519,0.030737,0.021998,0.009164,0.016329,0.032791,...,0.085268,0.053159,0.084429,0.021615,0.040829,0.023442,0.019311,1.000000,0.033150,0.029804
9371,0.027761,0.063396,0.021438,0.064450,0.006610,0.013092,0.009913,0.002350,0.010313,0.013953,...,0.019958,0.035904,0.078796,0.056886,0.094703,0.047026,0.046480,0.033150,1.000000,0.060668


In [20]:
def recommend_by_name(name,scores_df = scores_df):
    """ Function to recommed top 10 title based on cosine similarity.
        It takes top 30 matching restaurents and sorts them based on wighted rating in descending.
        We take top 10 unique restaurents from the sorted list to recommend.
    """
    global recommended
    recommended = []
    top30 = []
    name = name.lower()
    index = data_df[data_df['name'].str.lower()==name].index[0]
    cost = min(data_df[data_df['name'].str.lower()==name].cost_for_two)
    top30_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:31].index)
    
    for each in top30_list:
        if (data_df.iloc[each]['cost_for_two'] >= cost-200) & (data_df.iloc[each]['cost_for_two'] <= cost+200 ):
            top30.append(data_df.iloc[each]['name'])
        
    a = [x.lower() for x in top30]
    filtered = data_df[data_df['name'].str.lower().isin(a)]
    filtered_sorted = filtered.sort_values("weighted_rating", ascending=False)
    
    for i in filtered_sorted['name']:
        if recommended.count(i) <= 0:
            recommended.append(i)
        if len(recommended) == 10:
            break

    return recommended

recommend_by_name('Spice Elephant',scores_df)

['Shanghai Court',
 'Eshanya',
 "Chung's Chinese Corner",
 'Beijing Bites',
 'Chowmein',
 'Hakka Chinese Restaurant',
 'Oogway Express',
 'Happy Singh',
 'Xian',
 'Golkonda Chimney']

In [21]:
recommend_by_name('Pai Vihar',scores_df)

['Sri Udupi Park',
 'Swadista Aahar',
 'Rajathadri Food Fort',
 'Bengaluru Coffee House',
 'Shanthi Sagar',
 'Sri Lakshmi Vaibhav',
 'The Krishna Grand Xpress',
 'Sagar Fast Food',
 'Hotel Chandrika',
 'The Rasaganga']