In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the data set
data_df = pd.read_csv("data/cleaned_data.csv")

In [3]:
data_df = data_df.drop(["Unnamed: 0"], axis = 1)
data_df.head(5)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost_for_two,reviews_list,listed_as,listed_in_city
0,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,dinner family turned good choose suitable ages...,"Buffet, Dine-out, Delivery","Banashankari, Basavanagudi"
1,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,great proper karnataka style full meals twice ...,"Buffet, Dine-out",Banashankari
2,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,good restaurant neighbourhood buffet system pr...,"Buffet, Dine-out","Banashankari, Basavanagudi"
3,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,"North Indian, South Indian, Andhra, Chinese",800.0,awesome great servicefriendly staffsgood quali...,"Buffet, Dine-out, Delivery",Banashankari
4,"12,29 Near PES University Back Gate, D'Souza N...",Caf Down The Alley,Yes,No,4.1,402,Banashankari,Cafe,Cafe,500.0,ended saturday afternoon hectic day nthe good ...,"Dine-out, Cafes, Delivery",Banashankari


\begin{equation} Weighted Rating =(\frac{v}{v+m} * {R}) + (\frac{m}{v+m} * {C}) \end{equation}

v --> the number of votes 
m --> the minimum votes required to be listed in the chart 
R --> the average rating of the restaurent 
C --> the mean vote across the whole

In [4]:
def weighted_rating(data, m, c):
    v = data['votes']
    R = data['rate']
    wr = (v/(v+m) * R) + (m/(m+v) * C)
    return round(wr,1)

In [5]:
# this is V
vote_counts = data_df[data_df['votes'].notnull()]['votes'].astype('int')

# this is R
vote_averages = data_df[data_df['rate'].notnull()]['rate'].astype('int')

# this is C
C = vote_averages.mean()
m = vote_counts.quantile(0.50)
print(C)
print(m)

3.189391000775795
42.0


In [8]:
data_df['weighted_rating'] = data_df.apply(lambda x: weighted_rating(x, m, C), axis =1)

In [9]:
data_df.head(5)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost_for_two,reviews_list,listed_as,listed_in_city,weighted_rating
0,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,dinner family turned good choose suitable ages...,"Buffet, Dine-out, Delivery","Banashankari, Basavanagudi",4.1
1,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,great proper karnataka style full meals twice ...,"Buffet, Dine-out",Banashankari,3.5
2,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,good restaurant neighbourhood buffet system pr...,"Buffet, Dine-out","Banashankari, Basavanagudi",3.7
3,"19/1, New Timberyard Layout, Beside Satellite ...",Rosewood International Hotel - Bar & Restaurant,No,No,3.6,8,Mysore Road,Casual Dining,"North Indian, South Indian, Andhra, Chinese",800.0,awesome great servicefriendly staffsgood quali...,"Buffet, Dine-out, Delivery",Banashankari,3.3
4,"12,29 Near PES University Back Gate, D'Souza N...",Caf Down The Alley,Yes,No,4.1,402,Banashankari,Cafe,Cafe,500.0,ended saturday afternoon hectic day nthe good ...,"Dine-out, Cafes, Delivery",Banashankari,4.0


In [10]:
cols = ['online_order', 'book_table', 'weighted_rating','location', 'rest_type', 'cuisines', 'cost_for_two', 'reviews_list']
data_df['combined'] = data_df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [12]:
documents = data_df['combined']
count_vectorizer = CountVectorizer(stop_words='english') # convert all words to lowercase and remove stop words
sparse_matrix = count_vectorizer.fit_transform(documents)

In [13]:
similarity_scores = cosine_similarity(sparse_matrix, sparse_matrix)

In [16]:
scores_df = pd.DataFrame(similarity_scores )
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10302,10303,10304,10305,10306,10307,10308,10309,10310,10311
0,1.000000,0.375806,0.293348,0.219392,0.525118,0.433073,0.493361,0.117755,0.151128,0.385416,...,0.399242,0.355322,0.336975,0.225806,0.627967,0.425539,0.451206,0.528222,0.449073,0.342129
1,0.375806,1.000000,0.178188,0.193109,0.276771,0.251701,0.280580,0.085162,0.179071,0.229709,...,0.304867,0.199107,0.245462,0.124435,0.373239,0.306341,0.340968,0.360438,0.233134,0.299440
2,0.293348,0.178188,1.000000,0.074901,0.245940,0.197439,0.198830,0.070828,0.047901,0.195385,...,0.205948,0.165861,0.154324,0.079882,0.188655,0.127087,0.222650,0.306518,0.114418,0.175828
3,0.219392,0.193109,0.074901,1.000000,0.127436,0.151572,0.231852,0.050713,0.081230,0.069685,...,0.168156,0.130442,0.235297,0.314923,0.286574,0.266066,0.124622,0.172974,0.132228,0.177503
4,0.525118,0.276771,0.245940,0.127436,1.000000,0.458741,0.547442,0.129089,0.144063,0.386574,...,0.356208,0.327929,0.227906,0.168543,0.460699,0.245979,0.383066,0.406917,0.330352,0.302731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10307,0.425539,0.306341,0.127087,0.266066,0.245979,0.274028,0.294032,0.105672,0.147694,0.204052,...,0.193343,0.192227,0.392383,0.376484,0.475602,1.000000,0.283331,0.321111,0.309546,0.269546
10308,0.451206,0.340968,0.222650,0.124622,0.383066,0.260273,0.303690,0.114754,0.113926,0.243816,...,0.309779,0.243653,0.270825,0.204072,0.498271,0.283331,1.000000,0.322201,0.313141,0.264885
10309,0.528222,0.360438,0.306518,0.172974,0.406917,0.351912,0.396841,0.138320,0.114332,0.372480,...,0.363387,0.264448,0.266352,0.192001,0.373623,0.321111,0.322201,1.000000,0.248274,0.327622
10310,0.449073,0.233134,0.114418,0.132228,0.330352,0.274536,0.354753,0.053277,0.051473,0.279316,...,0.198012,0.263471,0.243984,0.264120,0.426100,0.309546,0.313141,0.248274,1.000000,0.214941


In [27]:
def recommend_by_name(name,scores_df, df):
    recommended = []
    name = name.lower()
    index = data_df[data_df['name'].str.lower()==name].index[0]
    top10_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:11].index)
    
    for each in top10_list:
        recommended.append(data_df.iloc[each]['name'])
        
    return recommended

recommend_by_name('Spice Elephant',scores_df, data_df)

['atithi',
 'flavours - octave hotel & spa',
 'the onyx - the hhi select bengaluru',
 'urban tamaasha',
 'jalsa gold',
 'west wood',
 'paprica',
 'cinnamon',
 "palki's",
 'unico']

In [28]:
def recommend_by_resttype(resttype,scores_df, df):
    recommended = []
    resttype = resttype.lower()
    index = data_df[data_df['rest_type'].str.lower()==resttype].index[0]
    top10_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:11].index)
    
    for each in top10_list:
        recommended.append(data_df.iloc[each]['name'])
        
    return recommended

recommend_by_name('Quick Bites',scores_df, data_df)

['five star chicken',
 'habib shaheb kolkatta katti rolls',
 'kushi meals',
 'the hungry',
 'the chaat shop',
 "indian's biriyani point",
 'fahad hotel',
 'amaravati food court',
 'vrs food zone',
 'nisarga restaurant']