In [417]:
import pickle
import random as rd

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [418]:
# Read the cleaned Swiggy restaurant data.
# A forward slash is used so the relative path resolves on every OS
# (a backslash is only a path separator on Windows).
restaurants = pd.read_csv("data/swiggy_cleaned.csv")

In [419]:
# Notebook-wide constants derived from the loaded data
VOTE_COUNT_QUANTILE = 0.85  # percentile of 'Number of Ratings' used as the vote threshold
rating_counts = restaurants['Number of Ratings']

# Vote count threshold (truncated to an integer) and the mean rating over all rows
MIN_VOTE_COUNT = int(rating_counts.quantile(VOTE_COUNT_QUANTILE))
AVERAGE_RATING = restaurants['Rating'].mean()

print(MIN_VOTE_COUNT, AVERAGE_RATING)



100 3.427623107428994


In [420]:
# calculate revised ratings based on Average ratings and total number of ratings
def revised_ratings(x : pd.DataFrame, 
                    avg_rating : float = AVERAGE_RATING, 
                    min_votecount : float = MIN_VOTE_COUNT ) -> pd.Series:
    """
    Returns a revised rating: a weighted mean of the restaurant's own rating and
    the overall average rating, weighted by the number of ratings.

    revised = (v * R + m * C) / (v + m)
    where v = 'Number of Ratings', R = 'Rating', m = min_votecount, C = avg_rating

    1. x - frame (or a single row) providing 'Number of Ratings' and 'Rating'
    2. avg_rating - rating the result is pulled towards when there are few votes
    3. min_votecount - weight given to avg_rating, expressed as a number of votes
    """
    votes = x['Number of Ratings']
    # The denominator must use the same min_votecount as the numerator;
    # otherwise a caller-supplied value would not produce a weighted mean.
    total_weight = votes + min_votecount
    return (votes * x['Rating'] / total_weight) + (min_votecount * avg_rating / total_weight)


# revised_ratings only does column-wise arithmetic, so it can score the whole
# frame in one vectorized call instead of being applied row by row.
restaurants['Revised Rating'] = revised_ratings(restaurants)


In [421]:
# Drop every restaurant that has fewer than MIN_VOTE_COUNT ratings
# or whose revised rating is 3.5 or lower, then renumber the rows from 0.
too_few_votes = restaurants['Number of Ratings'] < MIN_VOTE_COUNT
low_revised_rating = restaurants['Revised Rating'] <= 3.5
rows_to_drop = restaurants[too_few_votes | low_revised_rating].index
restaurants = restaurants.drop(rows_to_drop).reset_index(drop=True)

# Save the filtered frame (without the index column)
restaurants.to_csv('models/restaurants.csv', index=False)

# Top rated restaurants in the area

In [433]:
#randomly selecting a Location from the list - emulating user selection
def get_location()-> str:
    """
    Picks one value at random from the distinct entries of the
    'Location' column of `restaurants` and returns it.
    """
    known_locations = restaurants['Location'].unique().tolist()
    (picked,) = rd.sample(known_locations, 1)
    return picked

# Module-level "current location"; read by get_restaurant(), filter_on_location()
# and get_recommendations(). Re-running this line picks a new random location.
LOCATION = get_location()

#randomly selecting a restaurant from the current location - emulating user last order
def get_restaurant()-> str:
    """
    Picks one restaurant name at random among the rows of `restaurants`
    whose 'Location' equals the module-level LOCATION.
    """
    in_location = restaurants['Location'] == LOCATION
    names_in_location = restaurants.loc[in_location, 'Restaurant Name'].tolist()
    (picked,) = rd.sample(names_in_location, 1)
    return picked
    
# Module-level "last ordered restaurant"; get_recommendations() looks it up
# together with LOCATION to find the row to compare against.
RESTAURANT = get_restaurant()

In [423]:
# Candidate selection: restaurants whose Location is exactly the chosen one.
# An equality test is used (as everywhere else in this notebook) because
# str.contains treats LOCATION as a regex/substring and can match other
# locations or fail on special characters.
print(LOCATION, RESTAURANT)
all_candidates = restaurants[restaurants['Location'] == LOCATION]

# Candidate ranking: highest revised rating first
candidates = all_candidates.sort_values(by='Revised Rating', ascending=False)

# Candidate filtering: keep the five best and hide the columns not meant for display
top_in_area = candidates.drop(columns=['Location','Area','Revised Rating', 'Number of Ratings']).head(5)
top_in_area

Kanchrapara Sahjadi Restora


Unnamed: 0,Restaurant Name,Cuisine,Rating,Average Price,Pure Veg
13283,Chaurasia Pan Shop,paan,4.7,100,nonveg
13125,Vip Sweets Pvt. Ltd.,sweets,4.7,200,nonveg
13204,Land Of Cakes,"bakery,desserts",4.7,200,nonveg
13234,Jain Sweets,"sweets,beverages",4.6,100,nonveg
13139,Keventers Ice Cream,"icecream,desserts",4.6,200,nonveg


# Based on the previous orders

Considering only the Pure Veg flag and the cuisines as features. Only restaurants with a revised rating above 3.5 are used (lower-rated ones were already dropped above).

In [425]:
# Lookup table: row label -> (restaurant name, location).
# It is taken after the index was reset, so its labels are the row positions 0..n-1.
DATA_INDEX = restaurants.loc[:, ['Restaurant Name', 'Location']]

In [426]:
# Make feature string - combining Cuisines and Pure Veg column
def make_string(x):
    """
    Builds the feature string for one restaurant: the comma-separated
    'Cuisine' entries turned into space-separated words, followed by a
    space and the 'Pure Veg' value.
    """
    cuisine_words = x['Cuisine'].replace(',', ' ')
    return cuisine_words + " " + x['Pure Veg']

# Build the feature string of every restaurant, append it as 'Cuisine Feature'
# and remove the columns that are no longer needed from here on.
cuisine_feature = restaurants.apply(make_string, axis=1)
restaurants = (
    restaurants
    .assign(**{'Cuisine Feature': cuisine_feature})
    .drop(columns=['Cuisine','Pure Veg', 'Area','Rating','Number of Ratings'])
)

In [427]:
# Preview the first rows after adding 'Cuisine Feature' and dropping the unused columns
restaurants.head()

Unnamed: 0,Restaurant Name,Average Price,Location,Revised Rating,Cuisine Feature
0,Roll Express,200,Abohar,3.863812,fastfood snacks veg
1,Grill Masters,250,Abohar,3.763812,italianamerican fastfood veg
2,Snakkers,200,Abohar,3.813812,burgers pastas veg
3,Picado International Food,300,Abohar,3.863812,pizzas beverages nonveg
4,Grill Master,250,Abohar,4.071271,pizzas indian veg


In [428]:
# Vectorize the features: convert each 'Cuisine Feature' string into a row of a
# sparse matrix, using the vocabulary of the whole feature column
vectorizer = CountVectorizer(stop_words='english')
vector_matrix = vectorizer.fit_transform(restaurants['Cuisine Feature'])

# Construct the similarity matrix using cosine similarity between the rows of
# vector_matrix; get_recommendations() reads one row of it per lookup
RESTAURANT_SIMILARITY = cosine_similarity(vector_matrix)

In [430]:
#filter for region
def filter_on_location():
    """
    Returns the index labels (a pandas Index) of the DATA_INDEX rows whose
    'Location' equals the module-level LOCATION.
    """
    in_current_location = DATA_INDEX['Location'].eq(LOCATION)
    return DATA_INDEX.loc[in_current_location].index


def get_recommendations(top_n : int = 4, exclude_ordered : bool = False) -> pd.DataFrame:
    """
    Returns the DATA_INDEX rows of the restaurants in LOCATION that are most
    similar (by cosine similarity of the cuisine features) to RESTAURANT,
    most similar first.

    1. top_n - maximum number of restaurants to return
    2. exclude_ordered - when True, RESTAURANT itself is left out of the result.
       The default (False) keeps it, so the ordered restaurant is normally
       returned too, since it is maximally similar to itself.

    Raises IndexError when RESTAURANT is not listed in LOCATION.
    """
    is_ordered = (DATA_INDEX['Restaurant Name'] == RESTAURANT) & (DATA_INDEX['Location'] == LOCATION)
    matches = DATA_INDEX[is_ordered].index
    if len(matches) == 0:
        raise IndexError('{} was not found in {}'.format(RESTAURANT, LOCATION))
    # When the same name appears more than once in the location, the first row is used
    restaurant_idx = matches[0]

    # Only the similarities of the restaurants in the area are needed, so read
    # those entries directly instead of scanning the whole similarity row.
    # DATA_INDEX labels are used as row positions (the index was reset earlier).
    similarity_row = RESTAURANT_SIMILARITY[restaurant_idx]
    similarity_in_area = [
        (idx, similarity_row[idx])
        for idx in filter_on_location()
        if not (exclude_ordered and idx == restaurant_idx)
    ]
    # Stable sort: equally similar restaurants keep their original row order
    similarity_in_area.sort(key=lambda pair: pair[1], reverse=True)

    return DATA_INDEX.iloc[[idx for idx, _ in similarity_in_area[:top_n]]]

def randomize_recommendations(top_recommends : pd.DataFrame, recommend : int = 4) -> pd.DataFrame:
    """ 
    Returns up to 'recommend' randomly chosen rows as the final recommendation

    1. top_recommends - top_n recommendation as calculated 
    2. recommend - number of samples to return; when fewer rows are available,
       all of them are returned (in random order)
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at what is available.
    sample_size = min(recommend, len(top_recommends))
    return top_recommends.sample(sample_size)

In [434]:
# Announce what the recommendation is based on
print('Because you ordered from {} in {}'.format(RESTAURANT, LOCATION))

# Most similar restaurants in the area, then a random draw among them
top_recommendations = get_recommendations()
recommendations_randomized = randomize_recommendations(top_recommendations)

# One recommended restaurant name per line
recommended_names = recommendations_randomized['Restaurant Name'].tolist()
print('we recommend \n {}'.format("\n".join(recommended_names)))

Because you ordered from Sangeetha Veg Restaurant in Thiruvarur
we recommend 
 Manikandan Mess
Hotel Saravana Bhavan
Kamatchi Amman Hotel &Sweets
Sangeetha Veg Restaurant


In [432]:
# Dump the fitted vectorizer, the feature vectors and the similarity matrix.
# `pickle` is imported with the other imports at the top of the notebook.
# NOTE(review): DATA_INDEX itself is not pickled here; presumably it is rebuilt
# from models/restaurants.csv - confirm.
artifacts = {
    'models/vectorizer.pkl': vectorizer,
    'models/vectors.pkl': vector_matrix,
    'models/restaurant_sim.pkl': RESTAURANT_SIMILARITY,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)
