# <span style='color:DodgerBlue'> Task2: Restaurant Recommendation </span>

### <span style='color:Orange'> Objective: </span> Create a restaurant recommendation system based on user preferences.

### Import the libraries and packages

In [None]:
## basic import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## preprocessing import
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
## Load the restaurant dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Dataset.csv')
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
df.shape

(9551, 21)

#### checking null values

In [4]:
df.isna().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [5]:
## Only `Cuisines` has missing values (9 rows).
## Option 1: drop those rows (there are very few of them).
## Option 2 (used here): fill each gap with the most frequent cuisine of the same city,
##           falling back to 'Unknown' when the city has no known cuisine at all.

def _fill_missing_with_city_mode(city_cuisines):
    """Fill the NaNs of one city's cuisine values with that city's most frequent value."""
    modes = city_cuisines.mode()
    replacement = 'Unknown' if modes.empty else modes[0]
    return city_cuisines.fillna(replacement)

df['Cuisines'] = df.groupby('City')['Cuisines'].transform(_fill_missing_with_city_mode)


#### check duplicated rows/records

In [6]:
df.duplicated().sum()

0

#### check datatypes of all the columns

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9551 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

#### drop irrelevant columns

In [8]:
## remove the columns that no later cell of this notebook reads
df = df.drop(
    columns=[
        'Restaurant ID',
        'Locality',
        'Locality Verbose',
        'Currency',
        'Switch to order menu',
        'Rating color',
        'Rating text',
        'Is delivering now',
        'Price range',
    ]
)

In [9]:
## encode the two Yes/No columns as integers {Yes: 1, No: 0}
yes_no_to_int = {'Yes': 1, 'No': 0}

for binary_column in ('Has Table booking', 'Has Online delivery'):
    df[binary_column] = df[binary_column].map(yes_no_to_int)

In [10]:
df.head()

Unnamed: 0,Restaurant Name,Country Code,City,Address,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Aggregate rating,Votes
0,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...",121.027535,14.565443,"French, Japanese, Desserts",1100,1,0,4.8,314
1,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...",121.014101,14.553708,Japanese,1200,1,0,4.5,591
2,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,1,0,4.4,270
3,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...",121.056475,14.585318,"Japanese, Sushi",1500,0,0,4.9,365
4,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...",121.057508,14.58445,"Japanese, Korean",1500,1,0,4.8,229


#### check Datatypes of columns

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant Name       9551 non-null   object 
 1   Country Code          9551 non-null   int64  
 2   City                  9551 non-null   object 
 3   Address               9551 non-null   object 
 4   Longitude             9551 non-null   float64
 5   Latitude              9551 non-null   float64
 6   Cuisines              9551 non-null   object 
 7   Average Cost for two  9551 non-null   int64  
 8   Has Table booking     9551 non-null   int64  
 9   Has Online delivery   9551 non-null   int64  
 10  Aggregate rating      9551 non-null   float64
 11  Votes                 9551 non-null   int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 895.5+ KB


#### Map each cuisine to a general cuisine category

In [12]:
def map_cuisine_to_category(cuisine):
    """Return the general category name for a single cuisine label.

    The label is compared exactly against the lower-case entries below, so it
    must already be stripped and lower-cased. Groups are checked from top to
    bottom and the first group containing the label wins; a label listed in
    two groups (e.g. "mithai") therefore resolves to the earlier one. A label
    found in no group maps to 'other'.
    """
    # (category name returned, labels belonging to it) -- order is significant
    category_groups = (
        ('indian', (
            "curry", "south indian", "andhra", "maharashtrian", "north indian", "naga",
            "gujarati", "kashmiri", "hyderabadi", "bengali", "goan", "awadhi", "rajasthani",
            "chettinad", "mithai", "lucknowi", "mangalorean", "malwani", "indian", "biryani",
            "oriya", "modern indian", "bihari", "assamese", "north eastern", "healthy food",
            "vegetarian", "indian",
        )),
        ('east_asian', (
            "chinese", "japanese", "korean", "mongolian", "hong kong cuisine", "cantonese",
            "ramen", "teriyaki", "taiwanese",
        )),
        ('southeast_asian', (
            "thai", "vietnamese", "singaporean", "filipino", "indonesian", "burmese",
            "peranakan", "malaysian", "laotian", "cambodian", "sri lankan",
        )),
        ('south_asian', (
            "nepalese", "afghan", "pakistani", "maldivian", "thakali",
        )),
        ('central_asian', (
            "kazakh", "uzbek", "turkmen", "manti", "shashlik",
        )),
        ('middle_eastern', (
            "armenian", "iranian", "persian", "turkish", "lebanese", "arabian",
            "yemeni", "palestinian", "israeli", "syrian", "iraqi", "jordanian",
            "borek", "kebab", "izgara",
        )),
        ('western_european', (
            "french", "italian", "spanish", "greek", "portuguese", "german", "belgian",
            "irish", "british", "scottish", "mediterranean", "continental", "european",
            "western", "austrian", "swiss", "dutch",
        )),
        ('eastern_european', (
            "polish", "hungarian", "czech",
        )),
        ('northern_european', (
            "danish", "swedish", "finnish",
        )),
        ('north_american', (
            "tex-mex", "american", "diner", "steak", "pub food", "charcoal grill",
            "new american", "bar food", "deli", "canadian", "cajun", "hawaiian",
        )),
        ('latin_american', (
            "mexican", "brazilian", "argentine", "peruvian", "cuban", "latin american",
            "mineira", "southern", "caribbean", "soul food", "chilean", "colombian",
            "venezuelan", "ecuadorian", "uruguayan",
        )),
        ('african', (
            "south african", "durban", "ethiopian", "nigerian", "ghanaian", "kenyan",
        )),
        ('north_african', (
            "moroccan", "algerian", "tunisian", "egyptian", "libyan", "sudanese",
        )),
        ('australian_and_pacific', (
            "australian", "modern australian", "kiwi", "new zealand", "fijian",
            "polynesian", "papua new guinean",
        )),
        ('international_and_fusion_cuisines', (
            "fusion", "international", "contemporary", "raw meats", "world cuisine",
            "cuisine varies", "street food",
        )),
        ('desserts_and_bakery', (
            "desserts", "ice cream", "bakery", "patisserie", "mithai",
        )),
        ('seafood', (
            "seafood", "fish and chips", "sushi",
        )),
        ('beverages', (
            "tea", "coffee and tea", "juices", "beverages", "drinks only", "bubble tea",
            "smoothies", "shakes",
        )),
        ('fastfood', (
            "pizza", "burger", "fast food", "sandwich", "turkish pizza", "breakfast",
            "restaurant cafe", "grill", "wraps", "hotdogs", "gourmet fast food",
            "finger food",
        )),
    )

    for category, members in category_groups:
        if cuisine in members:
            return category

    return 'other'

In [None]:
def preprocess_cuisines(row):
    """Convert a comma-separated cuisine string into generalized categories.

    Each piece is stripped, lower-cased and mapped with
    `map_cuisine_to_category`. Duplicate categories are removed while keeping
    first-seen order, so the same input always yields the same string
    (joining a `set` of strings gave an order that could differ between
    interpreter sessions).

    Parameters
    ----------
    row : str
        Cuisine names separated by commas, e.g. "French, Japanese, Desserts".

    Returns
    -------
    str
        The distinct categories joined by ', '.
    """
    # split on the bare comma and strip, so "a,b" and "a, b" are treated alike
    cuisines = [cuisine.strip().lower() for cuisine in row.split(',')]
    generalized_cuisines = [map_cuisine_to_category(cuisine) for cuisine in cuisines]
    # dict.fromkeys de-duplicates and preserves insertion order
    return ', '.join(dict.fromkeys(generalized_cuisines))

## replace the raw `Cuisines` text with its generalized categories
df['Cuisine_varieties'] = df['Cuisines'].map(preprocess_cuisines)
df = df.drop(columns='Cuisines')
df.head()

Unnamed: 0,Restaurant Name,Country Code,City,Address,Longitude,Latitude,Average Cost for two,Has Table booking,Has Online delivery,Aggregate rating,Votes,Cuisine_varieties
0,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...",121.027535,14.565443,1100,1,0,4.8,314,"western_european, east_asian, desserts_and_bakery"
1,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...",121.014101,14.553708,1200,1,0,4.5,591,east_asian
2,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...",121.056831,14.581404,4000,1,0,4.4,270,"other, southeast_asian, seafood, indian"
3,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...",121.056475,14.585318,1500,0,0,4.9,365,"east_asian, seafood"
4,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...",121.057508,14.58445,1500,1,0,4.8,229,east_asian


#### OneHot Encoding for Cuisine_varieties

In [None]:
mlb = MultiLabelBinarizer()

def one_hot_encoded_cuisine(df, cuisine_column, save_path="OneHot_Encoder(mlb).pkl", is_training=True):
    """Replace a comma-separated category column with one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cuisine_column : str
        Name of the column holding comma-separated category names.
    save_path : str
        Pickle file of the encoder: written when `is_training` is True,
        read otherwise.
    is_training : bool
        True  -> fit the module-level `mlb` on this data and save it.
        False -> load the encoder from `save_path` (rebinding the module-level
                 `mlb`) and only transform.

    Returns
    -------
    pandas.DataFrame
        A new frame without `cuisine_column`, followed by one indicator column
        per encoder class.

    Raises
    ------
    FileNotFoundError
        If `is_training` is False and `save_path` does not exist.
    """
    global mlb

    # Keep the label lists in a local Series instead of writing a temporary
    # column into the caller's frame (which leaked if the file open failed).
    cuisine_lists = df[cuisine_column].apply(lambda x: [c.strip() for c in x.split(',')])

    if is_training:
        one_hot = mlb.fit_transform(cuisine_lists)
        with open(save_path, "wb") as f:
            pickle.dump(mlb, f)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load encoder files you created yourself.
        with open(save_path, "rb") as f:
            mlb = pickle.load(f)

        one_hot = mlb.transform(cuisine_lists)

    one_hot_df = pd.DataFrame(one_hot, columns=mlb.classes_, index=df.index)

    return pd.concat([df.drop(columns=[cuisine_column]), one_hot_df], axis=1)


In [15]:
## Reuse the MultiLabelBinarizer that was fitted and saved in project 1 instead of
## fitting a new one: with is_training=False the encoder is loaded from
## "OneHot_Encoder(mlb).pkl", so that file must already exist in the working directory.

df = one_hot_encoded_cuisine(df, 'Cuisine_varieties', is_training=False)
df.head()

Unnamed: 0,Restaurant Name,Country Code,City,Address,Longitude,Latitude,Average Cost for two,Has Table booking,Has Online delivery,Aggregate rating,...,international_and_fusion_cuisines,latin_american,middle_eastern,north_african,north_american,other,seafood,south_asian,southeast_asian,western_european
0,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...",121.027535,14.565443,1100,1,0,4.8,...,0,0,0,0,0,0,0,0,0,1
1,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...",121.014101,14.553708,1200,1,0,4.5,...,0,0,0,0,0,0,0,0,0,0
2,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...",121.056831,14.581404,4000,1,0,4.4,...,0,0,0,0,0,1,1,0,1,0
3,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...",121.056475,14.585318,1500,0,0,4.9,...,0,0,0,0,0,0,1,0,0,0
4,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...",121.057508,14.58445,1500,1,0,4.8,...,0,0,0,0,0,0,0,0,0,0


#### Standardize numerical features: 'Average Cost for two', 'Aggregate rating', 'Votes'

In [None]:
## standardize the numeric columns; `scaler` stays fitted so the same scaling
## can be applied to (and undone for) other data later
columns_to_scale = ['Average Cost for two', 'Aggregate rating', 'Votes']

scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.head()

Unnamed: 0,Restaurant Name,Country Code,City,Address,Longitude,Latitude,Average Cost for two,Has Table booking,Has Online delivery,Aggregate rating,...,international_and_fusion_cuisines,latin_american,middle_eastern,north_african,north_american,other,seafood,south_asian,southeast_asian,western_european
0,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...",121.027535,14.565443,-0.006154,1,0,1.407131,...,0,0,0,0,0,0,0,0,0,1
1,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...",121.014101,14.553708,4.9e-05,1,0,1.209281,...,0,0,0,0,0,0,0,0,0,0
2,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...",121.056831,14.581404,0.173743,1,0,1.143331,...,0,0,0,0,0,1,1,0,1,0
3,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...",121.056475,14.585318,0.018659,0,0,1.473081,...,0,0,0,0,0,0,1,0,0,0
4,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...",121.057508,14.58445,0.018659,1,0,1.407131,...,0,0,0,0,0,0,0,0,0,0


In [17]:
## persist the fitted scaler to its own pickle file
with open('Scaler(AggRat_votes_avgCost42).pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [18]:
df.columns

Index(['Restaurant Name', 'Country Code', 'City', 'Address', 'Longitude',
       'Latitude', 'Average Cost for two', 'Has Table booking',
       'Has Online delivery', 'Aggregate rating', 'Votes', 'african',
       'australian_and_pacific', 'beverages', 'desserts_and_bakery',
       'east_asian', 'fastfood', 'indian', 'international_and_fusion_cuisines',
       'latin_american', 'middle_eastern', 'north_african', 'north_american',
       'other', 'seafood', 'south_asian', 'southeast_asian',
       'western_european'],
      dtype='object')

## Content Based Filtering

#### Data Preparation

In [19]:
relevant_features = [
    # scaled numeric columns
    'Average Cost for two', 'Aggregate rating', 'Votes',
    # one-hot cuisine category columns
    'african', 'australian_and_pacific', 'beverages', 'desserts_and_bakery',
    'east_asian', 'fastfood', 'indian', 'international_and_fusion_cuisines',
    'latin_american', 'middle_eastern', 'north_african', 'north_american',
    'other', 'seafood', 'south_asian', 'southeast_asian', 'western_european',
]

## one row per restaurant, one column per feature listed above
feature_matrix = df.loc[:, relevant_features]

feature_matrix

Unnamed: 0,Average Cost for two,Aggregate rating,Votes,african,australian_and_pacific,beverages,desserts_and_bakery,east_asian,fastfood,indian,international_and_fusion_cuisines,latin_american,middle_eastern,north_african,north_american,other,seafood,south_asian,southeast_asian,western_european
0,-0.006154,1.407131,0.365202,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0.000049,1.209281,1.009168,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0.173743,1.143331,0.262911,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0
3,0.018659,1.473081,0.483766,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
4,0.018659,1.407131,0.167595,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9546,-0.069428,0.945480,1.467151,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9547,-0.067878,1.011430,2.039049,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
9548,-0.063845,0.681680,1.171903,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
9549,-0.066947,0.879530,1.729853,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


## Build the Recommendation Model

In [None]:
## compute cosine similarity
## Called with a single argument, this scores every row of `feature_matrix`
## against every other row, giving a dense n_rows x n_rows array.
## NOTE(review): `recommend_restaurants` below computes its own user-vs-restaurant
## similarity and does not read `cosine_sim`; confirm this matrix is still needed,
## since it is also written into the model pickle.
cosine_sim = cosine_similarity(feature_matrix)

### Save the Recommendation Model

In [None]:
## bundle the fitted scaler, the similarity matrix and the processed frame in one pickle
with open('Restaurant_Recommendation_Model.pkl', 'wb') as model_file:
    pickle.dump((scaler, cosine_sim, df), model_file)

### Testing Via Single Instance

In [None]:
## load the model bundle; it unpacks as (scaler, cosine_sim, df), the order it was saved in
## NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('Restaurant_Recommendation_Model.pkl', 'rb') as model_file:
    scaler, cosine_sim, df = pickle.load(model_file)

In [36]:
def one_hot_encode_user_cuisines(user_cuisines, mlb_path="OneHot_Encoder(mlb).pkl"):
    """Encode a comma-separated string of cuisine categories with the saved encoder.

    Parameters
    ----------
    user_cuisines : str
        Category names separated by commas, e.g. "indian, fastfood". Each name
        is stripped and lower-cased; empty pieces are skipped.
    mlb_path : str
        Path of the pickled encoder. The loaded object must provide
        `transform`, which is called with a one-element list of label lists.

    Returns
    -------
    The first (only) row of the encoder's `transform` output.
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(mlb_path, "rb") as f:
        mlb = pickle.load(f)
    # Split on the bare comma: with an exact ', ' split, "a,b" became one unknown
    # label and was dropped by the encoder instead of being read as two labels.
    pieces = [piece.strip().lower() for piece in user_cuisines.split(',')]
    cuisine_list = [label for label in pieces if label]
    encoded = mlb.transform([cuisine_list])
    return encoded[0]

def recommend_restaurants(preferences, top_n=5):
    """Return the `top_n` restaurants most similar to the given preferences.

    Reads the notebook-level objects `scaler`, `df` and `feature_matrix`, which
    must be defined before calling. The user vector is built positionally as
    the three scaled numeric values followed by the encoded cuisines, so
    `feature_matrix` must use that same column order.

    Parameters
    ----------
    preferences : dict or pandas.DataFrame
        Must provide 'cuisines' (comma-separated string), 'Average Cost for two',
        'Aggregate rating' and 'Votes' in original (unscaled) units. For a
        DataFrame only the first row is used.
    top_n : int
        Number of rows to return; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Name, city, coordinates and the three numeric columns (converted back
        to original units) of the best matches, most similar first.
    """
    numeric_columns = ['Average Cost for two', 'Aggregate rating', 'Votes']

    if isinstance(preferences, pd.DataFrame):
        # Take cuisines and numbers from the same (first) record; extra rows
        # used to produce NaNs when the two parts were combined.
        cuisines = preferences['cuisines'].iloc[0]
        numerical_data = preferences[numeric_columns].iloc[[0]]
    else:
        cuisines = preferences['cuisines']
        numerical_data = pd.DataFrame([{column: preferences[column] for column in numeric_columns}])

    ## Preprocess cuisines
    generalize_cuisines = preprocess_cuisines(cuisines)
    encoded_cuisine = one_hot_encode_user_cuisines(generalize_cuisines)

    ## Scale numerical data
    scaled_numerical_data = scaler.transform(numerical_data)

    ## Combine scaled numerical data and encoded cuisines into one row vector
    user_vector = np.concatenate([scaled_numerical_data[0], encoded_cuisine]).reshape(1, -1)

    ## Compute similarity scores
    similarity_scores = cosine_similarity(user_vector, feature_matrix)[0]

    ## Get top N recommendations
    # Reverse first, then slice: the old `[-top_n:]` selected every row for top_n == 0.
    ranked_positions = similarity_scores.argsort()[::-1]
    top_positions = ranked_positions[:max(top_n, 0)]

    # Work on a copy so the assignment below never writes into a slice of `df`.
    recommendations = df.iloc[top_positions].copy()

    if len(recommendations):
        # the scaler rejects empty input, so only convert back when rows exist
        recommendations[numeric_columns] = scaler.inverse_transform(recommendations[numeric_columns])

    return recommendations[['Restaurant Name', 'City', 'Longitude', 'Latitude', 'Average Cost for two', 'Aggregate rating', 'Votes']]



In [43]:
## test the model
## Example preferences: the numeric values are in the dataset's original (unscaled)
## units, and 'cuisines' is a comma-separated string of raw cuisine names.
user_preferences = {
    "Average Cost for two":300,
    "Aggregate rating": 4.2,
    "Votes":100,
    "cuisines":"Desserts, Burger, tea"
}

recommendation = recommend_restaurants(pd.DataFrame([user_preferences]))
print(recommendation)

             Restaurant Name        City  Longitude   Latitude  \
4496              Frozen Pan   New Delhi  77.207386  28.680921   
8426            Baker Street       Noida  77.322137  28.573020   
799             Super Donuts  Chandigarh  76.760283  30.721380   
4914  Chawla's The Cake Room   New Delhi  77.279485  28.639850   
3613                Himcream   New Delhi  77.274980  28.650927   

      Average Cost for two  Aggregate rating  Votes  
4496                 250.0               3.9   72.0  
8426                 300.0               3.6   40.0  
799                  450.0               4.0  265.0  
4914                 250.0               3.4   17.0  
3613                 300.0               3.3   17.0  
