In [1]:
import pandas as pd
# Load the cleaned restaurant dataset and print a first exploratory overview.
df1 = pd.read_csv("cleaned_dataset.csv")

print("Head")
print(df1.head())

print("Tail")
print(df1.tail())

print("DataFrame Info")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df1.info()

print("Missing Value Count")
print(df1.isnull().sum())

print("Unique Value Counts")
print(df1.nunique())

# Descriptive statistics for numerical columns
print("Descriptive Stats(Numerical)")
print(df1.describe())

# Descriptive statistics for categorical (object dtype) columns
print("Descriptive Stats(Categorical)")
print(df1.describe(include=['object']))

# Share of rows per country code, as a percentage rounded to two decimals
print("\nValue Counts for Country Code(Normalized)")
print(df1['Country Code'].value_counts(normalize=True).mul(100).round(2))

print("\nValue Counts for Rating Text")
print(df1['Rating text'].value_counts())

# sort_index() lists the price tiers in ascending tier order instead of by frequency
print("\nValue Counts for Price Range")
print(df1['Price range'].value_counts().sort_index())

# The three service flags get the same treatment, so loop rather than repeat the stanza
service_columns = [
    ("Table Booking", 'Has Table booking'),
    ("Online Delivery", 'Has Online delivery'),
    ("Is Delivering Now", 'Is delivering now'),
]
for service_label, service_col in service_columns:
  print(f"\nService: {service_label} Counts")
  print(df1[service_col].value_counts())


Head
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3    

In [2]:

# Lookup table from the numeric 'Country Code' to a readable country name
country_map = {
    1: 'India',
    14: 'Australia',
    30: 'Brazil',
    37: 'Canada',
    94: 'Indonesia',
    148: 'New Zealand',
    162: 'Philippines',
    166: 'Qatar',
    184: 'Singapore',
    189: 'South Africa',
    191: 'Sri Lanka',
    208: 'Turkey',
    214: 'UAE',
    215: 'United Kingdom',
    216: 'United States',
}

# Derive 'Country Name' from the code; a code missing from the table maps to NaN
country_names = df1['Country Code'].map(country_map)
df1['Country Name'] = country_names

# Percentage of restaurants per country, rounded to two decimals
print("\n--- Value Counts for Country Name (Normalized) ---")
country_share_pct = df1['Country Name'].value_counts(normalize=True) * 100
print(country_share_pct.round(2))


--- Value Counts for Country Name (Normalized) ---
Country Name
India             90.59
United States      4.54
United Kingdom     0.84
Brazil             0.63
South Africa       0.63
UAE                0.63
New Zealand        0.42
Turkey             0.36
Australia          0.25
Philippines        0.23
Indonesia          0.22
Qatar              0.21
Singapore          0.21
Sri Lanka          0.21
Canada             0.04
Name: proportion, dtype: float64


In [3]:
print("All Unique Cuisines")
unique_cuisines = df1['Cuisines'].unique()
n_cuisine_combinations = len(unique_cuisines)

# Show everything when the list is short; otherwise only a 50-item preview plus the total
if n_cuisine_combinations <= 50:
  print(unique_cuisines)
else:
  print(unique_cuisines[:50])
  print(f"Total Unique Cuisine Combinations: {n_cuisine_combinations}")


def _first_listed_cuisine(cuisines):
  """Return the text before the first comma of the stringified cuisine entry."""
  return str(cuisines).partition(',')[0]


# The primary cuisine is the first entry of the comma-separated 'Cuisines' value
df1['Primary Cuisine'] = df1['Cuisines'].map(_first_listed_cuisine)

print(" Primary Cuisines")
primary_cuisine_counts = df1['Primary Cuisine'].value_counts()
print(primary_cuisine_counts)

All Unique Cuisines
['French, Japanese, Desserts' 'Japanese'
 'Seafood, Asian, Filipino, Indian' 'Japanese, Sushi' 'Japanese, Korean'
 'Chinese' 'Asian, European' 'Seafood, Filipino, Asian, European'
 'European, Asian, Indian' 'Filipino' 'Filipino, Mexican'
 'American, Ice Cream, Desserts' 'Korean'
 'Cafe, American, Italian, Filipino' 'Italian, Pizza'
 'Cafe, Korean, Desserts' 'Cafe, Bakery, American, Italian'
 'Seafood, American, Mediterranean, Japanese'
 'American, Asian, Italian, Seafood' 'Fast Food, French' 'Cafe' 'Bakery'
 'Brazilian' 'Pizza' 'Arabian' 'Brazilian, Cafe' 'Italian'
 'Bar Food, Brazilian' 'Mexican, Grill' 'International'
 'Peruvian, Latin American' 'American, Grill' 'Seafood' 'American, Burger'
 'Seafood, Bar Food, Brazilian' 'Desserts, Cafe' 'Juices, Healthy Food'
 'Beverages, Bar Food, Fast Food' 'Lebanese' 'Burger'
 'Brazilian, Bar Food' 'Brazilian, Healthy Food, Juices, Pizza'
 'Bakery, Sandwich, Brazilian' 'Brazilian, Seafood' 'Steak, BBQ' 'Indian'
 'Beverages, 

In [4]:
# Binary Encoding
binary_cols = ['Has Table booking', 'Has Online delivery', 'Is delivering now']
mapping = {'Yes': 1, 'No': 0}

for col in binary_cols:
  # Translate only 'Yes'/'No' and pass every other value through unchanged.
  # A plain .map(mapping) turns the already-encoded 1/0 into NaN when this cell
  # is run a second time, and the fillna(0) below would then silently zero
  # every flag.
  df1[col] = df1[col].map(lambda value: mapping.get(value, value))

# One-Hot Encoding
categorical_cols = ['Price range', 'Rating text', 'Country Name', 'Primary Cuisine']

# NOTE(review): drop_first=True removes the first level of every categorical, so
# rows in that level carry all zeros for the group. That is the usual choice for
# linear models; confirm it is intended for a similarity-based recommender.
df_encoded = pd.get_dummies(df1, columns=categorical_cols, prefix=categorical_cols, drop_first=True)

# Identifier plus the numeric / binary columns that go into the feature frame as-is
cols_to_keep = [
    'Restaurant ID', 'Aggregate rating', 'Votes', 'Average Cost for two',
    'Has Table booking', 'Has Online delivery', 'Is delivering now',
    'Longitude', 'Latitude'
]

# get_dummies names its output "<prefix>_<level>"; match on that exact prefix
# rather than on a substring anywhere in the column name.
dummy_prefixes = tuple(f"{prefix}_" for prefix in categorical_cols)
encoded_cols = [col for col in df_encoded.columns if col.startswith(dummy_prefixes)]

final_cols = cols_to_keep + encoded_cols

df_features = df_encoded[final_cols]

# Cleanup to ensure all columns are numeric: non-numeric entries are coerced to
# NaN and every NaN is then replaced with 0
df_features = df_features.apply(pd.to_numeric, errors='coerce').fillna(0)


In [5]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Everything except the identifier is scaled; 'Restaurant ID' is a label, not a
# feature, so it is held out and re-attached afterwards.
feature_cols = df_features.columns.drop('Restaurant ID')
feature_frame = df_features[feature_cols]

# Learn each column's min/max (fit), then rescale the data with them (transform).
# The result is a plain NumPy array without column names.
scaler = MinMaxScaler().fit(feature_frame)
df_features_scaled_array = scaler.transform(feature_frame)

# Convert the scaled array back into a DataFrame, re-applying the original
# feature names as column headers.
df_features_scaled = pd.DataFrame(df_features_scaled_array, columns=feature_cols)

# Insert the unscaled 'Restaurant ID' column back at the first position.
restaurant_ids = df_features['Restaurant ID'].to_numpy()
df_features_scaled.insert(0, 'Restaurant ID', restaurant_ids)

print(f"Shape: {df_features_scaled.shape}")
print(df_features_scaled.head())

# All columns of 'df_features_scaled' other than the ID are the criteria used for recommendation

Shape: (9551, 150)
   Restaurant ID  Aggregate rating     Votes  Average Cost for two  \
0        6317637          0.979592  0.028718              0.001375   
1        6304287          0.918367  0.054052              0.001500   
2        6300002          0.897959  0.024694              0.005000   
3        6318506          1.000000  0.033382              0.001875   
4        6314302          0.979592  0.020944              0.001875   

   Has Table booking  Has Online delivery  Is delivering now  Longitude  \
0                1.0                  0.0                0.0   0.838318   
1                1.0                  0.0                0.0   0.838278   
2                1.0                  0.0                0.0   0.838406   
3                0.0                  0.0                0.0   0.838405   
4                1.0                  0.0                0.0   0.838408   

   Latitude  Price range_2  ...  Primary Cuisine_Tea  Primary Cuisine_Tex-Mex  \
0  0.574426            0.0  

In [13]:
# Content-Based Filtering and Recommendation System definition
from sklearn.metrics.pairwise import cosine_similarity
# Isolate the feature columns (everything except 'Restaurant ID') as a NumPy
# array, the input form used for the pairwise similarity computation.
content_features = df_features_scaled.drop(columns=['Restaurant ID']).values

# Dense all-pairs similarity: one row and one column per restaurant.
cosine_sim = cosine_similarity(content_features)
print(f"Shape of Similarity Matrix: {cosine_sim.shape}")

# Create index mapping (Restaurant ID --> row position in the matrix).
# The IDs are the index *labels* of this Series, so duplicates have to be removed
# with Index.duplicated(); Series.drop_duplicates() only inspects the values
# (the row positions, which are always unique) and would leave repeated IDs in.
indices = pd.Series(df_features_scaled.index, index=df_features_scaled['Restaurant ID'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(restaurant_id, cosine_sim=cosine_sim, df1=df1, indices=indices, num_recommendations=20):
  """Return the restaurants most similar to the given restaurant.

  Parameters
  ----------
  restaurant_id : value looked up in ``indices`` to find the query row.
  cosine_sim : square similarity matrix; row ``i`` holds the scores of row ``i``
      against every other row.
  df1 : DataFrame the restaurant details are read from, addressed by position
      with the same row order as ``cosine_sim``.
  indices : Series mapping Restaurant ID (index labels) to row position (values).
  num_recommendations : maximum number of rows to return.

  Returns
  -------
  pandas.DataFrame
      'Restaurant ID', 'Restaurant Name', 'Primary Cuisine', 'Aggregate rating',
      'Votes' and 'Price range' of the best matches plus a 'Similarity Score'
      column, ordered from most to least similar.
  str
      An error message when ``restaurant_id`` is not present in ``indices``.
  """
  if restaurant_id not in indices:
    return f"Error: Restaurant ID {restaurant_id} not found."

  # Get the row position of the input restaurant
  idx = indices[restaurant_id]
  # A Restaurant ID that occurs more than once makes the lookup return a Series;
  # fall back to its first position instead of failing further down.
  if hasattr(idx, 'iloc'):
    idx = idx.iloc[0]
  idx = int(idx)

  # Pair every other row with its similarity score. The query row is removed by
  # position: slicing off the first sorted entry is only correct when nothing
  # ties with it, and a tie at the top would drop a different restaurant and
  # recommend the query itself.
  sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
  # Sort by similarity, highest first (stable, so ties keep their row order)
  sim_scores.sort(key=lambda pair: pair[1], reverse=True)
  # Keep the top N; a negative N yields no rows rather than an odd slice
  sim_scores = sim_scores[:max(num_recommendations, 0)]

  # Split into parallel lists of row positions and scores
  restaurant_indices = [pair[0] for pair in sim_scores]
  similarity_scores = [pair[1] for pair in sim_scores]

  # Retrieve restaurant details from the original dataframe 'df1'
  detail_cols = ['Restaurant ID', 'Restaurant Name', 'Primary Cuisine', 'Aggregate rating', 'Votes', 'Price range']
  recommendations = df1.iloc[restaurant_indices][detail_cols].copy()
  recommendations['Similarity Score'] = similarity_scores

  return recommendations


Shape of Similarity Matrix: (9551, 9551)


In [14]:
# Testing the Recommendation System

def _top_row(frame, by, description):
  """Return the first row of ``frame`` after sorting by ``by`` in descending order.

  Raises IndexError with a readable message when ``frame`` has no rows, instead
  of the bare out-of-bounds error from ``.iloc[0]``.
  """
  if frame.empty:
    raise IndexError(f"No restaurants match the criteria for {description}.")
  return frame.sort_values(by=by, ascending=False).iloc[0]


def _run_recommendation_test(label, intro, restaurant_name, restaurant_id):
  """Print the recommendations for one test candidate and return them."""
  print(f"\nRunning test for {intro}: {restaurant_name} (ID: {restaurant_id})")
  recommendations = get_recommendations(restaurant_id)
  print(f"\n--- Top 20 Recommendations for Candidate {label}(Content-Based) ---")
  if isinstance(recommendations, str):
    # get_recommendations reports an unknown ID as a message string, not a frame
    print(recommendations)
  else:
    print(recommendations.to_markdown(index=False))
  return recommendations


# Candidate A :- High-rated, overall popular restaurant (best rating, ties broken by votes)
candidate_A_data = _top_row(df1, ['Aggregate rating', 'Votes'], "Candidate A")
candidate_A_id = candidate_A_data['Restaurant ID']
candidate_A_name = candidate_A_data['Restaurant Name']
candidate_A_cuisine = candidate_A_data['Primary Cuisine']

# Candidate B :- Indian restaurant with high votes and good rating
indian_restaurants = df1[(df1['Primary Cuisine'] == 'Indian') & (df1['Aggregate rating'] > 4.0)]
candidate_B_data = _top_row(indian_restaurants, 'Votes', "Candidate B")
candidate_B_id = candidate_B_data['Restaurant ID']
candidate_B_name = candidate_B_data['Restaurant Name']
candidate_B_cuisine = candidate_B_data['Primary Cuisine']

# Candidate C :- European restaurant to test a different market
european_restaurants = df1[(df1['Primary Cuisine'] == 'European') & (df1['Aggregate rating'] > 4.0)]
candidate_C_data = _top_row(european_restaurants, 'Votes', "Candidate C")
candidate_C_id = candidate_C_data['Restaurant ID']
candidate_C_name = candidate_C_data['Restaurant Name']
candidate_C_cuisine = candidate_C_data['Primary Cuisine']

# Candidate D :- rating >= 4.5, price range 4, table booking available, and one
# of four selected primary cuisines
complex_filter = (
    (df1['Aggregate rating'] >= 4.5) &
    (df1['Price range'] == 4) &
    (df1['Has Table booking'] == 1) &
    (df1['Primary Cuisine'].isin(['Italian', 'European', 'International', 'French']))
)
candidate_D_restaurants = df1[complex_filter]
# Sort by Votes to pick the most established restaurant within this strict criteria
candidate_D_data = _top_row(candidate_D_restaurants, 'Votes', "Candidate D")
candidate_D_id = candidate_D_data['Restaurant ID']
candidate_D_name = candidate_D_data['Restaurant Name']
candidate_D_cuisine = candidate_D_data['Primary Cuisine']
candidate_D_price = candidate_D_data['Price range']

print("\n----Test Candidates for Restaurant Recommendation----")
print(f"1. Overall Popular: {candidate_A_name} (ID: {candidate_A_id}, Cuisine: {candidate_A_cuisine})")
print(f"2. Indian Cuisine: {candidate_B_name} (ID: {candidate_B_id}, Cuisine: {candidate_B_cuisine})")
print(f"3. European Cuisine: {candidate_C_name} (ID: {candidate_C_id}, Cuisine: {candidate_C_cuisine})")
print(f"4. Complex High-End: {candidate_D_name} (ID: {candidate_D_id}, Cuisine: {candidate_D_cuisine}, Price: {candidate_D_price})")
print("-------------------------------------------------------")

# Run the recommender once per candidate; the result variables keep their
# original names so any later cell that reads them still works.
test_recommedations_A = _run_recommendation_test("A", "Candidate A", candidate_A_name, candidate_A_id)
test_recommedations_B = _run_recommendation_test("B", "Candidate B", candidate_B_name, candidate_B_id)
test_recommedations_C = _run_recommendation_test("C", "Candidate C", candidate_C_name, candidate_C_id)
test_recommendations = _run_recommendation_test("D", "Complex Candidate D", candidate_D_name, candidate_D_id)


----Test Candidates for Restaurant Recommendation----
1. Overall Popular: Barbeque Nation (ID: 20842, Cuisine: North Indian)
2. Indian Cuisine: AB's Absolute Barbecues (ID: 208939, Cuisine: Indian)
3. European Cuisine: AB's - Absolute Barbecues (ID: 56618, Cuisine: European)
4. Complex High-End: Spiral - Sofitel Philippine Plaza Manila (ID: 6300010, Cuisine: European, Price: 4)
-------------------------------------------------------

Running test for Candidate A: Barbeque Nation (ID: 20842)

--- Top 20 Recommendations for Candidate A(Content-Based) ---
|   Restaurant ID | Restaurant Name            | Primary Cuisine   |   Aggregate rating |   Votes |   Price range |   Similarity Score |
|----------------:|:---------------------------|:------------------|-------------------:|--------:|--------------:|-------------------:|
|        17806994 | Mirchi And Mime            | North Indian      |                4.9 |    3244 |             3 |           0.994997 |
|           11807 | Barbeque 