# Phase 1: Data Ingestion and Preprocessing

Load the Dataset

In [1]:
import pandas as pd
# Read the raw restaurant table from the working directory into `df`.
# NOTE(review): latin-1 decodes any byte sequence without raising — presumably chosen
# because the file is not valid UTF-8; confirm the file's real encoding.
df = pd.read_csv('zomato.csv', encoding='latin-1')

Clean the Data

In [6]:
# List the column names currently present in `df`.
# NOTE(review): this cell's execution count (6) is higher than the cleaning cell's (5),
# so the recorded output appears to show the columns *after* cleaning — re-run the
# notebook top to bottom to confirm.
print(df.columns.tolist())

['Restaurant Name', 'City', 'Locality', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Price range', 'Aggregate rating', 'Votes']


In [5]:
# Clean the frame as one chained pipeline that is reassigned to `df`, instead of
# three separate in-place mutations. Each step returns a new frame, so the cell
# reads top-down and can be re-run safely.
df = (
    df
    # Drop columns the recommender never reads. errors='ignore' skips names that
    # are already absent, so re-running the cell does not raise.
    .drop(
        columns=[
            'Restaurant ID',
            'Country Code',
            'Address',
            'Locality Verbose',
            'Switch to order menu',
            'Rating color',
            'Rating text',
        ],
        errors='ignore',
    )
    # Remove rows that are exact duplicates across all remaining columns.
    .drop_duplicates()
    # Remove rows missing any of the fields the later cells filter or rank on.
    .dropna(subset=['Cuisines', 'Locality', 'Average Cost for two', 'Aggregate rating'])
)


Normalize Categorical Values

In [8]:
# Canonicalise the cuisine text in a single pass over the column:
# lower-case, trim surrounding whitespace, then collapse known aliases.
# All replacements are literal (regex=False) and applied in the listed order.
df['Cuisines'] = (
    df['Cuisines']
    .str.lower()
    .str.strip()
    .str.replace('chinese food', 'chinese', regex=False)
    .str.replace('italian cuisine', 'italian', regex=False)
    .str.replace('north indian', 'indian', regex=False)
    .str.replace('south indian', 'indian', regex=False)
)


Convert Numeric Fields

In [10]:
# Force the rating column to a numeric dtype; errors='coerce' turns values that
# cannot be parsed into NaN instead of raising.
df['Aggregate rating'] = pd.to_numeric(df['Aggregate rating'], errors='coerce')

# Phase 2: Recommendation Engine

Feature Engineering

In [12]:
def cost_bucket(x, low_below=300, medium_upto=700):
    """Map an average cost for two to a coarse budget label.

    Parameters
    ----------
    x : number or missing value
        Cost to classify.
    low_below : number, default 300
        Costs strictly below this value are 'Low'.
    medium_upto : number, default 700
        Costs from ``low_below`` up to and including this value are 'Medium';
        anything above is 'High'.

    Returns
    -------
    str or None
        'Low', 'Medium' or 'High'; ``None`` when ``x`` is missing (None/NaN).
    """
    # A NaN fails both comparisons below and would fall through to 'High', and
    # None would raise TypeError, so missing costs are handled explicitly.
    if pd.isna(x):
        return None
    if x < low_below:
        return 'Low'
    if x <= medium_upto:
        return 'Medium'
    return 'High'

# Label every row with its budget bucket by calling cost_bucket on each cost value.
df['cost_category'] = df['Average Cost for two'].apply(cost_bucket)


In [14]:
# The primary cuisine is the first comma-separated entry, trimmed of whitespace.
# The vectorised .str accessor replaces the per-row lambda; missing values stay missing.
df['primary_cuisine'] = df['Cuisines'].str.split(',').str[0].str.strip()

Filtering and Ranking

In [15]:
def filter_and_rank(user_cuisine, user_budget, user_location, top_n=10, data=None):
    """Return the best-scoring restaurants matching cuisine, budget and location.

    Parameters
    ----------
    user_cuisine : str
        Text searched for (case-insensitively, as a literal substring) in
        the 'primary_cuisine' column.
    user_budget : str
        Value compared for equality against the 'cost_category' column.
    user_location : str
        Text searched for (case-insensitively, as a literal substring) in
        the 'Locality' column.
    top_n : int, default 10
        Maximum number of rows to return.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Matching rows with an added 'score' column, sorted by score descending.
        The input frame is not modified.
    """
    source = df if data is None else data

    # regex=False: user input is plain text, so characters such as "(" or "+"
    # must be matched literally rather than parsed as a regular expression.
    matches = (
        source['primary_cuisine'].str.contains(user_cuisine.lower(), na=False, regex=False)
        & (source['cost_category'] == user_budget)
        & source['Locality'].str.contains(user_location, case=False, na=False, regex=False)
    )

    # Copy so that adding 'score' never writes into a slice of the source frame.
    filtered = source[matches].copy()

    rating = filtered['Aggregate rating'].fillna(0)
    votes = filtered['Votes'].fillna(0)
    top_votes = votes.max()
    # top_votes is NaN for an empty result and 0 when no match has votes; in both
    # cases dividing would yield NaN scores, so the vote term is dropped instead.
    vote_share = votes / top_votes if top_votes > 0 else 0.0

    # Score = 70% rating + 30% vote count relative to the most-voted match.
    filtered['score'] = rating * 0.7 + vote_share * 0.3

    return filtered.sort_values(by='score', ascending=False).head(top_n)


Explanation Generator

In [16]:
def explain(row):
    """Build a one-line, human-readable reason for a recommendation.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'primary_cuisine', 'Average Cost for two' and
        'Aggregate rating'.

    Returns
    -------
    str
        Sentence naming the matched cuisine, the cost for two and the rating.
    """
    # The keys must match the frame's real column names; the lower-case
    # snake_case variants do not exist in it and raised KeyError.
    cuisine = row['primary_cuisine']
    cost = row['Average Cost for two']
    rating = row['Aggregate rating']
    # NOTE(review): the rupee sign is hard-coded although the frame has a
    # 'Currency' column — confirm all rows are priced in INR.
    return f"Matched on {cuisine} cuisine and ₹{cost} budget with {rating} rating"