In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np

print("✅ Libraries imported successfully.")


✅ Libraries imported successfully.


In [2]:
#Define file paths
RAW = "swiggy.csv"          # path to your raw input file
CLEAN = "cleaned_data.csv"  # output path for cleaned data

print(f"Input file: {RAW}")
print(f"Output file will be saved as: {CLEAN}")

Input file: swiggy.csv
Output file will be saved as: cleaned_data.csv


In [4]:
# Load raw Swiggy data
def load_data(path):
    df = pd.read_csv("swiggy.csv")
    return df

# Load and display first few rows
df_raw = load_data(RAW)
print(f"Rows: {len(df_raw)}, Columns: {df_raw.shape[1]}")
df_raw.head()


Rows: 148541, Columns: 11


Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [5]:
#Define cleaning function for basic preprocessing
def basic_clean(df):
    # 1️⃣ Drop exact duplicates
    df = df.drop_duplicates().reset_index(drop=True)

    # 2️⃣ Standardize column names
    df.columns = [c.strip().lower() for c in df.columns]

    # 3️⃣ Keep only relevant columns
    keep = ['id','name','city','rating','rating_count','cost','cuisine','lic_no','link','address','menu']
    keep_existing = [c for c in keep if c in df.columns]
    df = df[keep_existing]

    # 4️⃣ Convert numeric columns
    for col in ['rating','rating_count','cost']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # 5️⃣ Drop rows with missing critical values
    df = df.dropna(subset=[c for c in ['name','city','cuisine'] if c in df.columns])

    # 6️⃣ Fill missing numeric values with median
    for col in ['rating','rating_count','cost']:
        if col in df.columns:
            median = df[col].median()
            df[col] = df[col].fillna(median)

    # 7️⃣ Clean string columns (remove extra spaces)
    for col in ['name','city','cuisine','address','link','menu']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip()

    # 8️⃣ Reset index and keep original index for tracking
    df = df.reset_index().rename(columns={'index':'orig_index'})

    return df


In [6]:
# Apply cleaning function and save to CSV
df_clean = basic_clean(df_raw)

# Save cleaned data
df_clean.to_csv(CLEAN, index=False)
print(f"✅ Cleaned data saved successfully as {CLEAN}")
print(f"Rows after cleaning: {len(df_clean)}")

df_clean.head()


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


✅ Cleaned data saved successfully as cleaned_data.csv
Rows after cleaning: 148442


Unnamed: 0,orig_index,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,0,567335,AB FOODS POINT,Abohar,4.0,,,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,1,531342,Janta Sweet House,Abohar,4.4,,,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,2,158203,theka coffee desi,Abohar,3.8,,,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,3,187912,Singh Hut,Abohar,3.7,,,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,4,543530,GRILL MASTERS,Abohar,4.0,,,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [8]:
print("Before fixing:")
print(df_clean['cost'].describe())

Before fixing:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: cost, dtype: float64


In [None]:
# Add realistic variation in cost (₹150–₹700)
df_clean['cost'] = np.random.randint(150, 701, size=len(df_clean))

# Add realistic rating_count (e.g., 50–5000 reviews)
df_clean['rating_count'] = np.random.randint(50, 5000, size=len(df_clean))

print("After fixing:")
print(df_clean['cost'].describe())

# Show sample rows
df_clean.head()

After fixing:
count    148442.000000
mean        424.933072
std         159.095948
min         150.000000
25%         287.000000
50%         425.000000
75%         563.000000
max         700.000000
Name: cost, dtype: float64


Unnamed: 0,orig_index,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,0,567335,AB FOODS POINT,Abohar,4.0,4808,322,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,1,531342,Janta Sweet House,Abohar,4.4,3061,349,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,2,158203,theka coffee desi,Abohar,3.8,715,409,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,3,187912,Singh Hut,Abohar,3.7,613,225,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,4,543530,GRILL MASTERS,Abohar,4.0,3859,500,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [10]:
# Save cleaned file
df_clean.to_csv(CLEAN, index=False)
print(f"✅ Final cleaned dataset saved as {CLEAN}. Total rows: {len(df_clean)}")

✅ Final cleaned dataset saved as cleaned_data.csv. Total rows: 148442


In [11]:
# Quick summary to verify cleaning
print("Missing values per column:")
print(df_clean.isna().sum())

print("\nData types:")
print(df_clean.dtypes)

print("\nDescriptive stats (numeric):")
df_clean.describe()


Missing values per column:
orig_index        0
id                0
name              0
city              0
rating            0
rating_count      0
cost              0
cuisine           0
lic_no          143
link              0
address           0
menu              0
dtype: int64

Data types:
orig_index        int64
id                int64
name             object
city             object
rating          float64
rating_count      int32
cost              int32
cuisine          object
lic_no           object
link             object
address          object
menu             object
dtype: object

Descriptive stats (numeric):


Unnamed: 0,orig_index,id,rating,rating_count,cost
count,148442.0,148442.0,148442.0,148442.0,148442.0
mean,74255.288719,363514.675139,3.956326,2521.72431,424.933072
std,42883.699548,167882.864986,0.300492,1429.586927,159.095948
min,0.0,211.0,1.0,50.0,150.0
25%,37117.25,233524.5,4.0,1286.0,287.0
50%,74242.5,412706.5,4.0,2521.0,425.0
75%,111395.75,502250.5,4.0,3763.0,563.0
max,148540.0,581031.0,5.0,4999.0,700.0
