In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import joblib

# File paths (bare filenames, so they resolve against the current working directory)
CLEAN = "cleaned_data.csv"        # input: read with pd.read_csv in the load cell
ENCODED = "encoded_data.csv"      # output: the combined encoded DataFrame is written here
ENCODER_FILE = "encoder.pkl"      # output: the fitted OneHotEncoder is dumped here with joblib

In [4]:
# Read the cleaned dataset from disk into the working DataFrame
df = pd.read_csv(CLEAN)

# Report the successful load together with the (rows, columns) shape,
# then preview the first five rows as the cell's rich output
print("✅ Cleaned data loaded successfully!", f"Shape: {df.shape}", sep="\n")
df.head(5)

✅ Cleaned data loaded successfully!
Shape: (148442, 12)


Unnamed: 0,orig_index,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,0,567335,AB FOODS POINT,Abohar,4.0,4808,322,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,1,531342,Janta Sweet House,Abohar,4.4,3061,349,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,2,158203,theka coffee desi,Abohar,3.8,715,409,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,3,187912,Singh Hut,Abohar,3.7,613,225,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,4,543530,GRILL MASTERS,Abohar,4.0,3859,500,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [6]:
# Choose the columns to encode, keeping only those actually present in df.
# Candidate order is preserved, so the resulting lists are deterministic.
cat_features = [col for col in ('city', 'cuisine') if col in df.columns]
num_features = [col for col in ('rating', 'rating_count', 'cost') if col in df.columns]

# Show which columns were picked up for each group
print("Categorical features:", cat_features)
print("Numeric features:", num_features)

Categorical features: ['city', 'cuisine']
Numeric features: ['rating', 'rating_count', 'cost']


In [7]:
# Apply One-Hot Encoding to categorical features.
#
# - sparse_output=False returns a dense array so it can be wrapped in a DataFrame.
# - handle_unknown='ignore' makes a later transform() emit all-zero indicator
#   columns for categories not seen during fit, instead of raising.
# - dtype=np.float32: the dense matrix has one column per distinct category
#   value and one row per input row, so it gets very large. 32-bit floats
#   represent 0.0/1.0 exactly and halve the memory needed compared with the
#   float64 default.
#
# NOTE(review): 'cuisine' values appear to be comma-separated lists
# (e.g. "Beverages,Pizzas"); each distinct combined string becomes its own
# column here. Confirm this is intended rather than per-cuisine indicators.
encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    dtype=np.float32,
)

cat_data = encoder.fit_transform(df[cat_features])

# Create a new DataFrame for encoded categorical columns, reusing df's index
# so it lines up row-for-row with the source data
cat_cols = encoder.get_feature_names_out(cat_features).tolist()
df_cat = pd.DataFrame(cat_data, columns=cat_cols, index=df.index)

print(f"Encoded categorical columns: {len(cat_cols)}")
df_cat.head()

Encoded categorical columns: 2952


Unnamed: 0,"city_Abids & Koti,Hyderabad",city_Abohar,"city_Adajan,Surat",city_Adilabad,city_Adityapur,city_Adoni,"city_Adyar,Chennai",city_Agartala,city_Agra,city_Ahmednagar,...,"cuisine_Vietnamese,Snacks",cuisine_Waffle,"cuisine_Waffle,Bakery","cuisine_Waffle,Beverages","cuisine_Waffle,Burgers","cuisine_Waffle,Chinese","cuisine_Waffle,Desserts","cuisine_Waffle,Fast Food","cuisine_Waffle,Ice Cream","cuisine_Waffle,Snacks"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Numeric features as their own frame; column selection keeps df's row index,
# so it is already aligned with df and df_cat
df_num = df[num_features].copy()

# Place the row identifier, the numeric features and the one-hot columns
# side by side (column-wise concatenation aligned on the shared index)
df_encoded = pd.concat(
    [df[['orig_index']], df_num, df_cat],
    axis="columns",
)

print("✅ Combined encoded DataFrame created successfully!")
print(f"Shape: {df_encoded.shape}")
df_encoded.head()

✅ Combined encoded DataFrame created successfully!
Shape: (148442, 2956)


Unnamed: 0,orig_index,rating,rating_count,cost,"city_Abids & Koti,Hyderabad",city_Abohar,"city_Adajan,Surat",city_Adilabad,city_Adityapur,city_Adoni,...,"cuisine_Vietnamese,Snacks",cuisine_Waffle,"cuisine_Waffle,Bakery","cuisine_Waffle,Beverages","cuisine_Waffle,Burgers","cuisine_Waffle,Chinese","cuisine_Waffle,Desserts","cuisine_Waffle,Fast Food","cuisine_Waffle,Ice Cream","cuisine_Waffle,Snacks"
0,0,4.0,4808,322,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4.4,3061,349,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3.8,715,409,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3.7,613,225,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4.0,3859,500,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Persist both artifacts: the encoded table as CSV (without the index column)
# and the fitted encoder object via joblib
df_encoded.to_csv(ENCODED, index=False)
joblib.dump(encoder, ENCODER_FILE)

# Confirm where each artifact was written (one line per artifact)
print(
    f"✅ Encoded data saved to {ENCODED}",
    f"✅ Encoder object saved to {ENCODER_FILE}",
    sep="\n",
)

✅ Encoded data saved to encoded_data.csv
✅ Encoder object saved to encoder.pkl
