Preprocessing of the Zomato Bengaluru dataset

# Importing Libraries


In [4]:
import pandas as pd
import numpy as np

In [6]:
# Load the raw Zomato CSV into `data`.
# The location can be overridden with the ZOMATO_CSV environment variable so the
# notebook is not tied to one machine; the original path remains the default.
import os

DATA_PATH = os.environ.get("ZOMATO_CSV", "C:/Users/USER/Desktop/zomato.csv")
data = pd.read_csv(DATA_PATH)

In [8]:
# Names of the columns that are removed from the frame before any cleaning.
drop_col = [
    'url',
    'address',
    'phone',
    'dish_liked',
    'menu_item',
    'reviews_list',
    'listed_in(city)',
    'restaurant_id',
    'name',
]

In [12]:
# Remove the columns listed in `drop_col` from `data`, in place.
# errors='ignore' means names that are not present in the frame are skipped
# instead of raising.
data.drop(columns=drop_col, errors='ignore', inplace=True)

In [14]:
# Give the two awkwardly named columns shorter names (applied in place).
column_aliases = {
    'approx_cost(for two people)': 'cost_for_two',
    'listed_in(type)': 'type',
}
data.rename(columns=column_aliases, inplace=True)

In [22]:
# Discard, in place, every row that has a missing value in any of these columns.
required_cols = ['rate', 'location', 'cuisines', 'cost_for_two']
data.dropna(subset=required_cols, inplace=True)

In [24]:
# Remove fully duplicated rows in place, keeping the first occurrence of each
# (keep='first' is the pandas default, spelled out here for the reader).
data.drop_duplicates(keep='first', inplace=True)


In [26]:
# Clean the "rate" column: strip the "/5" suffix and convert to float.
# Going through astype(str) + pd.to_numeric(errors='coerce') makes the cell safe
# to re-run (the .str accessor would fail once the column is already float) and
# turns every non-numeric placeholder such as 'NEW' or '-' into NaN, including
# variants with stray whitespace.
rate_text = (
    data['rate']
    .astype(str)
    .str.replace('/5', '', regex=False)
    .str.strip()
)
data['rate'] = pd.to_numeric(rate_text, errors='coerce')

In [30]:
# Turn "cost_for_two" into a numeric column: drop the thousands separators from
# the text form, then parse; anything that still is not a number becomes NaN.
cost_text = data['cost_for_two'].astype(str).str.replace(',', '')
data['cost_for_two'] = pd.to_numeric(cost_text, errors='coerce')

In [32]:
# After the numeric conversions above, remove (in place) the rows whose rating
# or cost is NaN.
numeric_required_cols = ['rate', 'cost_for_two']
data.dropna(subset=numeric_required_cols, inplace=True)

In [34]:
# Normalize the text columns: trim surrounding whitespace and lowercase.
data['location'] = data['location'].str.strip().str.lower()

# These columns are cast to str first, so a missing value becomes the text 'nan'.
for text_col in ('rest_type', 'cuisines', 'type'):
    data[text_col] = data[text_col].astype(str).str.strip().str.lower()

# Encode the Yes/No flags as 1/0; any other value maps to NaN.
yes_no_to_int = {'Yes': 1, 'No': 0}
for flag_col in ('online_order', 'book_table'):
    data[flag_col] = data[flag_col].map(yes_no_to_int)

In [38]:
# Encode categorical variables as integer codes.
from sklearn.preprocessing import LabelEncoder

label_enc_cols = ['location', 'rest_type', 'cuisines', 'type']

# One fitted encoder per column is kept in `label_encoders`, so the codes can be
# inverted or re-applied to new data later. Reusing a single encoder for every
# column would overwrite its fitted classes on each iteration.
label_encoders = {}
for col in label_enc_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('type').

In [40]:
from sklearn.preprocessing import StandardScaler

# Add a standardized copy of the cost column next to the raw one.
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before the
# train/test split performed further down — confirm this is acceptable.
scaler = StandardScaler()
cost_frame = data[['cost_for_two']]
data['cost_scaled'] = scaler.fit_transform(cost_frame)

# Snapshot of the fully processed frame.
data_cleaned = data.copy()

In [42]:
# Inspect the processed frame; as the last expression in the cell it is rendered
# as the cell output.
data

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost_for_two,type,cost_scaled
0,1,1,4.1,775,0,24,1901,800,0,0.346869
1,1,0,4.1,787,0,24,817,800,0,0.346869
2,1,0,3.8,918,0,19,654,800,0,0.346869
3,0,0,3.7,88,0,74,2229,300,0,-0.694228
4,0,0,3.8,166,3,24,1928,600,0,-0.069570
...,...,...,...,...,...,...,...,...,...,...
51709,0,0,3.7,34,88,25,1792,800,6,0.346869
51711,0,0,2.5,81,88,25,101,800,6,0.346869
51712,0,0,3.6,27,88,6,867,1500,6,1.804405
51715,0,1,4.3,236,28,6,1209,2500,6,3.886599


In [44]:
# Display the frame with the 'type' column relabelled as 'Service_type'.
# NOTE(review): the result of rename() is neither assigned nor applied in place,
# so `data` itself still carries the 'type' column in the cells that follow —
# confirm this is intended.
data.rename(columns = {'type':"Service_type"})

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost_for_two,Service_type,cost_scaled
0,1,1,4.1,775,0,24,1901,800,0,0.346869
1,1,0,4.1,787,0,24,817,800,0,0.346869
2,1,0,3.8,918,0,19,654,800,0,0.346869
3,0,0,3.7,88,0,74,2229,300,0,-0.694228
4,0,0,3.8,166,3,24,1928,600,0,-0.069570
...,...,...,...,...,...,...,...,...,...,...
51709,0,0,3.7,34,88,25,1792,800,6,0.346869
51711,0,0,2.5,81,88,25,101,800,6,0.346869
51712,0,0,3.6,27,88,6,867,1500,6,1.804405
51715,0,1,4.3,236,28,6,1209,2500,6,3.886599


## ML MODELLING

Train/test split of the data

In [50]:
from sklearn.model_selection import train_test_split

# Target is the rating; every other column of `data` is used as a feature.
y = data['rate']
X = data.drop(columns='rate')

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Using RandomForestRegressor for prediction

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a random forest on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted model
# and the metrics printed below are reproducible across runs; without it every
# run builds a different forest.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))


RMSE: 0.14780552373529715
R²: 0.8915331944887819


In [58]:
# Saving the model
import os
import joblib

MODEL_PATH = 'models/restaurant_rating_model.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(model, MODEL_PATH)


['models/restaurant_rating_model.pkl']