In [1]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the data
data = pd.read_csv('swiggy.csv')

In [3]:
data

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json
...,...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks",21522053000452,https://www.swiggy.com/restaurants/the-food-de...,"The Food Delight, 94MC+X35, New Singhania Naga...",Menu/553122.json
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas,license,https://www.swiggy.com/restaurants/maitri-food...,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Menu/562647.json
148538,559435,Cafe Bella Ciao,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks",21522251000378,https://www.swiggy.com/restaurants/cafe-bella-...,"Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Menu/559435.json
148539,418989,GRILL ZILLA,Yavatmal,--,Too Few Ratings,₹ 250,Continental,21521251000241,https://www.swiggy.com/restaurants/grill-zilla...,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Menu/418989.json


In [4]:
# check column names
data.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [5]:
#check data types
data.dtypes

id               int64
name            object
city            object
rating          object
rating_count    object
cost            object
cuisine         object
lic_no          object
link            object
address         object
menu            object
dtype: object

#### Data Understanding and Cleaning

In [6]:
#check Duplicate 
data.duplicated().sum()

np.int64(0)

In [7]:
# check Missing Values
data.isnull().sum()

id                0
name             86
city              0
rating           86
rating_count     86
cost            131
cuisine          99
lic_no          229
link              0
address          86
menu              0
dtype: int64

In [8]:
data[data['name'].isnull()]

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
31044,397961,,"Greater Mohali,Chandigarh",,,,,,https://www.swiggy.com/restaurants/5-tara-grea...,,Menu/397961.json
32912,308071,,"West Chd,Chandigarh",,,,,,https://www.swiggy.com/restaurants/food-under-...,,Menu/308071.json
33046,308662,,"West Chd,Chandigarh",,,,,,https://www.swiggy.com/restaurants/franks-frie...,,Menu/308662.json
33488,170889,,"South Chd,Chandigarh",,,,,,https://www.swiggy.com/restaurants/dhaba-7-pre...,,Menu/170889.json
36492,390394,,"George Town,Chennai",,,,,,https://www.swiggy.com/restaurants/frozen-cafe...,,Menu/390394.json
...,...,...,...,...,...,...,...,...,...,...,...
137122,256827,,Satara,,,,,,https://www.swiggy.com/restaurants/rajdhani-pa...,,Menu/256827.json
137546,567595,,Shillong,,,,,,https://www.swiggy.com/restaurants/the-blue-ri...,,Menu/567595.json
137614,116615,,Shivamogga,,,,,,https://www.swiggy.com/restaurants/bakers-bowl...,,Menu/116615.json
138380,401760,,Sirsa,,,,,,https://www.swiggy.com/restaurants/murga-dlf-p...,,Menu/401760.json


In [9]:
data = data.dropna(subset=['name']).reset_index(drop = True) #Drop na name rows

In [None]:
#148541 - 86

148455

In [11]:
data.shape 

(148455, 11)

In [12]:
# check Missing Values
data.isnull().sum()

id                0
name              0
city              0
rating            0
rating_count      0
cost             45
cuisine          13
lic_no          143
link              0
address           0
menu              0
dtype: int64

In [13]:
data[data['cuisine'].isnull()]

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
47761,155151,Pocket Plates By Abongchiiz,"GTB Nagar,Delhi",--,Too Few Ratings,₹ 300,,23318002000022,https://www.swiggy.com/restaurants/pocket-plat...,"Pocket Plates By Abongchiiz, SHOP NO-15 DDA MA...",Menu/155151.json
51691,245476,NEW YORK WAFFLES & DINGES,"Greater Kailash 2,Delhi",--,Too Few Ratings,₹ 150,,23319008000450,https://www.swiggy.com/restaurants/new-york-wa...,"NEW YORK WAFFLES & DINGES, A 6 KAILASH COLONY,...",Menu/245476.json
52211,62718,34 Chowringhee Lane,"Ashok Vihar,Delhi",3.6,100+ ratings,₹ 350,,13318005000537,https://www.swiggy.com/restaurants/34-chowring...,"34 Chowringhee Lane, Shop no. 2, Phase 1, J bl...",Menu/62718.json
52507,349965,TRP-Tandoor Roll Paratha,"South Extension,Delhi",--,Too Few Ratings,₹ 150,,23321002000062,https://www.swiggy.com/restaurants/trp-tandoor...,"TRP-Tandoor Roll Paratha, D-59, Panchsheel Enc...",Menu/349965.json
57156,319007,Kathi Roll Point,"Lajpat Nagar,Delhi",--,Too Few Ratings,₹ 200,,23321010000517,https://www.swiggy.com/restaurants/kathi-roll-...,"Kathi Roll Point, SHOP NO.42,NEHRU NAGAR NEW D...",Menu/319007.json
91121,129110,Veggiee Treat,"Chinar Park,Kolkata",4.4,50+ ratings,₹ 200,,22819013001075,https://www.swiggy.com/restaurants/veggiee-tre...,"Veggiee Treat, JAMUNA POINT, C3, GROUND FLOOR,...",Menu/129110.json
99542,173687,Dixit Chaat House,"Gomti Nagar,Lucknow",--,Too Few Ratings,₹ 200,,22719743000127,https://www.swiggy.com/restaurants/dixit-chaat...,"Dixit Chaat House, ONE AWADH MALL, VIBHUTI KHA...",Menu/173687.json
111103,310000,Banjo's The food chain,"Airoli,Mumbai",--,Too Few Ratings,₹ 300,,21520073000201,https://www.swiggy.com/restaurants/banjos-the-...,"Banjo's The food chain, MSD ENTERPRISES,SHOP ...",Menu/310000.json
113307,207931,Green Chilli,"Nandanvan,Nagpur",--,Too Few Ratings,₹ 250,,license,https://www.swiggy.com/restaurants/green-chill...,"Green Chilli, 5 hanuman nagar, krida road, abo...",Menu/207931.json
125119,322909,Sahil's Kitchen,"Kondhwa,Pune",--,Too Few Ratings,₹ 180,,21520167000725,https://www.swiggy.com/restaurants/sahils-kitc...,"Sahil's Kitchen, E-51, GROUND FLOOR, KAMALDEEP...",Menu/322909.json


In [None]:
data = data.dropna(subset=['cuisine']).reset_index(drop=True) #drop na cuisine rows

In [15]:
data.shape

(148442, 11)

In [16]:
data.isnull().sum()

id                0
name              0
city              0
rating            0
rating_count      0
cost             44
cuisine           0
lic_no          143
link              0
address           0
menu              0
dtype: int64

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148442 non-null  int64 
 1   name          148442 non-null  object
 2   city          148442 non-null  object
 3   rating        148442 non-null  object
 4   rating_count  148442 non-null  object
 5   cost          148398 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148299 non-null  object
 8   link          148442 non-null  object
 9   address       148442 non-null  object
 10  menu          148442 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [18]:
# Categorical: name, city, cuisine
# Numerical: rating, rating_count, cost

In [19]:
data['cost'].value_counts()

cost
₹ 200    38632
₹ 300    29699
₹ 250    19742
₹ 150    12094
₹ 400    11711
         ...  
₹ 216        1
₹ 88         1
₹ 101        1
₹ 312        1
₹ 64         1
Name: count, Length: 363, dtype: int64

In [20]:
data['cost'].unique()

array(['₹ 200', '₹ 100', '₹ 250', '₹ 150', '₹ 300', '₹ 700', '₹ 650',
       '₹ 400', '₹ 350', '₹ 60', '₹ 110', '₹ 399', '₹ 249', '₹ 500',
       '₹ 99', '₹ 120', '₹ 499', '₹ 299', '₹ 199', '₹ 50', '₹ 180',
       '₹ 349', '₹ 1000', '₹ 599', '₹ 600', '₹ 800', '₹ 450', '₹ 149',
       '₹ 290', '₹ 175', '₹ 125', '₹ 8', '₹ 375', '₹ 275', '₹ 425',
       '₹ 225', '₹ 325', '₹ 75', '₹ 160', '₹ 550', '₹ 220', '₹ 20',
       '₹ 59', '₹ 1200', '₹ 210', '₹ 30', '₹ 310', '₹ 70', '₹ 170',
       '₹ 449', '₹ 280', '₹ 320', '₹ 1300', '₹ 850', '₹ 900', '₹ 40',
       '₹ 1500', '₹ 140', '₹ 1100', '₹ 410', '₹ 80', '₹ 10', '₹ 1245',
       '₹ 510', '₹ 90', '₹ 260', '₹ 1800', '₹ 5', '₹ 240', '₹ 460',
       '₹ 1900', '₹ 352', '₹ 298', '₹ 2', '₹ 252', '₹ 330', '₹ 750',
       '₹ 130', '₹ 2000', '₹ 198', '₹ 230', '₹ 999', '₹ 3999', nan,
       '₹ 235', '₹ 1600', '₹ 55', '₹ 179', '₹ 129', '₹ 360', '₹ 85',
       '₹ 248', '₹ 270', '₹ 25', '₹ 159', '₹ 370', '₹ 1050', '₹ 49',
       '₹ 699', '₹ 340', '₹ 190', 

In [21]:
#Impute with missing values 'cost'
#Remove the special character 
data['cost'] = data['cost'].str.replace('₹', ' ', regex=True).str.strip()

In [22]:
data['cost'].isna().sum()

np.int64(44)

In [23]:
#convert object to int
data['cost'] = data['cost'].fillna(0).astype(int)

In [24]:
data['cost'].isnull().sum()

np.int64(0)

In [25]:
data['cuisine'].value_counts()

cuisine
North Indian,Chinese    6471
Indian                  6414
Chinese                 5051
North Indian            4775
Indian,Chinese          4374
                        ... 
Biryani,Asian              1
Kerala,Home Food           1
Continental,Nepalese       1
Snacks,Jain                1
Juices,Sweets              1
Name: count, Length: 2132, dtype: int64

In [26]:
data[data['cost']== 0 ]

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
4966,132295,Chinese Hut,Akola,3.7,50+ ratings,0,Chinese,21518145000592,https://www.swiggy.com/restaurants/chinese-hut...,"Chinese Hut, Sindhi Colony, Akola, Maharashtra...",Menu/132295.json
4983,134728,Zamzam Family Restaurant,Akola,3.2,50+ ratings,0,"Indian,Biryani",21519247000522,https://www.swiggy.com/restaurants/zamzam-fami...,"Zamzam Family Restaurant, Subhash Chowk,Red Cr...",Menu/134728.json
24835,131745,Anvita Veg Restaurant,Beed,3.7,100+ ratings,0,"North Indian,South Indian",11514047000911,https://www.swiggy.com/restaurants/anvita-veg-...,"Anvita Veg Restaurant, Jalna road Beed",Menu/131745.json
24836,132958,Bansi Pav bhaji,Beed,3.6,100+ ratings,0,Indian,21519231000433,https://www.swiggy.com/restaurants/bansi-pav-b...,"Bansi Pav bhaji, Sathe Chowk",Menu/132958.json
24838,131755,Sai Prasad Family Restaurant,Beed,3.8,20+ ratings,0,"Chinese,North Indian",11519047000631,https://www.swiggy.com/restaurants/sai-prasad-...,"Sai Prasad Family Restaurant, Mane Complex",Menu/131755.json
24839,131913,Taj Chicken Chinese,Beed,3.2,20+ ratings,0,Chinese,21516231000666,https://www.swiggy.com/restaurants/taj-chicken...,"Taj Chicken Chinese, Basir Gang , Beed",Menu/131913.json
24840,133551,Hotel Maithili,Beed,3.8,100+ ratings,0,"North Indian,Punjabi",11520147000042,https://www.swiggy.com/restaurants/hotel-maith...,"Hotel Maithili, D.P Road, Beed",Menu/133551.json
24846,132123,Copisa,Beed,3.2,20+ ratings,0,"Pizzas,Italian",21516231000093,https://www.swiggy.com/restaurants/copisa-city...,"Copisa, Sarda Capital",Menu/132123.json
24848,131919,Taj Biryani House,Beed,2.5,20+ ratings,0,"Biryani,North Indian",21516231000666,https://www.swiggy.com/restaurants/taj-biryani...,"Taj Biryani House, Basir Gang , Beed",Menu/131919.json
24864,131737,Shantiraj Bhel Pakodi Center,Beed,--,Too Few Ratings,0,Chaat,21519231000450,https://www.swiggy.com/restaurants/shantiraj-b...,"Shantiraj Bhel Pakodi Center, Rajiv Gandhi Cho...",Menu/131737.json


In [27]:
data[data['cost']== 0 ].shape

(44, 11)

In [29]:
#fill the cost value depends on unique cosine

In [30]:
# calculate the mean cost for each category of cuisine to (impute it cost 0 )
mean_values = data.groupby('cuisine')['cost'].mean().astype(int).to_dict()

In [39]:
mean_values

{'8:15 To 11:30 Pm': 194,
 'Afghani': 308,
 'Afghani,American': 250,
 'Afghani,Arabian': 330,
 'Afghani,Bangladeshi': 150,
 'Afghani,Barbecue': 300,
 'Afghani,Bengali': 500,
 'Afghani,Biryani': 255,
 'Afghani,Chinese': 262,
 'Afghani,Combo': 200,
 'Afghani,European': 200,
 'Afghani,Fast Food': 300,
 'Afghani,Grill': 250,
 'Afghani,Hyderabadi': 200,
 'Afghani,Indian': 270,
 'Afghani,Italian': 250,
 'Afghani,Mughlai': 300,
 'Afghani,North Indian': 249,
 'Afghani,South Indian': 199,
 'Afghani,Tandoor': 283,
 'African,Bakery': 250,
 'African,Continental': 250,
 'African,Indian': 150,
 'American': 321,
 'American,Andhra': 450,
 'American,Arabian': 321,
 'American,Asian': 658,
 'American,Assamese': 225,
 'American,Bakery': 278,
 'American,Barbecue': 443,
 'American,Bengali': 300,
 'American,Beverages': 280,
 'American,Biryani': 282,
 'American,British': 200,
 'American,Burgers': 258,
 'American,Cafe': 383,
 'American,Chaat': 200,
 'American,Chinese': 245,
 'American,Combo': 191,
 'American,C

In [None]:
#replace all 0 cost values with the average cost for that cuisine
#data.loc[row_selection, column_selection]


In [40]:
#impute the value
data.loc[data['cost'] == 0, 'cost'] = data['cuisine'].map(mean_values)

In [41]:
#convert object to numeric
data['cost'] = pd.to_numeric(data['cost'], errors = 'coerce')

In [42]:
data['cost'].value_counts()

cost
200     38632
300     29699
250     19742
150     12094
400     11711
        ...  
271         1
2700        1
3100        1
333         1
276         1
Name: count, Length: 373, dtype: int64

In [43]:
data['cost'].isnull().sum()

np.int64(0)

In [44]:
data['rating_count'].value_counts()

rating_count
Too Few Ratings    87005
20+ ratings        21634
100+ ratings       20547
50+ ratings        12008
500+ ratings        4396
1K+ ratings         2739
5K+ ratings           98
10K+ ratings          15
Name: count, dtype: int64

In [45]:
data['rating_count'] = data['rating_count'].replace('Too Few Ratings', 10)

In [46]:
data['rating_count'] = data['rating_count'].replace('20+ ratings', 35)

In [47]:
data['rating_count']

0                   10
1          50+ ratings
2         100+ ratings
3                   35
4                   10
              ...     
148437              10
148438              10
148439              10
148440              10
148441              10
Name: rating_count, Length: 148442, dtype: object

In [48]:
data['rating_count'] = data['rating_count'].replace('50+ ratings', 75)

In [49]:
data['rating_count'] = data['rating_count'].replace('100+ ratings', 250)

In [50]:
data['rating_count'] = data['rating_count'].replace('500+ ratings', 750)

In [51]:
data['rating_count'] = data['rating_count'].replace('1K+ ratings', 2500)

In [52]:
data['rating_count'] = data['rating_count'].replace('5K+ ratings', 7500)

In [53]:
data['rating_count'] = data['rating_count'].replace('10K+ ratings', 15000)

  data['rating_count'] = data['rating_count'].replace('10K+ ratings', 15000)


In [54]:
data['rating_count'].value_counts()

rating_count
10       87005
35       21634
250      20547
75       12008
750       4396
2500      2739
7500        98
15000       15
Name: count, dtype: int64

In [55]:
data['rating_count'] = pd.to_numeric(data['rating_count'], errors = 'coerce')

In [56]:
data['rating'].value_counts()

rating
--     87005
4.0     6532
4.1     6296
4.2     5821
3.8     5736
3.9     5435
4.3     5011
3.7     4252
4.4     3148
3.5     2963
3.6     2924
3.4     1879
3.3     1801
4.5     1778
4.6     1334
3.2     1202
3.0      859
3.1      791
4.7      648
2.8      473
2.9      471
4.8      338
2.7      325
2.5      221
5.0      209
2.6      208
4.9      174
2.3      128
2.4      118
2.2       89
2.0       72
2.1       61
1.9       46
1.8       27
1.5       18
1.6       17
1.7       13
1.4       10
1.2        4
1.3        3
1.1        1
1.0        1
Name: count, dtype: int64

In [57]:
data[data['rating'] == '--']

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,10,200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
4,543530,GRILL MASTERS,Abohar,--,10,250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json
7,244866,Shri Balaji Vaishno Dhaba,Abohar,--,10,100,North Indian,22119652000389,https://www.swiggy.com/restaurants/shri-balaji...,"Shri Balaji Vaishno Dhaba, St no 13,6th chowk,...",Menu/244866.json
9,158193,yummy hub,Abohar,--,10,200,Indian,22119652000045,https://www.swiggy.com/restaurants/yummy-hub-c...,"yummy hub, hanumangarh road near dr naveen set...",Menu/158193.json
10,407249,CHAWLA SAAB THE JUICE MASTER,Abohar,--,10,300,"Juices,Beverages",22121652000374,https://www.swiggy.com/restaurants/chawla-saab...,"CHAWLA SAAB THE JUICE MASTER, SAHITYA SADAN MA...",Menu/407249.json
...,...,...,...,...,...,...,...,...,...,...,...
148437,553122,The Food Delight,Yavatmal,--,10,200,"Fast Food,Snacks",21522053000452,https://www.swiggy.com/restaurants/the-food-de...,"The Food Delight, 94MC+X35, New Singhania Naga...",Menu/553122.json
148438,562647,MAITRI FOODS & BEVERAGES,Yavatmal,--,10,300,Pizzas,license,https://www.swiggy.com/restaurants/maitri-food...,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Menu/562647.json
148439,559435,Cafe Bella Ciao,Yavatmal,--,10,300,"Fast Food,Snacks",21522251000378,https://www.swiggy.com/restaurants/cafe-bella-...,"Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Menu/559435.json
148440,418989,GRILL ZILLA,Yavatmal,--,10,250,Continental,21521251000241,https://www.swiggy.com/restaurants/grill-zilla...,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Menu/418989.json


In [58]:
data['rating'] = pd.to_numeric(data['rating'], errors= 'coerce')

In [60]:
data['rating'].isnull().sum()

np.int64(87005)

In [61]:
median_values = data['rating'].median()

In [63]:
median_values

np.float64(4.0)

In [64]:
data['rating'] = data['rating'].fillna(median_values)

In [65]:
data['rating'].value_counts()

rating
4.0    93537
4.1     6296
4.2     5821
3.8     5736
3.9     5435
4.3     5011
3.7     4252
4.4     3148
3.5     2963
3.6     2924
3.4     1879
3.3     1801
4.5     1778
4.6     1334
3.2     1202
3.0      859
3.1      791
4.7      648
2.8      473
2.9      471
4.8      338
2.7      325
2.5      221
5.0      209
2.6      208
4.9      174
2.3      128
2.4      118
2.2       89
2.0       72
2.1       61
1.9       46
1.8       27
1.5       18
1.6       17
1.7       13
1.4       10
1.2        4
1.3        3
1.1        1
1.0        1
Name: count, dtype: int64

In [66]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148442 non-null  int64  
 1   name          148442 non-null  object 
 2   city          148442 non-null  object 
 3   rating        148442 non-null  float64
 4   rating_count  148442 non-null  int64  
 5   cost          148442 non-null  int64  
 6   cuisine       148442 non-null  object 
 7   lic_no        148299 non-null  object 
 8   link          148442 non-null  object 
 9   address       148442 non-null  object 
 10  menu          148442 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 12.5+ MB


In [67]:
data.isnull().sum()

id                0
name              0
city              0
rating            0
rating_count      0
cost              0
cuisine           0
lic_no          143
link              0
address           0
menu              0
dtype: int64

In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148442 non-null  int64  
 1   name          148442 non-null  object 
 2   city          148442 non-null  object 
 3   rating        148442 non-null  float64
 4   rating_count  148442 non-null  int64  
 5   cost          148442 non-null  int64  
 6   cuisine       148442 non-null  object 
 7   lic_no        148299 non-null  object 
 8   link          148442 non-null  object 
 9   address       148442 non-null  object 
 10  menu          148442 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 12.5+ MB


In [69]:
data[data['lic_no'].isnull()]

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
795,301904,SHAKE IT BABY,Agra,4.0,10,200,"Italian,Beverages",,https://www.swiggy.com/restaurants/shake-it-ba...,"SHAKE IT BABY, M/S SHAKE IT BABY SHOP NO-6 SUR...",Menu/301904.json
1299,358634,Meal Factory - By The Healthy Stove,"Vastrapur,Ahmedabad",4.0,10,300,Indian,,https://www.swiggy.com/restaurants/meal-factor...,"Meal Factory - By The Healthy Stove, 'The Heal...",Menu/358634.json
1303,77532,Shiv Shakti Kathiyawadi,"Vastrapur,Ahmedabad",4.1,250,300,Indian,,https://www.swiggy.com/restaurants/shiv-shakti...,"Shiv Shakti Kathiyawadi, 23-24, Ground Floor, ...",Menu/77532.json
4242,93703,Tomato'S,"Chandkheda,Ahmedabad",4.3,250,800,"Mexican,North Indian",,https://www.swiggy.com/restaurants/tomatos-mot...,"Tomato'S, Ground Floor, Shop No. 1, Dwarkesh B...",Menu/93703.json
9893,3306,Taste of Darjeeling,"Koramangala,Bangalore",4.0,10,300,Chinese,,https://www.swiggy.com/restaurants/taste-of-da...,"Taste of Darjeeling, H-14, 1st Main, 80 Feet R...",Menu/3306.json
...,...,...,...,...,...,...,...,...,...,...,...
143658,302775,The Fullmoon Cafe & Restro,Udaipur,2.6,35,150,Indian,,https://www.swiggy.com/restaurants/the-fullmoo...,"The Fullmoon Cafe & Restro, 6 ka 16 Ram Singh ...",Menu/302775.json
145484,82562,Swagath Dine INN,Vellore,3.7,250,250,"Biryani,North Indian",,https://www.swiggy.com/restaurants/swagath-din...,"Swagath Dine INN, Opp to VIT,VIT Main Road, Ve...",Menu/82562.json
146982,65981,Jagadhamba Fast Foods,"N A D,Vizag",3.5,35,200,"Chinese,Biryani",,https://www.swiggy.com/restaurants/jagadhamba-...,"Jagadhamba Fast Foods, Opposite Kamala Nursing...",Menu/65981.json
147030,246351,WRAP IT - Eat Yummmm!,"Gajuwaka,Vizag",4.1,750,190,"Mexican,Fast Food",,https://www.swiggy.com/restaurants/wrap-it-eat...,"WRAP IT - Eat Yummmm!, Door 16-246, Kitchen 1,...",Menu/246351.json


In [70]:
data['lic_no']= data['lic_no'].fillna(0)

In [71]:
data['lic_no'].isnull().sum()

np.int64(0)

In [72]:
data[data['lic_no'].isnull()]

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu


In [73]:
data.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [74]:
data['city'].value_counts()

city
Bikaner                 1666
Noida-1                 1427
Indirapuram,Delhi       1279
BTM,Bangalore           1161
Rohini,Delhi            1135
                        ... 
Alwarpet,Chennai           1
BBK_Chattarpur,Delhi       1
BBK_MayurVihar,Delhi       1
Hampi                      1
Manali                     1
Name: count, Length: 821, dtype: int64

In [75]:
data.shape

(148442, 11)

In [76]:
data.isnull().sum()

id              0
name            0
city            0
rating          0
rating_count    0
cost            0
cuisine         0
lic_no          0
link            0
address         0
menu            0
dtype: int64

#### cleaned_csv data

In [77]:
data.to_csv('cleaned_data.csv',index = False)

In [78]:
data.describe()

Unnamed: 0,id,rating,rating_count,cost
count,148442.0,148442.0,148442.0,148442.0
mean,363514.675139,3.956326,126.440563,287.598604
std,167882.864986,0.300492,430.360336,796.645176
min,211.0,1.0,10.0,1.0
25%,233524.5,4.0,10.0,200.0
50%,412706.5,4.0,10.0,250.0
75%,502250.5,4.0,75.0,300.0
max,581031.0,5.0,15000.0,300350.0


In [79]:
#data for encoding 
data_1 = data.drop(columns=['name', 'lic_no', 'link', 'address', 'menu'])

In [80]:
data_1

Unnamed: 0,id,city,rating,rating_count,cost,cuisine
0,567335,Abohar,4.0,10,200,"Beverages,Pizzas"
1,531342,Abohar,4.4,75,200,"Sweets,Bakery"
2,158203,Abohar,3.8,250,100,Beverages
3,187912,Abohar,3.7,35,250,"Fast Food,Indian"
4,543530,Abohar,4.0,10,250,"Italian-American,Fast Food"
...,...,...,...,...,...,...
148437,553122,Yavatmal,4.0,10,200,"Fast Food,Snacks"
148438,562647,Yavatmal,4.0,10,300,Pizzas
148439,559435,Yavatmal,4.0,10,300,"Fast Food,Snacks"
148440,418989,Yavatmal,4.0,10,250,Continental


In [81]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148442 non-null  int64  
 1   city          148442 non-null  object 
 2   rating        148442 non-null  float64
 3   rating_count  148442 non-null  int64  
 4   cost          148442 non-null  int64  
 5   cuisine       148442 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 6.8+ MB


In [82]:
data_1['city'].unique()

array(['Abohar', 'Adilabad', 'Adityapur', 'Adoni', 'Agartala', 'Agra',
       'Vastrapur,Ahmedabad', 'GOTA,Ahmedabad',
       'Paldi & Ambawadi,Ahmedabad', 'Ghatlodia,Ahmedabad',
       'Bopal,Ahmedabad', 'Gandhinagar,Ahmedabad', 'LalDarwaja,Ahmedabad',
       'Naranpura,Ahmedabad', 'Navrangpura,Ahmedabad',
       'Science City,Ahmedabad', 'Maninagar,Ahmedabad',
       'Chandkheda,Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola',
       'Alappuzha', 'Aligarh', 'Alipurduar', 'Allahabad', 'Alwar',
       'Ambala', 'Ambikapur', 'Ambur', 'Amravati', 'Amreli', 'Amritsar',
       'Anand', 'Anantapur', 'Ankleshwar', 'Arakkonam', 'Arambagh',
       'Arrah', 'Aruppukottai', 'Asansol', 'Aurangabad',
       'Aurangabad_bihar', 'Azamgarh', 'Baddi', 'Bagalkot', 'Bagdogra',
       'Bahadurgarh', 'Bahraich', 'Balaghat', 'Balangir', 'Balasore',
       'Ballari', 'Balrampur', 'Balurghat', 'Banda',
       'Yeshwanthpur,Bangalore', 'Geddalahalli,Bangalore',
       'Koramangala,Bangalore', 'JP Nagar,B

In [83]:
data_1[data_1['city'].str.count(',') == 0] 

Unnamed: 0,id,city,rating,rating_count,cost,cuisine
0,567335,Abohar,4.0,10,200,"Beverages,Pizzas"
1,531342,Abohar,4.4,75,200,"Sweets,Bakery"
2,158203,Abohar,3.8,250,100,Beverages
3,187912,Abohar,3.7,35,250,"Fast Food,Indian"
4,543530,Abohar,4.0,10,250,"Italian-American,Fast Food"
...,...,...,...,...,...,...
148437,553122,Yavatmal,4.0,10,200,"Fast Food,Snacks"
148438,562647,Yavatmal,4.0,10,300,Pizzas
148439,559435,Yavatmal,4.0,10,300,"Fast Food,Snacks"
148440,418989,Yavatmal,4.0,10,250,Continental


In [84]:
data_1[data_1['city'].str.count(',') == 0].city.nunique()

527

In [85]:
data_1[data_1['city'].str.count(',') == 0].city.unique()

array(['Abohar', 'Adilabad', 'Adityapur', 'Adoni', 'Agartala', 'Agra',
       'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola', 'Alappuzha', 'Aligarh',
       'Alipurduar', 'Allahabad', 'Alwar', 'Ambala', 'Ambikapur', 'Ambur',
       'Amravati', 'Amreli', 'Amritsar', 'Anand', 'Anantapur',
       'Ankleshwar', 'Arakkonam', 'Arambagh', 'Arrah', 'Aruppukottai',
       'Asansol', 'Aurangabad', 'Aurangabad_bihar', 'Azamgarh', 'Baddi',
       'Bagalkot', 'Bagdogra', 'Bahadurgarh', 'Bahraich', 'Balaghat',
       'Balangir', 'Balasore', 'Ballari', 'Balrampur', 'Balurghat',
       'Banda', 'Bantwal', 'Bapatlachirala', 'Baramati', 'Baran',
       'Bardhaman', 'Bardoli', 'Bareilly', 'Barmer', 'Barnala', 'Barshi',
       'Barwani', 'Basirhat', 'Basti', 'Batala', 'Bathinda', 'Beawar',
       'Beed', 'Begusarai', 'Bela-pratapgarh', 'Belgaum', 'Berhampore',
       'Berhampur', 'Bettiah', 'Betul', 'Bhadohi', 'Bhadrachalam',
       'Bhagalpur', 'Bhandara', 'Bharabanki', 'Bharatpur', 'Bharuch',
       'Bhatkal'

In [86]:
data_1[data_1['city'].str.count(',') == 1]

Unnamed: 0,id,city,rating,rating_count,cost,cuisine
1100,40184,"Vastrapur,Ahmedabad",4.3,250,1200,"Indian,Chinese"
1101,45635,"Vastrapur,Ahmedabad",4.0,250,150,Fast Food
1102,483947,"Vastrapur,Ahmedabad",4.0,10,300,"Chinese,Fast Food"
1103,483946,"Vastrapur,Ahmedabad",4.0,10,300,Chinese
1104,181107,"Vastrapur,Ahmedabad",4.0,10,350,Desserts
...,...,...,...,...,...,...
147919,510818,"Dwarka Nagar,Vizag",4.0,10,200,"North Indian,Chinese"
147920,554832,"Dwarka Nagar,Vizag",4.1,35,250,"Sweets,Snacks"
147921,365893,"Dwarka Nagar,Vizag",4.0,250,250,"Beverages,Desserts"
147922,456123,"Dwarka Nagar,Vizag",3.4,35,300,"Biryani,North Indian"


In [87]:
data_1[data_1['city'].str.count(',') == 1].city.nunique()

292

In [88]:
data_1[data_1['city'].str.count(',') == 1].city.unique()

array(['Vastrapur,Ahmedabad', 'GOTA,Ahmedabad',
       'Paldi & Ambawadi,Ahmedabad', 'Ghatlodia,Ahmedabad',
       'Bopal,Ahmedabad', 'Gandhinagar,Ahmedabad', 'LalDarwaja,Ahmedabad',
       'Naranpura,Ahmedabad', 'Navrangpura,Ahmedabad',
       'Science City,Ahmedabad', 'Maninagar,Ahmedabad',
       'Chandkheda,Ahmedabad', 'Yeshwanthpur,Bangalore',
       'Geddalahalli,Bangalore', 'Koramangala,Bangalore',
       'JP Nagar,Bangalore', 'Mahadevpura,Bangalore', 'HSR,Bangalore',
       'Arekere,Bangalore', 'Indiranagar,Bangalore',
       'Banashankari,Bangalore', 'Whitefield,Bangalore',
       'Nagavara & Hennur,Bangalore',
       'Kammanahalli/Kalyan Nagar,Bangalore',
       'Kumaraswamy Layout & Uttarahalli,Bangalore', 'BTM,Bangalore',
       'Battarahalli,Bangalore', 'Basaveshwaranagar,Bangalore',
       'Frazer Town,Bangalore', 'Yelahanka,Bangalore',
       'Majestic,Bangalore', 'Kanakapura Road,Bangalore',
       'R.T. Nagar,Bangalore', 'Kadugodi,Bangalore',
       'Marathahalli,Banga

In [89]:
data_1[data_1['city'].str.count(',') == 2]

Unnamed: 0,id,city,rating,rating_count,cost,cuisine
17917,515500,"Sanjay Nagar, New BEL Road,Bangalore",2.1,75,300,"Biryani,Chinese"
17918,111118,"Sanjay Nagar, New BEL Road,Bangalore",3.8,250,300,South Indian
17919,376689,"Sanjay Nagar, New BEL Road,Bangalore",3.6,35,250,"North Indian,Chinese"
17920,480671,"Sanjay Nagar, New BEL Road,Bangalore",4.0,10,150,"Desserts,Beverages"
17921,435743,"Sanjay Nagar, New BEL Road,Bangalore",4.2,250,400,Pizzas
...,...,...,...,...,...,...
78726,471988,"Tarnaka, Nacharam & Malkajigiri,Hyderabad",4.0,10,150,Chinese
78727,424315,"Tarnaka, Nacharam & Malkajigiri,Hyderabad",4.0,10,150,"Lebanese,Snacks"
78728,423468,"Tarnaka, Nacharam & Malkajigiri,Hyderabad",4.0,10,200,Chinese
78729,529544,"Tarnaka, Nacharam & Malkajigiri,Hyderabad",4.0,10,299,"Indian,Snacks"


In [90]:
sub_city = data_1[data_1['city'].str.count(',') == 2]

In [91]:
sub_city['city'].unique()

array(['Sanjay Nagar, New BEL Road,Bangalore',
       'Tarnaka, Nacharam & Malkajigiri,Hyderabad'], dtype=object)

In [92]:
sub_city['city']

17917         Sanjay Nagar, New BEL Road,Bangalore
17918         Sanjay Nagar, New BEL Road,Bangalore
17919         Sanjay Nagar, New BEL Road,Bangalore
17920         Sanjay Nagar, New BEL Road,Bangalore
17921         Sanjay Nagar, New BEL Road,Bangalore
                           ...                    
78726    Tarnaka, Nacharam & Malkajigiri,Hyderabad
78727    Tarnaka, Nacharam & Malkajigiri,Hyderabad
78728    Tarnaka, Nacharam & Malkajigiri,Hyderabad
78729    Tarnaka, Nacharam & Malkajigiri,Hyderabad
78730    Tarnaka, Nacharam & Malkajigiri,Hyderabad
Name: city, Length: 927, dtype: object

In [93]:
data_1[data_1['city'] == 'Yavatmal']

Unnamed: 0,id,city,rating,rating_count,cost,cuisine
148376,141318,Yavatmal,3.8,250,200,"North Indian,Fast Food"
148377,193866,Yavatmal,4.0,75,200,"North Indian,Maharashtrian"
148378,234395,Yavatmal,3.5,75,100,"American,Chinese"
148379,513588,Yavatmal,4.0,10,200,Chinese
148380,141314,Yavatmal,3.8,75,200,North Indian
...,...,...,...,...,...,...
148437,553122,Yavatmal,4.0,10,200,"Fast Food,Snacks"
148438,562647,Yavatmal,4.0,10,300,Pizzas
148439,559435,Yavatmal,4.0,10,300,"Fast Food,Snacks"
148440,418989,Yavatmal,4.0,10,250,Continental


In [94]:
# create a function for city , main_city
def split_city_data(row):
    parts = [p.strip() for p in row.split(',') if p.strip()]  # remove extra spaces and empty parts
    
    if len(parts) == 1:
        return [parts[0], '0']  # like "Abohar"
    else:
        city = ', '.join(parts[:-1])  # join all parts except last
        main_city = parts[-1]         # last part is main city
        return [city, main_city]


In [95]:
data_1[['city', 'main_city']] = data_1['city'].apply(split_city_data).apply(pd.Series)

In [99]:
data_1.head()

Unnamed: 0,id,city,rating,rating_count,cost,cuisine,main_city
0,567335,Abohar,4.0,10,200,"Beverages,Pizzas",0
1,531342,Abohar,4.4,75,200,"Sweets,Bakery",0
2,158203,Abohar,3.8,250,100,Beverages,0
3,187912,Abohar,3.7,35,250,"Fast Food,Indian",0
4,543530,Abohar,4.0,10,250,"Italian-American,Fast Food",0


In [100]:
data_1[data_1['cuisine'].str.count(',')==1]

Unnamed: 0,id,city,rating,rating_count,cost,cuisine,main_city
0,567335,Abohar,4.0,10,200,"Beverages,Pizzas",0
1,531342,Abohar,4.4,75,200,"Sweets,Bakery",0
3,187912,Abohar,3.7,35,250,"Fast Food,Indian",0
4,543530,Abohar,4.0,10,250,"Italian-American,Fast Food",0
8,156602,Abohar,4.2,35,100,"Snacks,Chaat",0
...,...,...,...,...,...,...,...
148433,529034,Yavatmal,4.0,10,300,"Snacks,Biryani",0
148435,561381,Yavatmal,4.0,10,100,"Chinese,Fast Food",0
148436,214210,Yavatmal,4.0,10,300,"Biryani,Maharashtrian",0
148437,553122,Yavatmal,4.0,10,200,"Fast Food,Snacks",0


In [101]:
data_1[data_1['cuisine'].str.count(',')==0]

Unnamed: 0,id,city,rating,rating_count,cost,cuisine,main_city
2,158203,Abohar,3.8,250,100,Beverages,0
5,158204,Abohar,3.6,35,200,Continental,0
6,156588,Abohar,4.0,250,150,North Indian,0
7,244866,Abohar,4.0,10,100,North Indian,0
9,158193,Abohar,4.0,10,200,Indian,0
...,...,...,...,...,...,...,...
148431,257017,Yavatmal,4.0,10,200,North Indian,0
148434,185296,Yavatmal,4.0,10,250,North Indian,0
148438,562647,Yavatmal,4.0,10,300,Pizzas,0
148440,418989,Yavatmal,4.0,10,250,Continental,0


In [102]:
def split_cuisine_data(row):
    parts = row.split(',')
    if len(parts) == 1: #like Beverages
        return [parts[0], '0']
    elif len(parts) == 2:
        return[parts[0], parts[1]]

In [103]:
data_1[['cuisine_1', 'cuisine_2']] = data_1['cuisine'].apply(split_cuisine_data).apply(pd.Series)

In [104]:
data_1 = data_1.drop(columns=('cuisine'))

In [105]:
data_1.head()

Unnamed: 0,id,city,rating,rating_count,cost,main_city,cuisine_1,cuisine_2
0,567335,Abohar,4.0,10,200,0,Beverages,Pizzas
1,531342,Abohar,4.4,75,200,0,Sweets,Bakery
2,158203,Abohar,3.8,250,100,0,Beverages,0
3,187912,Abohar,3.7,35,250,0,Fast Food,Indian
4,543530,Abohar,4.0,10,250,0,Italian-American,Fast Food


In [106]:
data_1['cuisine_1'].unique()

array(['Beverages', 'Sweets', 'Fast Food', 'Italian-American',
       'Continental', 'North Indian', 'Snacks', 'Indian', 'Juices',
       'Tandoor', 'Punjabi', 'Mughlai', 'Pizzas', 'Chinese', 'Ice Cream',
       'Chaat', 'Bakery', 'American', 'European', 'Biryani',
       'South Indian', 'Desserts', 'Street Food', 'Nepalese', 'Paan',
       'Healthy Food', 'Bengali', 'Thalis', 'Waffle', 'Arabian', 'Combo',
       'Tibetan', 'Burgers', 'Bihari', 'Italian', 'Salads', 'Kebabs',
       'Asian', 'Thai', 'North Eastern', 'Cafe', 'Pan-Asian', 'Lebanese',
       'Maharashtrian', 'Mexican', 'Pastas', 'Rajasthani', 'Gujarati',
       'Seafood', 'Japanese', 'Sushi', 'Middle Eastern', 'French',
       'Hyderabadi', 'Grill', 'SVANidhi Street Food Vendor', 'Afghani',
       'Oriental', 'Home Food', 'Barbecue', 'Korean', 'Kerala', 'Andhra',
       'British', 'Oriya', 'Portuguese', 'Lucknowi', 'Mangalorean',
       'Chettinad', 'Mediterranean', 'Naga', 'Turkish', 'Assamese',
       'Steakhouse', 'Coas

In [107]:
data_1['cuisine_2'].unique()

array(['Pizzas', 'Bakery', '0', 'Indian', 'Fast Food', 'Chaat',
       'Beverages', 'Desserts', 'Chinese', 'North Indian', 'Tandoor',
       'American', 'Thalis', 'Snacks', 'South Indian', 'Italian',
       'Street Food', 'Kebabs', 'Biryani', 'Salads', 'Pastas',
       'Continental', 'Bengali', 'Burgers', 'Ice Cream', 'Tibetan',
       'Thai', 'Hyderabadi', 'Sweets', 'Lebanese', 'Nepalese', 'Mughlai',
       'Lucknowi', 'Healthy Food', 'Afghani', 'Asian', 'Combo', 'Seafood',
       'Waffle', 'Italian-American', 'Punjabi', 'Arabian', 'Barbecue',
       'Mexican', 'Ice Cream Cakes', 'Gujarati', 'Juices', 'Jain',
       'Pan-Asian', 'Rajasthani', 'Mediterranean', 'Burmese', 'Oriental',
       'Maharashtrian', 'Kerala', 'Home Food', 'Indonesian',
       'Middle Eastern', 'Grill', 'Japanese', 'Paan',
       'Biryani - Shivaji Military Hotel', 'Greek', 'Cafe',
       'Use Code JUMBO30 to avail', 'Chettinad', 'Coastal', 'Andhra',
       'Turkish', 'African', 'Tex-Mex', 'Oriya', 'British', 'Ma

In [108]:
data_1['cuisine_1'].isnull().sum()

np.int64(0)

In [109]:
data_1['cuisine_2'].isnull().sum()

np.int64(0)

In [110]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148442 non-null  int64  
 1   city          148442 non-null  object 
 2   rating        148442 non-null  float64
 3   rating_count  148442 non-null  int64  
 4   cost          148442 non-null  int64  
 5   main_city     148442 non-null  object 
 6   cuisine_1     148442 non-null  object 
 7   cuisine_2     148442 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 9.1+ MB


In [111]:
data_1

Unnamed: 0,id,city,rating,rating_count,cost,main_city,cuisine_1,cuisine_2
0,567335,Abohar,4.0,10,200,0,Beverages,Pizzas
1,531342,Abohar,4.4,75,200,0,Sweets,Bakery
2,158203,Abohar,3.8,250,100,0,Beverages,0
3,187912,Abohar,3.7,35,250,0,Fast Food,Indian
4,543530,Abohar,4.0,10,250,0,Italian-American,Fast Food
...,...,...,...,...,...,...,...,...
148437,553122,Yavatmal,4.0,10,200,0,Fast Food,Snacks
148438,562647,Yavatmal,4.0,10,300,0,Pizzas,0
148439,559435,Yavatmal,4.0,10,300,0,Fast Food,Snacks
148440,418989,Yavatmal,4.0,10,250,0,Continental,0


In [124]:
data_1.iloc[[71600, 345, 68, 23464,2345,987,7654,45435]]#sample

Unnamed: 0,id,city,rating,rating_count,cost,main_city,cuisine_1,cuisine_2
71600,410464,Himayath Nagar,4.0,10,350,Hyderabad,Combo,0
345,530618,Adityapur,4.6,75,100,0,Bengali,Sweets
68,500628,Adilabad,4.0,10,250,0,Chinese,Snacks
23464,393107,Central Bangalore,4.2,750,270,Bangalore,Healthy Food,Pizzas
2345,209885,Ghatlodia,4.0,250,200,Ahmedabad,Gujarati,Punjabi
987,469550,Agra,4.0,10,250,0,Beverages,0
7654,469208,Aurangabad,4.1,35,200,0,North Indian,Punjabi
45435,443900,Sai Baba Colony,4.0,10,300,Coimbatore,Biryani,South Indian


#### Data Preprocessing

In [125]:
# perform one hot encoding

#initialize
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

#select categorical columns
categorical_data = data_1[['city','main_city', 'cuisine_1', 'cuisine_2']]
categorical_data = categorical_data.astype(str) #convert to str

#fit and transform data
encoder_array = encoder.fit_transform(categorical_data)

#convert encoder_array to dataframe
encoder_df = pd.DataFrame(encoder_array, columns=encoder.get_feature_names_out(['city','main_city', 'cuisine_1', 'cuisine_2']))
print(encoder_df)


        city_Abids & Koti  city_Abohar  city_Adajan  city_Adilabad  \
0                     0.0          1.0          0.0            0.0   
1                     0.0          1.0          0.0            0.0   
2                     0.0          1.0          0.0            0.0   
3                     0.0          1.0          0.0            0.0   
4                     0.0          1.0          0.0            0.0   
...                   ...          ...          ...            ...   
148437                0.0          0.0          0.0            0.0   
148438                0.0          0.0          0.0            0.0   
148439                0.0          0.0          0.0            0.0   
148440                0.0          0.0          0.0            0.0   
148441                0.0          0.0          0.0            0.0   

        city_Adityapur  city_Adoni  city_Adyar  city_Agartala  city_Agra  \
0                  0.0         0.0         0.0            0.0        0.0   
1      

In [126]:
encoder_df.dtypes

city_Abids & Koti                         float64
city_Abohar                               float64
city_Adajan                               float64
city_Adilabad                             float64
city_Adityapur                            float64
                                           ...   
cuisine_2_Use Code JUMBO30 to avail       float64
cuisine_2_Use code XPRESS121 to avail.    float64
cuisine_2_Vietnamese                      float64
cuisine_2_Waffle                          float64
cuisine_2_indian                          float64
Length: 1078, dtype: object

In [128]:
final_encoded_data = pd.concat([data_1[['id', 'rating', 'cost', 'rating_count']], encoder_df], axis= 1)

In [129]:
# #save the encoded_data as a pickle file
with open('final_data.pkl', 'wb') as file:
    pickle.dump(final_encoded_data, file)

In [130]:
# scaled numerical_features
numerical_features = final_encoded_data[['rating', 'cost', 'rating_count']]

In [131]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_features)

In [132]:
scaled_data

array([[ 0.14534067, -0.10995974, -0.27056619],
       [ 1.4764935 , -0.10995974, -0.11952945],
       [-0.52023575, -0.23548656,  0.2871079 ],
       ...,
       [ 0.14534067,  0.01556708, -0.27056619],
       [ 0.14534067, -0.04719633, -0.27056619],
       [ 0.14534067, -0.10995974, -0.27056619]], shape=(148442, 3))

In [133]:
# replace the noramalized numerical values to the original value
final_encoded_data[['rating', 'cost', 'rating_count']] = scaled_data

In [134]:
# Save the scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [135]:
final_encoded_data

Unnamed: 0,id,rating,cost,rating_count,city_Abids & Koti,city_Abohar,city_Adajan,city_Adilabad,city_Adityapur,city_Adoni,...,cuisine_2_Thai,cuisine_2_Thalis,cuisine_2_Tibetan,cuisine_2_Tribal,cuisine_2_Turkish,cuisine_2_Use Code JUMBO30 to avail,cuisine_2_Use code XPRESS121 to avail.,cuisine_2_Vietnamese,cuisine_2_Waffle,cuisine_2_indian
0,567335,0.145341,-0.109960,-0.270566,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,531342,1.476494,-0.109960,-0.119529,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,158203,-0.520236,-0.235487,0.287108,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,187912,-0.853024,-0.047196,-0.212475,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,543530,0.145341,-0.047196,-0.270566,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148437,553122,0.145341,-0.109960,-0.270566,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148438,562647,0.145341,0.015567,-0.270566,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148439,559435,0.145341,0.015567,-0.270566,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148440,418989,0.145341,-0.047196,-0.270566,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
final_encoded_data.to_csv('encoded_data.csv', index= False)

In [137]:
final_encoded_data = pd.read_csv('encoded_data.csv')

In [138]:
cleaned_data = pd.read_csv('cleaned_data.csv')

In [139]:
# # ensure the indiced match
list(cleaned_data.index ) == list(final_encoded_data.index)

True

In [140]:
print(final_encoded_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148442 entries, 0 to 148441
Columns: 1082 entries, id to cuisine_2_indian
dtypes: float64(1081), int64(1)
memory usage: 1.2 GB
None


In [141]:
final_encoded_data.dtypes

id                                          int64
rating                                    float64
cost                                      float64
rating_count                              float64
city_Abids & Koti                         float64
                                           ...   
cuisine_2_Use Code JUMBO30 to avail       float64
cuisine_2_Use code XPRESS121 to avail.    float64
cuisine_2_Vietnamese                      float64
cuisine_2_Waffle                          float64
cuisine_2_indian                          float64
Length: 1082, dtype: object