In [90]:
import pandas as pd
import numpy as np

In [91]:
data = pd.read_csv('zomato.csv', encoding='latin-1')

In [92]:
data.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

## Drop columns not necessary
##### Restaurant ID, Is delivering now

In [93]:
data.drop(columns='Restaurant ID',inplace=True)

In [94]:
data.shape

(9551, 20)

In [95]:
data['Country Code'].nunique()

15

In [96]:
data['Country Code'].value_counts()

1      8652
216     434
215      80
214      60
30       60
189      60
148      40
208      34
14       24
162      22
94       21
191      20
166      20
184      20
37        4
Name: Country Code, dtype: int64

In [97]:
data = data[data['Country Code']==1]

In [98]:
data.drop(columns=['Country Code', 'Currency'], inplace=True)

In [99]:
data['City'].nunique()

43

In [100]:
data['City'].value_counts()

New Delhi       5473
Gurgaon         1118
Noida           1080
Faridabad        251
Ghaziabad         25
Bhubaneshwar      21
Ahmedabad         21
Guwahati          21
Lucknow           21
Amritsar          21
Kochi             20
Kanpur            20
Mysore            20
Pune              20
Dehradun          20
Puducherry        20
Aurangabad        20
Chennai           20
Varanasi          20
Nashik            20
Bhopal            20
Coimbatore        20
Agra              20
Mangalore         20
Nagpur            20
Mumbai            20
Ludhiana          20
Patna             20
Vizag             20
Bangalore         20
Indore            20
Vadodara          20
Jaipur            20
Surat             20
Allahabad         20
Ranchi            20
Goa               20
Kolkata           20
Chandigarh        18
Hyderabad         18
Secunderabad       2
Panchkula          1
Mohali             1
Name: City, dtype: int64

In [102]:
data = data[data['City'].isin(['New Delhi', 'Gurgaon', 'Noida'])]

In [105]:
data = data.sample(frac=1)

In [108]:
data['City'].value_counts()

New Delhi    5473
Gurgaon      1118
Noida        1080
Name: City, dtype: int64

In [109]:
data['Restaurant Name'].nunique()

5834

In [110]:
data.drop(columns=['Restaurant Name', 'Address', 'Locality', 'Locality Verbose', 'Rating color', 'Rating text'],inplace=True)

In [111]:
data.columns

Index(['City', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two',
       'Has Table booking', 'Has Online delivery', 'Is delivering now',
       'Switch to order menu', 'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')

In [112]:
data.drop('Switch to order menu', axis=1, inplace=True)

In [113]:
data.isna().sum()

City                    0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Price range             0
Aggregate rating        0
Votes                   0
dtype: int64

In [114]:
data['Has Table booking'] = data['Has Table booking'].map({'Yes':1,'No':0})

In [115]:
data['Has Online delivery'] = data['Has Online delivery'].map({'Yes':1,'No':0})

In [116]:
data.corr()['Average Cost for two']

Longitude               0.071854
Latitude                0.070816
Average Cost for two    1.000000
Has Table booking       0.643845
Has Online delivery     0.072198
Price range             0.848425
Aggregate rating        0.329785
Votes                   0.297741
Name: Average Cost for two, dtype: float64

In [117]:
data.drop(['Longitude', 'Latitude', 'Price range', 'Has Online delivery'], axis=1, inplace=True)

In [118]:
data.columns

Index(['City', 'Cuisines', 'Average Cost for two', 'Has Table booking',
       'Is delivering now', 'Aggregate rating', 'Votes'],
      dtype='object')

In [119]:
index = data.groupby('Cuisines')['Average Cost for two'].mean().index
values = data.groupby('Cuisines')['Average Cost for two'].mean().values

In [120]:
my_map={}
for i in range(len(index)):
  my_map[index[i]]=values[i]

In [121]:
data['Cuisines'] = data['Cuisines'].map(my_map)

In [122]:
data = data.reset_index(drop=True)

In [123]:
data = data[['City', 'Cuisines', 'Has Table booking', 'Aggregate rating', 'Votes','Average Cost for two']]

In [124]:
data = pd.concat([data, pd.get_dummies(data['City'], drop_first=True)],axis=1).drop('City',axis=1)

In [125]:
y=data['Average Cost for two'].values
X=data.drop('Average Cost for two', axis=1).values

In [126]:
X.shape

(7671, 6)

In [127]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

In [128]:
X_train.shape

(6136, 6)

In [129]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()

In [130]:
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [131]:
y_pred = linear_reg.predict(X_test)

In [132]:
from sklearn.metrics import r2_score
r2_score(y_pred, y_test)

0.5751940879615052

In [133]:
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=200)

In [135]:
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [137]:
y_predict_rf = rf_reg.predict(X_test)

In [138]:
r2_score(y_test, y_predict_rf)

0.6384098881966278

In [139]:
from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor()

In [140]:
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [141]:
y_pred_dt = dt_reg.predict(X_test)

In [142]:
r2_score(y_test, y_pred_dt)

0.543376554437106

In [143]:
from sklearn.svm import SVR
svr_reg = SVR()

In [145]:
svr_reg.fit(X_train,y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [146]:
r2_score(y_test, svr_reg.predict(X_test))

0.3296766472970597