In [28]:
#Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns

##Display all the columns of the dataframe
pd.pandas.set_option("display.max_columns",None)

In [30]:
zomato = pd.read_csv('zomato.csv')

##print shape of the dataset with rows and columns
print(zomato.shape)

(51717, 17)


In [31]:
zomato_real=zomato.drop(['url',"phone","dish_liked",'address','name',"reviews_list"], axis = 1)

In [32]:
#Remove the NaN values from the dataset
zomato_real.isnull().sum()
zomato_real.dropna(how='any',inplace=True)
zomato_real.info() #.info() function is used to get a concise summary of the dataf

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43533 entries, 0 to 51716
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   online_order                 43533 non-null  object
 1   book_table                   43533 non-null  object
 2   rate                         43533 non-null  object
 3   votes                        43533 non-null  int64 
 4   location                     43533 non-null  object
 5   rest_type                    43533 non-null  object
 6   cuisines                     43533 non-null  object
 7   approx_cost(for two people)  43533 non-null  object
 8   menu_item                    43533 non-null  object
 9   listed_in(type)              43533 non-null  object
 10  listed_in(city)              43533 non-null  object
dtypes: int64(1), object(10)
memory usage: 4.0+ MB


In [33]:
zomato_real.duplicated().sum()
zomato_real.drop_duplicates(inplace=True)
zomato_real.head()
zomato_real.shape

(43445, 11)

In [34]:
zomato_real.rename(columns={"approx_cost(for two people)":'cost','listed_in(type)':'type', 'listed_in(city)':'city'}, inplace=True)

In [35]:
zomato_real['cost'] = zomato_real['cost'].str.replace(',','')
zomato_real['cost']=zomato_real['cost'].astype(float)

In [36]:
zomato_real = zomato_real.loc[zomato_real.rate !='NEW']

In [37]:
zomato_real = zomato_real.loc[zomato_real.rate !='-'].reset_index(drop=True)

In [38]:
rate=zomato_real.rate.str.split("/",n = 1, expand = True)
zomato_real['rate']=rate[0]

In [39]:
#Encode the input Variables
def Encode(zomato_real):
    for column in zomato_real.columns[~zomato_real.columns.isin(['rate', 'cost', 'votes'])]:
        zomato_real[column] = zomato_real[column].factorize()[0]
    return zomato_real

zomato_en = Encode(zomato_real.copy())
zomato_en.head() # looking at the dataset after transformation

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item,type,city
0,0,0,4.1,775,0,0,0,800.0,0,0,0
1,0,1,4.1,787,0,0,1,800.0,0,0,0
2,0,1,3.8,918,0,1,2,800.0,0,0,0
3,1,1,3.7,88,0,2,3,300.0,0,0,0
4,1,1,3.8,166,1,0,4,600.0,0,0,0


In [40]:
zomato_en["cost"]=np.log(zomato_en["cost"])

In [41]:
zomato_en

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item,type,city
0,0,0,4.1,775,0,0,0,6.684612,0,0,0
1,0,1,4.1,787,0,0,1,6.684612,0,0,0
2,0,1,3.8,918,0,1,2,6.684612,0,0,0
3,1,1,3.7,88,0,2,3,5.703782,0,0,0
4,1,1,3.8,166,1,0,4,6.396930,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
41197,1,1,3.7,34,25,28,204,6.684612,0,6,29
41198,1,1,2.5,81,25,28,761,6.684612,0,6,29
41199,1,1,3.6,27,25,17,240,7.313220,0,6,29
41200,1,0,4.3,236,56,17,237,7.824046,0,6,29


In [42]:
feature_scale=[feature for feature in zomato_en.columns if feature not in ['rate']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(zomato_en[feature_scale])

MinMaxScaler()

In [44]:
zomato_en=pd.concat([zomato_en['rate'].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(zomato_en[feature_scale]), columns=feature_scale)],
                    axis=1)

In [43]:
scaler.transform(zomato_en[feature_scale])

array([[0.00000000e+00, 0.00000000e+00, 4.60432510e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 4.67561787e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 5.45389734e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.60408745e-03, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 1.40209125e-02, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 7.72338403e-04, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00]])

In [45]:
zomato_en.head()

Unnamed: 0,rate,online_order,book_table,votes,location,rest_type,cuisines,cost,menu_item,type,city
0,4.1,0.0,0.0,0.046043,0.0,0.0,0.0,0.597875,0.0,0.0,0.0
1,4.1,0.0,1.0,0.046756,0.0,0.0,0.000423,0.597875,0.0,0.0,0.0
2,3.8,0.0,1.0,0.054539,0.0,0.011628,0.000845,0.597875,0.0,0.0,0.0
3,3.7,1.0,1.0,0.005228,0.0,0.023256,0.001268,0.402125,0.0,0.0,0.0
4,3.8,1.0,1.0,0.009862,0.010989,0.0,0.001691,0.54046,0.0,0.0,0.0


In [47]:
zomato_en['rate']

0         4.1
1         4.1
2         3.8
3         3.7
4         3.8
         ... 
41197    3.7 
41198    2.5 
41199    3.6 
41200    4.3 
41201    3.4 
Name: rate, Length: 41202, dtype: object

In [48]:
#Defining the independent variables and dependent variables
x = zomato_en.iloc[:,[1,3,4,5,6,7,8,9,10]]
y = zomato_en['rate']
# Getting Test and Training Set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.1,random_state=353)
x_train.head()


Unnamed: 0,online_order,votes,location,rest_type,cuisines,cost,menu_item,type,city
39286,0.0,0.089057,0.252747,0.325581,0.975486,0.69477,0.0,0.333333,0.965517
11445,1.0,0.048063,0.164835,0.313953,0.646661,0.678796,0.0,0.833333,0.275862
8997,1.0,0.000297,0.087912,0.023256,0.102705,0.321204,0.0,0.666667,0.206897
4712,0.0,0.003624,0.186813,0.174419,0.075232,0.26379,0.0,0.5,0.137931
4419,0.0,0.024121,0.340659,0.0,0.407861,0.642409,0.100946,0.333333,0.137931


In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

In [50]:
#Prepare a Linear Regression Model
reg=LinearRegression()
reg.fit(x_train,y_train)
y_pred=reg.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.24962145870894636

In [51]:
#Prepairng a Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.1,random_state=105)
DTree=DecisionTreeRegressor(min_samples_leaf=.0001)
DTree.fit(x_train,y_train)
y_predict=DTree.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_predict)

0.8248519392834525

In [52]:
#Preparing Random Forest REgression
from sklearn.ensemble import RandomForestRegressor
RForest=RandomForestRegressor(n_estimators=500,random_state=329,min_samples_leaf=.0001)
RForest.fit(x_train,y_train)
y_predict=RForest.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_predict)

0.8640279375718197

In [53]:
#Preparing Extra Tree Regression
from sklearn.ensemble import  ExtraTreesRegressor
ETree=ExtraTreesRegressor(n_estimators = 100)
ETree.fit(x_train,y_train)
y_predict=ETree.predict(x_test)


from sklearn.metrics import r2_score
r2_score(y_test,y_predict)

0.9236257110581921