In [1]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt
import joblib
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from scipy.stats import zscore

import warnings
warnings.filterwarnings('ignore')

In [3]:
res=pd.read_excel('restaurantprice.xlsx')

In [4]:
res

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300
...,...,...,...,...,...,...,...,...,...
12685,QUICK BITES,13228,"North Indian, Burger, Kebab","12noon – 12midnight (Mon, Tue, Wed, Thu, Sun)...",Hyderabad,Gachibowli,3.8,546 votes,500
12686,"CASUAL DINING,BAR",9686,"Goan, Continental","12noon – 1am (Mon-Fri),11am – 5pm, 7pm – 1am...",Mumbai,Bandra Kurla Complex,4.3,1214 votes,1800
12687,LOUNGE,11133,"Finger Food, Continental, Asian, Chinese",12noon – 12:30AM (Mon-Sun),Navi Mumbai,Vashi,4.0,608 votes,1300
12688,CASUAL DINING,6134,"North Indian, South Indian, Chinese, Street Food",6am – 10:45pm (Mon-Sun),Chennai,Maduravoyal,3.5,32 votes,400


In [6]:
#Extracting the different cuisines

res['Cus1']=res['CUISINES'].str.split(',').str[0]
res['Cus2']=res['CUISINES'].str.split(',').str[1]
res['Cus3']=res['CUISINES'].str.split(',').str[2]
res['Cus4']=res['CUISINES'].str.split(',').str[3]





In [9]:
#Filling null values in columns

res['Cus2'].fillna('not given',inplace=True)
res['Cus3'].fillna('not given',inplace=True)
res['Cus4'].fillna('not given',inplace=True)




In [10]:
res.head()

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,Cus1,Cus2,Cus3,Cus4
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200,Malwani,Goan,North Indian,not given
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500,Asian,Modern Indian,Japanese,not given
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800,North Indian,Chinese,Biryani,Hyderabadi
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800,Tibetan,Chinese,not given,not given
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300,Desserts,not given,not given,not given


In [11]:
resf=res.copy()

In [12]:
res=res.drop(['RESTAURANT_ID','TIME'],axis=1)

In [17]:
#Encoding the features using label encoder
from sklearn.preprocessing import LabelEncoder

le_TITLE=LabelEncoder()
res['Titlecode']=le_TITLE.fit_transform(res['TITLE'])


le_Cus1=LabelEncoder()
res['Cuscode1']=le_Cus1.fit_transform(res['Cus1'])

le_Cus2=LabelEncoder()
res['Cuscode2']=le_Cus2.fit_transform(res['Cus2'])

le_Cus3=LabelEncoder()
res['Cuscode3']=le_Cus3.fit_transform(res['Cus3'])

le_Cus4=LabelEncoder()
res['Cuscode4']=le_Cus4.fit_transform(res['Cus4'])

In [18]:
res.head()

Unnamed: 0,TITLE,CUISINES,CITY,LOCALITY,RATING,VOTES,COST,Cus1,Cus2,Cus3,Cus4,Titlecode,Cuscode1,Cuscode2,Cuscode3,Cuscode4
0,CASUAL DINING,"Malwani, Goan, North Indian",Thane,Dombivali East,3.6,49 votes,1200,Malwani,Goan,North Indian,not given,31,59,32,66,85
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",Chennai,Ramapuram,4.2,30 votes,1500,Asian,Modern Indian,Japanese,not given,33,6,60,41,85
2,CASUAL DINING,"North Indian, Chinese, Biryani, Hyderabadi",Chennai,Saligramam,3.8,221 votes,800,North Indian,Chinese,Biryani,Hyderabadi,31,72,23,12,32
3,QUICK BITES,"Tibetan, Chinese",Mumbai,Bandra West,4.1,24 votes,800,Tibetan,Chinese,not given,not given,95,100,23,93,85
4,DESSERT PARLOR,Desserts,Mumbai,Lower Parel,3.8,165 votes,300,Desserts,not given,not given,not given,50,28,98,93,85


In [19]:
res=res.drop(['TITLE','CUISINES','CITY','LOCALITY','VOTES'],axis=1)

In [20]:
res.head()

Unnamed: 0,RATING,COST,Cus1,Cus2,Cus3,Cus4,Titlecode,Cuscode1,Cuscode2,Cuscode3,Cuscode4
0,3.6,1200,Malwani,Goan,North Indian,not given,31,59,32,66,85
1,4.2,1500,Asian,Modern Indian,Japanese,not given,33,6,60,41,85
2,3.8,800,North Indian,Chinese,Biryani,Hyderabadi,31,72,23,12,32
3,4.1,800,Tibetan,Chinese,not given,not given,95,100,23,93,85
4,3.8,300,Desserts,not given,not given,not given,50,28,98,93,85


In [21]:
res=res.drop(['Cus1','Cus2','Cus3','Cus4'],axis=1)

In [22]:
res.head()

Unnamed: 0,RATING,COST,Titlecode,Cuscode1,Cuscode2,Cuscode3,Cuscode4
0,3.6,1200,31,59,32,66,85
1,4.2,1500,33,6,60,41,85
2,3.8,800,31,72,23,12,32
3,4.1,800,95,100,23,93,85
4,3.8,300,50,28,98,93,85


In [30]:
x=res.drop(['COST','RATING'],axis=1)

In [24]:
y=res['COST']

In [26]:
xcp=x.copy()

In [27]:
x=x.replace('-',np.nan)

In [31]:
x.isnull().sum()

Titlecode    0
Cuscode1     0
Cuscode2     0
Cuscode3     0
Cuscode4     0
dtype: int64

In [32]:
#Bringing the features to common scale

sc=StandardScaler()
x=pd.DataFrame(sc.fit_transform(x))

In [33]:
def r2score(model,x,y):
    max_acc=0
    for rst in range(40,100):
        x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=rst,test_size=.22)
        model.fit(x_train,y_train)
        predy=model.predict(x_test)
        acc=r2_score(y_test,predy)
        print('Accuracy is',acc,'for the random state',rst)
        if acc>max_acc:
            max_acc=acc
            f_rst=rst
    print('Maximum r2score is ',max_acc,'and acquired at',f_rst)
    return f_rst

In [34]:
dtr=DecisionTreeRegressor()
params={'criterion': ['mse'],
 'max_depth': [None],
 'max_features': [None],
 'max_leaf_nodes': [4,8,16],
 'min_impurity_decrease': [0.0],
 'min_impurity_split': [None],
 'min_samples_leaf': [1,2,4],
 'min_samples_split': [2,4],
 'min_weight_fraction_leaf': [0.0],
 'presort': [False],
 'random_state': [None],
 'splitter': ['best']}

In [36]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.22,random_state=45)
from sklearn.model_selection import GridSearchCV
gcv=GridSearchCV(estimator=dtr,param_grid=params)
gcv.fit(x_train,y_train)
print(gcv.best_params_)
print(gcv.best_score_)

{'criterion': 'mse', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 16, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
0.6419182633687872


In [37]:
dtr=DecisionTreeRegressor(criterion='mse',
 max_leaf_nodes=16,
 min_impurity_decrease=0.0,
 min_samples_leaf= 4,
 min_samples_split=2,
 splitter='best')
dtr.fit(x_train,y_train)
dtrpr=dtr.predict(x_test)
print('Mean Absolute Error:',mean_absolute_error(y_test,dtrpr))
print('Mean Squared Error:',mean_squared_error(y_test,dtrpr))
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test,dtrpr)))
print('R2 sCcore is:',r2_score(y_test,dtrpr))

Mean Absolute Error: 225.55998908323193
Mean Squared Error: 142079.06114218806
Root Mean Squared Error: 376.9337622742066
R2 sCcore is: 0.6337702484406001


In [38]:
Lr=LinearRegression()
r2score(Lr,x,y)

Accuracy is 0.09443690400862892 for the random state 40
Accuracy is 0.09230797025576498 for the random state 41
Accuracy is 0.10376670254790299 for the random state 42
Accuracy is 0.09373149507146883 for the random state 43
Accuracy is 0.08865199746060015 for the random state 44
Accuracy is 0.10878073834626023 for the random state 45
Accuracy is 0.09348884209390285 for the random state 46
Accuracy is 0.08362020884124077 for the random state 47
Accuracy is 0.0917935399229205 for the random state 48
Accuracy is 0.0880463131335395 for the random state 49
Accuracy is 0.08961697843584404 for the random state 50
Accuracy is 0.11139120175890593 for the random state 51
Accuracy is 0.07185308623813702 for the random state 52
Accuracy is 0.09823754581945643 for the random state 53
Accuracy is 0.09879768963962232 for the random state 54
Accuracy is 0.10324022806240996 for the random state 55
Accuracy is 0.09006623435823724 for the random state 56
Accuracy is 0.10200767968754565 for the random sta

73

In [39]:
r2score(dtr,x,y)

Accuracy is 0.6779864281256079 for the random state 40
Accuracy is 0.6491759512039992 for the random state 41
Accuracy is 0.6648145447607996 for the random state 42
Accuracy is 0.6909317595910143 for the random state 43
Accuracy is 0.6628765342577039 for the random state 44
Accuracy is 0.6337702484406001 for the random state 45
Accuracy is 0.6627658343090526 for the random state 46
Accuracy is 0.6794489788605397 for the random state 47
Accuracy is 0.6735535255028282 for the random state 48
Accuracy is 0.6006896882225167 for the random state 49
Accuracy is 0.6711922940301969 for the random state 50
Accuracy is 0.6593068094032248 for the random state 51
Accuracy is 0.6001651348076387 for the random state 52
Accuracy is 0.6598646561796234 for the random state 53
Accuracy is 0.6501609859435868 for the random state 54
Accuracy is 0.683400490241556 for the random state 55
Accuracy is 0.6588601839414253 for the random state 56
Accuracy is 0.6649110206077835 for the random state 57
Accuracy is

77

In [40]:
rfr=RandomForestRegressor()
r2score(rfr,x,y)

Accuracy is 0.6861179489117273 for the random state 40
Accuracy is 0.669610049361189 for the random state 41
Accuracy is 0.6808731690620218 for the random state 42
Accuracy is 0.7039615593965567 for the random state 43
Accuracy is 0.6904953483492252 for the random state 44
Accuracy is 0.672206370056123 for the random state 45
Accuracy is 0.6681741050746652 for the random state 46
Accuracy is 0.6553867745085158 for the random state 47
Accuracy is 0.6712113702853416 for the random state 48
Accuracy is 0.6186763737201579 for the random state 49
Accuracy is 0.6911009331645821 for the random state 50
Accuracy is 0.6705639501370841 for the random state 51
Accuracy is 0.618032142587013 for the random state 52
Accuracy is 0.6673082254465987 for the random state 53
Accuracy is 0.6258147855659248 for the random state 54
Accuracy is 0.6659134365110837 for the random state 55
Accuracy is 0.687688833730619 for the random state 56
Accuracy is 0.6709175689822633 for the random state 57
Accuracy is 0.

94

In [41]:
knr=KNeighborsRegressor()
parameters={'n_neighbors':[10,100,300,500]}
gcv=GridSearchCV(estimator=knr,param_grid=parameters)
gcv.fit(x_train,y_train)
print(gcv.best_params_)
print(gcv.best_score_)

{'n_neighbors': 10}
0.3667387786699302


In [42]:
gbr=GradientBoostingRegressor()
r2score(gbr,x,y)

Accuracy is 0.7073731145140958 for the random state 40
Accuracy is 0.6621408727851306 for the random state 41
Accuracy is 0.7018850257000391 for the random state 42
Accuracy is 0.6991099368595077 for the random state 43
Accuracy is 0.6953224228828729 for the random state 44
Accuracy is 0.6757795974393019 for the random state 45
Accuracy is 0.6826459327073944 for the random state 46
Accuracy is 0.6901384622977851 for the random state 47
Accuracy is 0.6918247518171343 for the random state 48
Accuracy is 0.6301950025918367 for the random state 49
Accuracy is 0.7146112782443028 for the random state 50
Accuracy is 0.6932088241197356 for the random state 51
Accuracy is 0.6176476496696893 for the random state 52
Accuracy is 0.6873809702770809 for the random state 53
Accuracy is 0.6881536864963056 for the random state 54
Accuracy is 0.7151567435664984 for the random state 55
Accuracy is 0.6674318057419575 for the random state 56
Accuracy is 0.6772912986605334 for the random state 57
Accuracy i

79

In [43]:
ls=Lasso()
r2score(ls,x,y)

Accuracy is 0.09428254810135617 for the random state 40
Accuracy is 0.09224779555160778 for the random state 41
Accuracy is 0.1035814588677072 for the random state 42
Accuracy is 0.09370671732885205 for the random state 43
Accuracy is 0.08872533359105994 for the random state 44
Accuracy is 0.10849404400968476 for the random state 45
Accuracy is 0.09340312389618743 for the random state 46
Accuracy is 0.08369258605391006 for the random state 47
Accuracy is 0.09183348866120766 for the random state 48
Accuracy is 0.08817393443473331 for the random state 49
Accuracy is 0.0896236501430554 for the random state 50
Accuracy is 0.11150060278928231 for the random state 51
Accuracy is 0.07200389490665593 for the random state 52
Accuracy is 0.09827976775320846 for the random state 53
Accuracy is 0.09898399312696271 for the random state 54
Accuracy is 0.10310212049858625 for the random state 55
Accuracy is 0.09014186631678878 for the random state 56
Accuracy is 0.10196234204785859 for the random sta

73

In [46]:
#As Random forest is performing well than others ,fitting it as final model

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.22,random_state=94)
rf=RandomForestRegressor()
rf.fit(x_train,y_train)
rfp=rf.predict(x_test)
print('Mean Absolute Error:',mean_absolute_error(y_test,rfp))
print('Mean Squared Error:',mean_squared_error(y_test,rfp))
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test,rfp)))
print('R2 score is:',r2_score(y_test,rfp))

Mean Absolute Error: 188.25563605274363
Mean Squared Error: 97702.66488229511
Root Mean Squared Error: 312.5742549895866
R2 score is: 0.7236657956102406


In [48]:
#Cross validating the performance of the rfregressor..

cvs=cross_val_score(rf,x,y,cv=5,scoring='r2')
print('mean accuracy of random forest regression is ',cvs.mean())
print('Standard deviation of the model is ',cvs.std())
print('Rmse of random forest model is',np.sqrt(mean_squared_error(y_test,rfp)))

mean accuracy of random forest regression is  0.662321734110743
Standard deviation of the model is  0.028501072271629742
Rmse of random forest model is 312.5742549895866


As the random forest regressor is performing well than other models,considering this model for predictions

In [50]:
#Saving predictions and the model
predct=pd.DataFrame(rfp,columns=['COST'])
predct=predct.to_csv('foodcostpredictions.csv',index=False)

In [51]:
#Saving the model
import joblib
joblib.dump(rf,'food_cost_pred.pkl')

['food_cost_pred.pkl']