In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import regex as re

In [2]:
train_data=pd.read_excel('Data_Train.xlsx')
test_data=pd.read_excel('Data_Test.xlsx')

In [3]:
train_data.shape , test_data.shape

((12690, 9), (4231, 8))

In [4]:
train_data.duplicated().sum() , test_data.duplicated().sum()

(25, 1)

In [5]:
train_data.head()

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12690 entries, 0 to 12689
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TITLE          12690 non-null  object
 1   RESTAURANT_ID  12690 non-null  int64 
 2   CUISINES       12690 non-null  object
 3   TIME           12690 non-null  object
 4   CITY           12578 non-null  object
 5   LOCALITY       12592 non-null  object
 6   RATING         12688 non-null  object
 7   VOTES          11486 non-null  object
 8   COST           12690 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 892.4+ KB


In [7]:
for i in train_data.columns:
    print('The unique values in',i,train_data[i].nunique())

The unique values in TITLE 113
The unique values in RESTAURANT_ID 11892
The unique values in CUISINES 4155
The unique values in TIME 2689
The unique values in CITY 359
The unique values in LOCALITY 1416
The unique values in RATING 32
The unique values in VOTES 1847
The unique values in COST 86


# Data Preprocessing

In [8]:
df=train_data.append(test_data,ignore_index=True)

In [9]:
df=df[['TITLE','CUISINES','TIME','CITY','LOCALITY','RATING','VOTES','COST']]

In [10]:
def extract_closed(time):
    a=re.findall('Closed \(.*?\)',time)
    if a !=[]:
        return a[0]
    else:
        return 'NA'
df['CLOSED']=df['TIME'].apply(extract_closed)

In [11]:
df['TIME']=df['TIME'].str.replace(r'Closed \(.*?\)','')

In [12]:
df['RATING']=df['RATING'].str.replace('NEW','1')
df['RATING']=df['RATING'].str.replace('-','1').astype(float)

In [13]:
df['VOTES']=df['VOTES'].str.replace(' votes','').astype(float)

In [14]:
df['CITY'].fillna('Missing',inplace=True)
df['LOCALITY'].fillna('Missing',inplace=True)
df['RATING'].fillna(3.8,inplace=True)
df['VOTES'].fillna(0.0,inplace=True)

In [15]:
df['COST']=df['COST'].astype(float)

In [16]:
df.head()

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED
0,CASUAL DINING,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49.0,1200.0,
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30.0,1500.0,
2,CASUAL DINING,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221.0,800.0,
3,QUICK BITES,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24.0,800.0,
4,DESSERT PARLOR,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165.0,300.0,


In [17]:
calc_mean=df.groupby(['CITY'],axis=0).agg({'RATING':'mean'}).reset_index()
calc_mean.columns=['CITY','CITY_MEAN_RATING']
df=df.merge(calc_mean,on=['CITY'],how='left')

In [18]:
df.head(2)

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED,CITY_MEAN_RATING
0,CASUAL DINING,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49.0,1200.0,,3.376271
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30.0,1500.0,,3.584588


In [19]:
calc_mean=df.groupby(['LOCALITY'],axis=0).agg({'RATING':'mean'}).reset_index()
calc_mean.columns=['LOCALITY','LOCALITY_MEAN_RATING']
df=df.merge(calc_mean,on=['LOCALITY'],how='left')

In [20]:
df.head(3)

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED,CITY_MEAN_RATING,LOCALITY_MEAN_RATING
0,CASUAL DINING,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49.0,1200.0,,3.376271,3.388889
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30.0,1500.0,,3.584588,3.472222
2,CASUAL DINING,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221.0,800.0,,3.584588,3.55


In [21]:
df.drop(['TITLE','CUISINES','CITY','LOCALITY','TIME'],axis=1,inplace=True)

In [22]:
df=pd.get_dummies(df,columns=['CLOSED'],drop_first=True)

In [23]:
df.shape

(16921, 28)

In [24]:
train_df=df[df['COST'].isnull()!=True]
test_df=df[df['COST'].isnull()==True]
test_df.drop('COST',axis=1,inplace=True)

In [25]:
train_df.shape,test_df.shape

((12690, 28), (4231, 27))

In [26]:
train_df['COST']=np.log1p(train_df['COST'])

# Train Test Split

In [27]:
X=train_df.drop(labels=['COST'],axis=1)
Y=train_df['COST'].values

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.25,random_state=1)

In [28]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((9517, 27), (3173, 27), (9517,), (3173,))

In [29]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.linear_model import Lasso,Ridge
from sklearn.svm import SVR

In [30]:
LR=LinearRegression()
LR.fit(x_train,y_train)
pred_LR=LR.predict(x_test)
pred_train=LR.predict(x_train)
print('R2 Score: ',r2_score(y_test,pred_LR))
print('R2 Score on training data: ',r2_score(y_train,pred_train)*100)
print('Mean Absolute Error: ',mean_absolute_error(y_test,pred_LR))
print('Mean Squared Error: ',mean_squared_error(y_test,pred_LR))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,pred_LR)))

R2 Score:  0.18921733970933308
R2 Score on training data:  16.191552458296222
Mean Absolute Error:  0.49655364864708756
Mean Squared Error:  0.4145109218046264
Root Mean Squared Error:  0.6438252261325479


In [31]:
ls=Lasso(alpha=0.01)
ls.fit(x_train,y_train)
print(ls.score(x_train,y_train))
predlasso=ls.predict(x_test)
print(mean_squared_error(y_test,predlasso))
print('Mean squared error:',np.sqrt(mean_squared_error(y_test,predlasso)))

0.1518986454448633
0.4204858614025913
Mean squared error: 0.6484488117057439


In [32]:
rd=Ridge(alpha=0.01)
rd.fit(x_train,y_train)
print(rd.score(x_train,y_train))
predridge=rd.predict(x_test)
print(mean_squared_error(y_test,predridge))
print('Mean squared error:',np.sqrt(mean_squared_error(y_test,predridge)))

0.16191322741598047
0.4144723462426386
Mean squared error: 0.6437952673347628


In [33]:
gr=GradientBoostingRegressor()
gr.fit(x_train,y_train)
pred_gr=gr.predict(x_test)
pred_train=gr.predict(x_train)
print('R2 Score: ',r2_score(y_test,pred_gr))
print('R2 Score on training data: ',r2_score(y_train,pred_train)*100)
print('Mean Absolute Error: ',mean_absolute_error(y_test,pred_gr))
print('Mean Squared Error: ',mean_squared_error(y_test,pred_gr))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,pred_gr)))

R2 Score:  0.29460818606464845
R2 Score on training data:  31.93545642463913
Mean Absolute Error:  0.4571624159702606
Mean Squared Error:  0.3606300743074066
Root Mean Squared Error:  0.6005248323819812


In [34]:
rf=RandomForestRegressor()
rf.fit(x_train,y_train)
pred_rf=rf.predict(x_test)
pred_train=rf.predict(x_train)
print('R2 Score: ',r2_score(y_test,pred_rf))
print('R2 Score on training data: ',r2_score(y_train,pred_train)*100)
print('Mean Absolute Error: ',mean_absolute_error(y_test,pred_rf))
print('Mean Squared Error: ',mean_squared_error(y_test,pred_rf))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,pred_rf)))

R2 Score:  0.2563429828988705
R2 Score on training data:  86.37461263314891
Mean Absolute Error:  0.4704136387941097
Mean Squared Error:  0.38019307856751455
Root Mean Squared Error:  0.6165979878068972


In [35]:
dt=DecisionTreeRegressor()
dt.fit(x_train,y_train)
pred_dt=dt.predict(x_test)
pred_train=dt.predict(x_train)
print('R2 Score: ',r2_score(y_test,pred_dt))
print('R2 Score on training data: ',r2_score(y_train,pred_train)*100)
print('Mean Absolute Error: ',mean_absolute_error(y_test,pred_dt))
print('Mean Squared Error: ',mean_squared_error(y_test,pred_dt))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,pred_dt)))

R2 Score:  -0.323851062273401
R2 Score on training data:  96.4515680606931
Mean Absolute Error:  0.6230721255982962
Mean Squared Error:  0.6768160581508407
Root Mean Squared Error:  0.8226883116653844


In [36]:
kn=KNN()
kn.fit(x_train,y_train)
pred_kn=kn.predict(x_test)
pred_train=kn.predict(x_train)
print('R2 Score: ',r2_score(y_test,pred_kn))
print('R2 Score on training data: ',r2_score(y_train,pred_train)*100)
print('Mean Absolute Error: ',mean_absolute_error(y_test,pred_kn))
print('Mean Squared Error: ',mean_squared_error(y_test,pred_kn))
print('Root Mean Squared Error: ',np.sqrt(mean_squared_error(y_test,pred_kn)))

R2 Score:  0.09033406296178226
R2 Score on training data:  38.51982097219505
Mean Absolute Error:  0.5252914176833721
Mean Squared Error:  0.465064788091055
Root Mean Squared Error:  0.6819565881279064
