In [222]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [223]:
dataset = pd.read_excel('Data_Train.xlsx')
test_dataset = pd.read_excel('Data_Test.xlsx')

In [224]:
test_dataset.shape

(2774, 8)

In [225]:
dataset.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4,30 minutes
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4,30 minutes
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30,65 minutes
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95,30 minutes
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235,65 minutes


In [226]:
test_dataset.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,ID_2842,"Mico Layout, Stage 2, BTM Layout,Bangalore","North Indian, Chinese, Assamese",₹350,₹50,4.2,361,225
1,ID_730,"Mico Layout, Stage 2, BTM Layout,Bangalore","Biryani, Kebab",₹100,₹50,NEW,-,-
2,ID_4620,"Sector 1, Noida",Fast Food,₹100,₹50,3.6,36,16
3,ID_5470,"Babarpur, New Delhi, Delhi","Mithai, North Indian, Chinese, Fast Food, Sout...",₹200,₹50,3.6,66,33
4,ID_3249,"Sector 1, Noida","Chinese, Fast Food",₹150,₹50,2.9,38,14


In [227]:
dataset.describe()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
count,11094,11094,11094,11094,11094,11094,11094,11094,11094
unique,7480,35,2179,26,18,33,1103,761,7
top,ID_7184,"Mico Layout, Stage 2, BTM Layout,Bangalore",North Indian,₹200,₹50,-,-,-,30 minutes
freq,22,947,850,3241,10118,1191,2074,2312,7406


In [228]:
test_dataset.describe()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
count,2774,2774,2774,2774,2774,2774,2774,2774
unique,2401,35,881,19,9,30,580,392
top,ID_1209,"D-Block, Sector 63, Noida",North Indian,₹200,₹50,-,-,-
freq,8,221,226,820,2556,305,542,593


# Applying NLP on 'Location' and 'Cuisines'

In [229]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [230]:
def nlp(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]',' ',text)
    text = nltk.word_tokenize(text)
    text = [WordNetLemmatizer().lemmatize(j) for j in text if j not in set(stopwords.words('english'))]
    text = ' '.join(text)
    return text

In [231]:
dataset['Location'] = dataset['Location'].apply(nlp)
dataset['Cuisines'] = dataset['Cuisines'].apply(nlp)

test_dataset['Location'] = test_dataset['Location'].apply(nlp)
test_dataset['Cuisines'] = test_dataset['Cuisines'].apply(nlp)

In [232]:
dataset.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time
0,ID_6321,fti college law college road pune,fast food roll burger salad wrap,₹200,₹50,3.5,12,4,30 minutes
1,ID_2882,sector marathalli,ice cream dessert,₹100,₹50,3.5,11,4,30 minutes
2,ID_1595,mumbai central,italian street food fast food,₹150,₹50,3.6,99,30,65 minutes
3,ID_5929,sector noida,mughlai north indian chinese,₹250,₹99,3.7,176,95,30 minutes
4,ID_6123,rmz centennial gate whitefield,cafe beverage,₹200,₹99,3.2,521,235,65 minutes


In [233]:
tv = TfidfVectorizer(ngram_range=(1,1),lowercase=False)#(max_features=2500)
#dataset['Location'] = pd.DataFrame(tv.fit_transform(dataset['Location']).toarray(), columns = tv.get_feature_names())
loc_tv_tr = tv.fit_transform(dataset['Location'])
loc_tv_tr = pd.DataFrame(data = loc_tv_tr.toarray(), columns=tv.get_feature_names())
dataset = pd.concat([dataset,loc_tv_tr],axis=1)
dataset.drop('Location', axis=1, inplace=True)

cui_tv_tr = tv.fit_transform(dataset['Cuisines'])
cui_tv_tr = pd.DataFrame(data = cui_tv_tr.toarray(), columns=tv.get_feature_names())
dataset = pd.concat([dataset,cui_tv_tr], axis=1)
dataset.drop('Cuisines',axis=1, inplace=True)

loc_tv_ts = tv.fit_transform(test_dataset['Location'])
loc_tv_ts = pd.DataFrame(data = loc_tv_ts.toarray(), columns=tv.get_feature_names())
test_dataset = pd.concat([test_dataset,loc_tv_ts],axis=1)
test_dataset.drop('Location', axis=1, inplace=True)

cui_tv_ts = tv.fit_transform(test_dataset['Cuisines'])
cui_tv_ts = pd.DataFrame(data = cui_tv_ts.toarray(), columns=tv.get_feature_names())
test_dataset = pd.concat([test_dataset,cui_tv_ts], axis=1)
test_dataset.drop('Cuisines',axis=1, inplace=True)

In [234]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,₹200,₹50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,₹100,₹50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,₹150,₹50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,₹250,₹99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,₹200,₹99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [235]:
test_dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,administration,area,babarpur,bangalore,...,steak,street,sushi,tea,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_2842,₹350,₹50,4.2,361,225,0.0,0.0,0.0,0.355855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ID_730,₹100,₹50,NEW,-,-,0.0,0.0,0.0,0.355855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_4620,₹100,₹50,3.6,36,16,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5470,₹200,₹50,3.6,66,33,0.0,0.0,0.557432,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_3249,₹150,₹50,2.9,38,14,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 175 entries, Restaurant to yogurt
dtypes: float64(169), object(6)
memory usage: 3.7+ MB


In [237]:
dataset.shape

(11094, 192)

In [238]:
test_dataset.shape

(2774, 175)

In [239]:
dataset['Average_Cost'].value_counts()

₹200      3241
₹100      2557
₹150      2462
₹250       881
₹300       537
₹350       283
₹400       282
₹50        265
₹600       154
₹500       101
₹450        63
₹550        60
₹650        55
₹800        44
₹750        38
₹900        15
₹700        15
₹850        12
₹1,000      12
₹1,200       8
₹950         4
₹1,400       1
for          1
₹2,050       1
₹1,100       1
₹1,150       1
Name: Average_Cost, dtype: int64

In [240]:
#index=dataset[(dataset['Average_Cost']=='for') | (dataset['Average_Cost']=='₹1,000')| (dataset['Average_Cost']=='₹1,400')| (dataset['Average_Cost']=='₹1,150')| (dataset['Average_Cost']=='₹2,050')| (dataset['Average_Cost']=='₹1,100')| (dataset['Average_Cost']=='₹950')| (dataset['Average_Cost']=='₹1,200')].index
#dataset.drop(index, inplace = True)
#dataset=dataset.set_index('Average_Cost').drop('for',axis=0, index=None)
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹1,000','₹1000')
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹1,200','₹1200')
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹1,100','₹1100')
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹1,150','₹1150')
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹2,050','₹2050')
dataset['Average_Cost'] = dataset['Average_Cost'].replace('₹1,400','₹1400')

In [241]:
test_dataset['Average_Cost'].value_counts()

₹200      820
₹100      664
₹150      589
₹250      223
₹300      173
₹50        72
₹350       71
₹400       64
₹600       30
₹500       15
₹550       13
₹450        9
₹650        8
₹800        6
₹700        4
₹850        4
₹1,000      4
₹750        3
₹1,200      2
Name: Average_Cost, dtype: int64

In [242]:
#test_index = test_dataset[(test_dataset['Average_Cost']=='₹1,000') | (test_dataset['Average_Cost']=='₹1,200')].index
#test_dataset.drop(test_index, inplace = True)
test_dataset['Average_Cost'] = test_dataset['Average_Cost'].replace('₹1,000','₹1000')
test_dataset['Average_Cost'] = test_dataset['Average_Cost'].replace('₹1,200','₹1200')

In [243]:
test_dataset['Average_Cost']=test_dataset['Average_Cost'].str.replace('₹','').astype(int)
dataset['Average_Cost'] = [x.replace('₹','')for x in dataset['Average_Cost']]
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,₹50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,₹50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,₹50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,₹99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,₹99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [244]:
dataset['Average_Cost'].value_counts()

200     3241
100     2557
150     2462
250      881
300      537
350      283
400      282
50       265
600      154
500      101
450       63
550       60
650       55
800       44
750       38
700       15
900       15
1000      12
850       12
1200       8
950        4
1150       1
for        1
2050       1
1100       1
1400       1
Name: Average_Cost, dtype: int64

In [245]:
test_dataset['Average_Cost'].value_counts()

200     820
100     664
150     589
250     223
300     173
50       72
350      71
400      64
600      30
500      15
550      13
450       9
650       8
800       6
1000      4
700       4
850       4
750       3
1200      2
Name: Average_Cost, dtype: int64

In [246]:
tr_avg_cst = dataset['Average_Cost'].drop(dataset[dataset['Average_Cost']=='for'].index).astype(int)
tr_avg_cst.mean()

202.7089155323177

In [247]:
dataset['Average_Cost']=dataset['Average_Cost'].replace('for','200')

In [248]:
dataset['Average_Cost']=dataset['Average_Cost'].astype(int)

In [249]:
dataset['Average_Cost'].value_counts()

200     3242
100     2557
150     2462
250      881
300      537
350      283
400      282
50       265
600      154
500      101
450       63
550       60
650       55
800       44
750       38
900       15
700       15
850       12
1000      12
1200       8
950        4
1150       1
1400       1
1100       1
2050       1
Name: Average_Cost, dtype: int64

In [250]:
dataset['Minimum_Order'].value_counts()

₹50     10118
₹99       779
₹0        158
₹199        8
₹200        8
₹59         3
₹350        3
₹299        3
₹90         2
₹450        2
₹79         2
₹300        2
₹240        1
₹250        1
₹500        1
₹400        1
₹150        1
₹89         1
Name: Minimum_Order, dtype: int64

In [251]:
test_dataset['Minimum_Order'].value_counts()

₹50     2556
₹99      177
₹0        30
₹199       5
₹200       2
₹89        1
₹399       1
₹500       1
₹149       1
Name: Minimum_Order, dtype: int64

In [252]:
#index = dataset[(dataset['Minimum_Order'] == '₹500') | (dataset['Minimum_Order'] == '₹250') | (dataset['Minimum_Order'] == '₹150') | (dataset['Minimum_Order'] == '₹240') | (dataset['Minimum_Order'] == '₹400') | (dataset['Minimum_Order'] == '₹300') | (dataset['Minimum_Order'] == '₹450') | (dataset['Minimum_Order'] == '₹299') | (dataset['Minimum_Order'] == '₹350') |(dataset['Minimum_Order'] == '₹200') |(dataset['Minimum_Order'] == '₹199') ].index
#dataset.drop(index, inplace = True)
dataset['Minimum_Order']=dataset['Minimum_Order'].str.replace('₹','').astype(int)
test_dataset['Minimum_Order']=test_dataset['Minimum_Order'].str.replace('₹','').astype(int)

In [253]:
dataset['Minimum_Order'].value_counts()

50     10118
99       779
0        158
200        8
199        8
299        3
59         3
350        3
300        2
79         2
90         2
450        2
500        1
250        1
150        1
89         1
400        1
240        1
Name: Minimum_Order, dtype: int64

In [254]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:
dataset['Rating'].value_counts()

-                     1191
3.7                    869
3.6                    846
3.5                    818
3.8                    800
NEW                    758
3.9                    749
3.4                    718
3.3                    675
4.0                    614
3.2                    511
4.1                    459
3.1                    411
3.0                    302
4.2                    272
4.3                    247
2.9                    199
2.8                    157
4.4                    142
4.5                     78
2.7                     76
2.6                     42
4.6                     41
4.7                     36
2.5                     27
2.4                     13
4.8                     13
Opening Soon            12
4.9                      8
2.3                      6
Temporarily Closed       2
2.2                      1
2.1                      1
Name: Rating, dtype: int64

In [256]:
test_dataset['Rating'].value_counts()

-               305
3.6             223
3.9             216
3.7             212
NEW             200
3.5             197
3.4             185
3.8             183
3.3             153
4.0             141
3.2             129
3.1             120
4.1             115
4.2              70
3.0              65
2.9              57
4.3              52
2.8              41
4.4              29
2.7              22
4.5              18
2.6               9
4.6               7
2.5               6
4.7               6
2.4               5
4.8               3
Opening Soon      2
2.3               2
2.1               1
Name: Rating, dtype: int64

In [257]:
dataset['Rating']=dataset['Rating'].replace('-','NaN')#(to_replace=['-','NEW','Opening Soon','Temporarily Closed'], value='NaN')#.value_counts()
dataset['Rating']=dataset['Rating'].replace('NEW','NaN')
dataset['Rating']=dataset['Rating'].replace('Opening Soon','NaN')
dataset['Rating']=dataset['Rating'].replace('Temporarily Closed','NaN')

In [258]:
test_dataset['Rating']=test_dataset['Rating'].replace(['-','NEW','Opening Soon'], 'NaN')

In [259]:
dataset['Rating'].value_counts()

NaN    1963
3.7     869
3.6     846
3.5     818
3.8     800
3.9     749
3.4     718
3.3     675
4.0     614
3.2     511
4.1     459
3.1     411
3.0     302
4.2     272
4.3     247
2.9     199
2.8     157
4.4     142
4.5      78
2.7      76
2.6      42
4.6      41
4.7      36
2.5      27
2.4      13
4.8      13
4.9       8
2.3       6
2.2       1
2.1       1
Name: Rating, dtype: int64

In [260]:
test_dataset['Rating'].value_counts()

NaN    507
3.6    223
3.9    216
3.7    212
3.5    197
3.4    185
3.8    183
3.3    153
4.0    141
3.2    129
3.1    120
4.1    115
4.2     70
3.0     65
2.9     57
4.3     52
2.8     41
4.4     29
2.7     22
4.5     18
2.6      9
4.6      7
2.5      6
4.7      6
2.4      5
4.8      3
2.3      2
2.1      1
Name: Rating, dtype: int64

In [261]:
dataset['Rating']=dataset['Rating'].replace('NaN','0').astype(float)
#dataset['Rating'] = dataset['Rating'].astype(float)

In [262]:
test_dataset['Rating'] = test_dataset['Rating'].replace('NaN','0').astype(float)

In [263]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11094 entries, 0 to 11093
Columns: 192 entries, Restaurant to yogurt
dtypes: float64(186), int64(2), object(4)
memory usage: 16.3+ MB


In [264]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 175 entries, Restaurant to yogurt
dtypes: float64(170), int64(2), object(3)
memory usage: 3.7+ MB


In [265]:
dataset['Rating'].value_counts()

0.0    1963
3.7     869
3.6     846
3.5     818
3.8     800
3.9     749
3.4     718
3.3     675
4.0     614
3.2     511
4.1     459
3.1     411
3.0     302
4.2     272
4.3     247
2.9     199
2.8     157
4.4     142
4.5      78
2.7      76
2.6      42
4.6      41
4.7      36
2.5      27
4.8      13
2.4      13
4.9       8
2.3       6
2.1       1
2.2       1
Name: Rating, dtype: int64

In [266]:
test_dataset['Rating'].value_counts()

0.0    507
3.6    223
3.9    216
3.7    212
3.5    197
3.4    185
3.8    183
3.3    153
4.0    141
3.2    129
3.1    120
4.1    115
4.2     70
3.0     65
2.9     57
4.3     52
2.8     41
4.4     29
2.7     22
4.5     18
2.6      9
4.6      7
4.7      6
2.5      6
2.4      5
4.8      3
2.3      2
2.1      1
Name: Rating, dtype: int64

In [267]:
rating = dataset[dataset['Rating'] != 0.0]['Rating']

In [268]:
test_rating = test_dataset[test_dataset['Rating'] != 0.0]['Rating']

In [269]:
rating.value_counts()

3.7    869
3.6    846
3.5    818
3.8    800
3.9    749
3.4    718
3.3    675
4.0    614
3.2    511
4.1    459
3.1    411
3.0    302
4.2    272
4.3    247
2.9    199
2.8    157
4.4    142
4.5     78
2.7     76
2.6     42
4.6     41
4.7     36
2.5     27
2.4     13
4.8     13
4.9      8
2.3      6
2.1      1
2.2      1
Name: Rating, dtype: int64

In [270]:
test_rating.value_counts()

3.6    223
3.9    216
3.7    212
3.5    197
3.4    185
3.8    183
3.3    153
4.0    141
3.2    129
3.1    120
4.1    115
4.2     70
3.0     65
2.9     57
4.3     52
2.8     41
4.4     29
2.7     22
4.5     18
2.6      9
4.6      7
4.7      6
2.5      6
2.4      5
4.8      3
2.3      2
2.1      1
Name: Rating, dtype: int64

In [271]:
rating_mean = round(rating.mean(),1)
rating_mean

3.6

In [272]:
test_rating_mean = round(test_rating.mean(),1)
test_rating_mean

3.6

In [273]:
dataset['Rating'] = dataset['Rating'].replace(0.0, rating_mean)

In [274]:
test_dataset['Rating'] = test_dataset['Rating'].replace(0.0, test_rating_mean)

In [275]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 175 entries, Restaurant to yogurt
dtypes: float64(170), int64(2), object(3)
memory usage: 3.7+ MB


In [276]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11094 entries, 0 to 11093
Columns: 192 entries, Restaurant to yogurt
dtypes: float64(186), int64(2), object(4)
memory usage: 16.3+ MB


In [277]:
dataset['Rating'].value_counts().sum()

11094

In [278]:
test_dataset['Rating'].value_counts()#.sum()

3.6    730
3.9    216
3.7    212
3.5    197
3.4    185
3.8    183
3.3    153
4.0    141
3.2    129
3.1    120
4.1    115
4.2     70
3.0     65
2.9     57
4.3     52
2.8     41
4.4     29
2.7     22
4.5     18
2.6      9
4.6      7
4.7      6
2.5      6
2.4      5
4.8      3
2.3      2
2.1      1
Name: Rating, dtype: int64

In [279]:
test_dataset.shape

(2774, 175)

In [280]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [281]:
dataset['Votes'].value_counts()#.sum()

-       2074
4        248
6        200
7        182
9        181
        ... 
2264       1
487        1
3          1
866        1
662        1
Name: Votes, Length: 1103, dtype: int64

In [282]:
test_dataset['Votes'].value_counts()

-       542
7        60
9        57
6        55
5        51
       ... 
3404      1
294       1
463       1
798       1
4179      1
Name: Votes, Length: 580, dtype: int64

In [283]:
tr_votes=dataset[dataset['Votes']!='-']['Votes']
ts_votes = test_dataset[test_dataset['Votes']!='-']['Votes']
tr_votes

0         12
1         11
2         99
3        176
4        521
        ... 
11088     56
11089    326
11090     36
11091     45
11092     24
Name: Votes, Length: 9020, dtype: object

In [284]:
tr_votes_m=round(tr_votes.astype(int).mean())
tr_votes_m

245

In [285]:
ts_votes_m=round(ts_votes.astype(int).mean())
ts_votes_m

227

In [286]:
dataset['Votes'] = dataset['Votes'].replace(['-',245] ,'0')
dataset['Votes'] = dataset['Votes'].astype(int)

test_dataset['Votes'] = test_dataset['Votes'].replace(['-',227] , '0')
test_dataset['Votes'] = test_dataset['Votes'].astype(int)

In [287]:
dataset['Votes'].value_counts()#.sum()

0       2074
4        248
6        200
7        182
9        181
        ... 
3570       1
1507       1
1483       1
1419       1
4094       1
Name: Votes, Length: 1103, dtype: int64

In [288]:
test_dataset['Votes'].value_counts()

0       542
7        60
9        57
6        55
5        51
       ... 
203       1
207       1
211       1
213       1
1719      1
Name: Votes, Length: 580, dtype: int64

In [289]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 175 entries, Restaurant to yogurt
dtypes: float64(170), int64(3), object(2)
memory usage: 3.7+ MB


In [290]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [291]:
test_dataset['Reviews'].value_counts()

-      593
2      131
1      102
3       79
4       72
      ... 
326      1
361      1
248      1
177      1
646      1
Name: Reviews, Length: 392, dtype: int64

In [293]:
tr_rev=dataset[dataset['Reviews']!='-']['Reviews']
ts_rev = test_dataset[test_dataset['Reviews']!='-']['Reviews']
tr_rev

  result = method(y)


0          4
1          4
2         30
3         95
4        235
        ... 
11089    189
11090     16
11091     18
11092      9
11093      0
Name: Reviews, Length: 11094, dtype: int64

In [294]:
tr_rev_m=round(tr_rev.astype(int).mean())
tr_rev_m

98

In [295]:
ts_rev_m=round(ts_rev.astype(int).mean())
ts_rev_m

88

In [296]:
dataset['Reviews'] = dataset['Reviews'].replace(['-',98] , '0')
dataset['Reviews'] = dataset['Reviews'].astype(int)

test_dataset['Reviews'] = test_dataset['Reviews'].replace(['-',88] , '0')
test_dataset['Reviews'] = test_dataset['Reviews'].astype(int)

In [297]:
dataset['Reviews'].value_counts()

0       2326
2        420
3        387
1        381
4        356
        ... 
1412       1
1252       1
1172       1
1164       1
2031       1
Name: Reviews, Length: 760, dtype: int64

In [298]:
test_dataset['Reviews'].value_counts()

0       598
2       131
1       102
3        79
4        72
       ... 
3863      1
1636      1
1520      1
1464      1
1905      1
Name: Reviews, Length: 391, dtype: int64

In [299]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11094 entries, 0 to 11093
Columns: 192 entries, Restaurant to yogurt
dtypes: float64(186), int64(4), object(2)
memory usage: 16.3+ MB


In [300]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Columns: 175 entries, Restaurant to yogurt
dtypes: float64(170), int64(4), object(1)
memory usage: 3.7+ MB


In [301]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,50,3.5,12,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,50,3.5,11,4,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,50,3.6,99,30,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,99,3.7,176,95,30 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,99,3.2,521,235,65 minutes,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [302]:
dataset['Delivery_Time'].value_counts()

30 minutes     7406
45 minutes     2665
65 minutes      923
120 minutes      62
20 minutes       20
80 minutes       14
10 minutes        4
Name: Delivery_Time, dtype: int64

In [303]:
dataset['Delivery_Time'] = dataset['Delivery_Time'].str.replace('minutes','').astype(int)#map(lambda x: x.lstrip("").rstrip('minutes'))#str.slice(0,-8,1)
dataset['Delivery_Time']

0        30
1        30
2        65
3        30
4        65
         ..
11089    30
11090    30
11091    30
11092    30
11093    30
Name: Delivery_Time, Length: 11094, dtype: int64

In [304]:
#dataset.info()

In [305]:
#dataset.head()

In [306]:
dataset.head()

Unnamed: 0,Restaurant,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Delivery_Time,administration,area,babarpur,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,ID_6321,200,50,3.5,12,4,30,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,ID_2882,100,50,3.5,11,4,30,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_1595,150,50,3.6,99,30,65,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_5929,250,99,3.7,176,95,30,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_6123,200,99,3.2,521,235,65,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
missing_columns=set(dataset.columns) - set(test_dataset.columns)
missing_columns

{'Delivery_Time',
 'bohri',
 'bubble',
 'cantonese',
 'charcoal',
 'greek',
 'gujarati',
 'indonesian',
 'israeli',
 'konkan',
 'lankan',
 'mex',
 'nepalese',
 'portuguese',
 'sri',
 'tamil',
 'tex'}

In [308]:
for i in missing_columns:
    test_dataset[i]=0

In [309]:
test_dataset.shape

(2774, 192)

In [310]:
dataset.drop(['Restaurant'],axis=1,inplace = True)
test_dataset.drop(['Restaurant','Delivery_Time'],axis=1,inplace = True)

In [311]:
test_dataset.shape

(2774, 190)

In [312]:
X = dataset.drop(['Delivery_Time'], axis=1)
y = dataset['Delivery_Time'].values
X.head()

Unnamed: 0,Average_Cost,Minimum_Order,Rating,Votes,Reviews,administration,area,babarpur,bangalore,bazaar,...,sushi,tamil,tea,tex,thai,tibetan,turkish,vietnamese,wrap,yogurt
0,200,50,3.5,12,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.577657,0.0
1,100,50,3.5,11,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,150,50,3.6,99,30,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,250,99,3.7,176,95,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,200,99,3.2,521,235,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [313]:
y

array([30, 30, 65, ..., 30, 30, 30])

In [314]:
from sklearn.ensemble import RandomForestClassifier

In [315]:
model = RandomForestClassifier(n_estimators=500, random_state=0)#RandomForestRegressor(n_estimators=100, random_state=0)

In [316]:
model.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [317]:
y_pred = model.predict(test_dataset)#['Average_Cost'].reshape(-1,1))
#(test_dataset)

In [318]:
y_pred

array([30, 30, 30, ..., 30, 30, 30])

In [319]:
base = 5
Final = [(str(int(base * round(x/base)))+" "+'minutes') for x in y_pred]

In [320]:
Final

['30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '45 minutes',
 '45 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '120 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '45 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minutes',
 '30 minu

In [321]:
'''
df_sub = pd.DataFrame(data = Final, columns = ['Delivery_Time'])
writer = pd.ExcelWriter('Delivery_Time_1.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer, sheet_name='Sheet1',index=False)
writer.save
'''

"\ndf_sub = pd.DataFrame(data = Final, columns = ['Delivery_Time'])\nwriter = pd.ExcelWriter('Delivery_Time_1.xlsx', engine='xlsxwriter')\ndf_sub.to_excel(writer, sheet_name='Sheet1',index=False)\nwriter.save\n"

In [322]:
final=pd.DataFrame(data = Final, columns = ['Delivery_Time'])

In [323]:
final.shape

(2774, 1)

In [324]:
final.to_excel ("final.xlsx",index=False)

In [325]:
final= pd.read_excel('final.xlsx')

In [326]:
#final=final.iloc[:,-1]

In [327]:
final

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes
...,...
2769,30 minutes
2770,30 minutes
2771,30 minutes
2772,30 minutes


In [328]:
final.to_excel ("Sheet1.xlsx", index=False)

In [329]:
final.shape

(2774, 1)