In [1]:
# Data Analysis
import pandas as pd
import numpy as np

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

# Data Prediction
from sklearn.preprocessing import LabelEncoder
import xgboost
from sklearn.metrics import mean_squared_error

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_train = pd.read_csv('/Users/malarvizhis/Desktop/Participants_Data_Final/Data_Train.csv')
df_test = pd.read_csv('/Users/malarvizhis/Desktop/Participants_Data_Final/Data_Test.csv')

In [3]:

print(df_test.shape, df_train.shape)

((4231, 8), (12690, 9))


In [4]:
def null_values(df):
    
    sum_null = df.isnull().sum()
    total = df.isnull().count()
    percent_nullvalues = 100* sum_null / total 
    df_null = pd.DataFrame()
    df_null['Total'] = total
    df_null['Null_Count'] = sum_null
    df_null['Percent'] = round(percent_nullvalues,2)
    df_null = df_null.sort_values(by='Null_Count',ascending = False)
    df_null = df_null[df_null.Null_Count > 0]
    
    return(df_null)

In [6]:
num_columns = len(df_train.columns)
pd.set_option("display.max_columns", num_columns)
print(df_train.head(5))


num_columns_test = len(df_test.columns)
pd.set_option("display.max_columns", num_columns)
print(df_test.head(5))


               TITLE  RESTAURANT_ID  \
0      CASUAL DINING           9438   
1  CASUAL DINING,BAR          13198   
2      CASUAL DINING          10915   
3        QUICK BITES           6346   
4     DESSERT PARLOR          15387   

                                     CUISINES  \
0                 Malwani, Goan, North Indian   
1              Asian, Modern Indian, Japanese   
2  North Indian, Chinese, Biryani, Hyderabadi   
3                            Tibetan, Chinese   
4                                    Desserts   

                                     TIME     CITY        LOCALITY RATING  \
0  11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)    Thane  Dombivali East    3.6   
1                    6pm – 11pm (Mon-Sun)  Chennai       Ramapuram    4.2   
2     11am – 3:30pm, 7pm – 11pm (Mon-Sun)  Chennai      Saligramam    3.8   
3                 11:30am – 1am (Mon-Sun)   Mumbai     Bandra West    4.1   
4                    11am – 1am (Mon-Sun)   Mumbai     Lower Parel    3.8   

       

In [7]:
feature_names = df_train.columns.tolist()
feature_names.remove('TIME')
feature_names.remove('LOCALITY')

print(feature_names)

for column in feature_names:
    print(column)
    print(df_train[column].value_counts(dropna=False))

['TITLE', 'RESTAURANT_ID', 'CUISINES', 'CITY', 'RATING', 'VOTES', 'COST']
TITLE
QUICK BITES                     4218
CASUAL DINING                   3652
None                            1003
CAFÉ                             607
DESSERT PARLOR                   382
CASUAL DINING,BAR                349
FINE DINING                      308
BAKERY                           302
BEVERAGE SHOP                    156
BAR,CASUAL DINING                128
LOUNGE                            98
BAKERY,QUICK BITES                92
BAR                               91
FOOD COURT                        78
CASUAL DINING,CAFÉ                56
MESS                              53
KIOSK                             53
BAKERY,DESSERT PARLOR             51
PUB                               50
SWEET SHOP,QUICK BITES            48
DESSERT PARLOR,QUICK BITES        48
DHABA                             45
SWEET SHOP                        44
QUICK BITES,SWEET SHOP            35
CAFÉ,CASUAL DINING              

In [8]:
## Features for TEST DATA


feature_names_test= df_test.columns.tolist()
feature_names_test.remove('TIME')
feature_names_test.remove('LOCALITY')

print(feature_names_test)

for column in feature_names_test:
    print(column)
    print(df_test[column].value_counts(dropna=False))

['TITLE', 'RESTAURANT_ID', 'CUISINES', 'CITY', 'RATING', 'VOTES']
TITLE
QUICK BITES                   1416
CASUAL DINING                 1258
None                           284
CAFÉ                           223
DESSERT PARLOR                 126
CASUAL DINING,BAR              114
FINE DINING                     98
BAKERY                          90
BEVERAGE SHOP                   60
BAR,CASUAL DINING               47
LOUNGE                          38
BAKERY,QUICK BITES              34
BAR                             33
FOOD COURT                      32
SWEET SHOP                      23
SWEET SHOP,QUICK BITES          22
KIOSK                           20
PUB                             17
DHABA                           16
MESS                            15
CAFÉ,BAKERY                     14
QUICK BITES,SWEET SHOP          14
CAFÉ,QUICK BITES                12
BEVERAGE SHOP,QUICK BITES       12
PUB,CASUAL DINING               12
CAFÉ,CASUAL DINING              12
QUICK BITES,BAKERY

In [9]:
features = df_train.drop('COST', 1)
features = features.drop('LOCALITY', 1)
features = features.drop('TIME', 1)
features = features.drop('RESTAURANT_ID', 1)
labels = df_train['COST']

print(features.head())

               TITLE                                    CUISINES     CITY  \
0      CASUAL DINING                 Malwani, Goan, North Indian    Thane   
1  CASUAL DINING,BAR              Asian, Modern Indian, Japanese  Chennai   
2      CASUAL DINING  North Indian, Chinese, Biryani, Hyderabadi  Chennai   
3        QUICK BITES                            Tibetan, Chinese   Mumbai   
4     DESSERT PARLOR                                    Desserts   Mumbai   

  RATING      VOTES  
0    3.6   49 votes  
1    4.2   30 votes  
2    3.8  221 votes  
3    4.1   24 votes  
4    3.8  165 votes  


In [10]:
## Features and No label for TEST DATA

features_test= df_test
features_test= features_test.drop('LOCALITY', 1)
features_test= features_test.drop('TIME', 1)
features_test= features_test.drop('RESTAURANT_ID', 1)
#labels_test= df_test['COST']

print(features_test.head())

           TITLE                                      CUISINES       CITY  \
0  CASUAL DINING         North Indian, Chinese, Mughlai, Kebab      Noida   
1    QUICK BITES  South Indian, Fast Food, Pizza, North Indian     Mumbai   
2  CASUAL DINING       North Indian, Seafood, Biryani, Chinese     Mumbai   
3           None                                       Biryani  Faridabad   
4    QUICK BITES                          South Indian, Kerala      Kochi   

  RATING       VOTES  
0    4.3   564 votes  
1    4.2    61 votes  
2    3.8   350 votes  
3    3.8  1445 votes  
4    3.6    23 votes  


In [11]:
feature_names_ref = features.columns.tolist()


print(feature_names_ref)

for column in feature_names_ref:
    print(column)
    print(features[column].value_counts(dropna=False))

['TITLE', 'CUISINES', 'CITY', 'RATING', 'VOTES']
TITLE
QUICK BITES                     4218
CASUAL DINING                   3652
None                            1003
CAFÉ                             607
DESSERT PARLOR                   382
CASUAL DINING,BAR                349
FINE DINING                      308
BAKERY                           302
BEVERAGE SHOP                    156
BAR,CASUAL DINING                128
LOUNGE                            98
BAKERY,QUICK BITES                92
BAR                               91
FOOD COURT                        78
CASUAL DINING,CAFÉ                56
MESS                              53
KIOSK                             53
BAKERY,DESSERT PARLOR             51
PUB                               50
SWEET SHOP,QUICK BITES            48
DESSERT PARLOR,QUICK BITES        48
DHABA                             45
SWEET SHOP                        44
QUICK BITES,SWEET SHOP            35
CAFÉ,CASUAL DINING                35
BEVERAGE SHOP,QUICK 

In [12]:
## Data analysis for TEST DATA

feature_names_ref_test = features_test.columns.tolist()


print(feature_names_ref_test)

for column in feature_names_ref_test:
    print(column)
    print(features_test[column].value_counts(dropna=False))

['TITLE', 'CUISINES', 'CITY', 'RATING', 'VOTES']
TITLE
QUICK BITES                   1416
CASUAL DINING                 1258
None                           284
CAFÉ                           223
DESSERT PARLOR                 126
CASUAL DINING,BAR              114
FINE DINING                     98
BAKERY                          90
BEVERAGE SHOP                   60
BAR,CASUAL DINING               47
LOUNGE                          38
BAKERY,QUICK BITES              34
BAR                             33
FOOD COURT                      32
SWEET SHOP                      23
SWEET SHOP,QUICK BITES          22
KIOSK                           20
PUB                             17
DHABA                           16
MESS                            15
CAFÉ,BAKERY                     14
QUICK BITES,SWEET SHOP          14
CAFÉ,QUICK BITES                12
BEVERAGE SHOP,QUICK BITES       12
PUB,CASUAL DINING               12
CAFÉ,CASUAL DINING              12
QUICK BITES,BAKERY              11


In [13]:
features['VOTES'] = features.VOTES.str.replace('votes' , '')
features.VOTES = features.VOTES.astype(float)

print(features["VOTES"].value_counts(dropna=False))

NaN       1204
44.0        71
29.0        66
28.0        66
38.0        65
35.0        64
26.0        63
24.0        63
22.0        62
25.0        60
33.0        60
37.0        58
12.0        58
15.0        58
31.0        58
6.0         57
27.0        57
23.0        56
39.0        56
48.0        56
32.0        56
57.0        56
54.0        55
41.0        55
21.0        55
19.0        55
13.0        54
20.0        54
40.0        54
8.0         54
          ... 
2609.0       1
944.0        1
2944.0       1
1943.0       1
1629.0       1
3399.0       1
1767.0       1
2389.0       1
639.0        1
1858.0       1
2723.0       1
2542.0       1
758.0        1
1508.0       1
2500.0       1
2214.0       1
1914.0       1
996.0        1
1095.0       1
2711.0       1
2943.0       1
3334.0       1
1082.0       1
1926.0       1
1899.0       1
865.0        1
4980.0       1
1131.0       1
3369.0       1
2055.0       1
Name: VOTES, Length: 1848, dtype: int64


In [14]:

## Cleaning for TEST DATA

features_test['VOTES'] = features_test.VOTES.str.replace('votes' , '')
features_test.VOTES = features_test.VOTES.astype(float)

print(features_test["VOTES"].value_counts(dropna=False))

NaN       402
6.0        30
58.0       26
22.0       26
9.0        24
42.0       24
4.0        24
23.0       24
55.0       22
34.0       21
44.0       21
13.0       20
14.0       20
66.0       19
130.0      19
57.0       19
36.0       19
25.0       19
56.0       18
19.0       18
52.0       18
24.0       18
32.0       18
47.0       18
54.0       18
46.0       18
62.0       18
26.0       18
11.0       18
50.0       18
         ... 
1439.0      1
1317.0      1
1323.0      1
1028.0      1
908.0       1
1445.0      1
2903.0      1
1557.0      1
1903.0      1
2917.0      1
647.0       1
5163.0      1
2259.0      1
1503.0      1
442.0       1
709.0       1
2996.0      1
343.0       1
685.0       1
2235.0      1
2998.0      1
652.0       1
361.0       1
1018.0      1
1708.0      1
3162.0      1
427.0       1
996.0       1
828.0       1
1749.0      1
Name: VOTES, Length: 1137, dtype: int64


In [15]:
features['RATING'] = features.RATING.str.replace('NEW' , '0.0')
features['RATING'] = features.RATING.str.replace('-' , '0.0')
#features.RATING = features.RATING.astype(float)

print(features["RATING"].value_counts(dropna=False))

print(type(features['RATING'].astype(float)))

print(type(features['VOTES']))

3.9    1238
0.0    1202
3.8    1190
4.0    1099
3.7    1086
3.6     951
4.1     936
3.5     771
4.2     723
3.4     575
4.3     556
3.3     365
4.4     362
3.2     266
4.5     199
2.9     186
3.1     186
3.0     170
2.8     146
4.6     141
2.7      89
4.7      69
2.6      61
2.5      35
4.8      32
4.9      22
2.4      15
2.3      10
2.1       3
2.2       2
2.0       2
NaN       2
Name: RATING, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [16]:
## Cleaning of TEST DATA - RATING FIELD

features_test['RATING'] = features_test.RATING.str.replace('NEW' , '0.0')
features_test['RATING'] = features_test.RATING.str.replace('-' , '0.0')
#features.RATING = features.RATING.astype(float)

print(features_test["RATING"].value_counts(dropna=False))

print(type(features_test['RATING'].astype(float)))

print(type(features_test['VOTES']))

3.8    414
3.9    405
0.0    400
4.0    399
3.7    351
3.6    310
4.1    281
4.2    245
3.5    217
3.4    202
4.3    184
4.4    122
3.3    120
3.2     83
3.1     77
4.5     75
3.0     59
2.9     56
2.8     45
4.6     43
2.7     40
2.6     24
4.7     21
4.8     17
2.5     16
2.4      9
4.9      6
2.3      5
2.2      2
NaN      2
2.1      1
Name: RATING, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [17]:
print(features.shape)
print(features.head())
print(features['VOTES'].value_counts(dropna=False))
print(features['RATING'].value_counts(dropna=False))



features_selected = features[['VOTES','RATING']]

print(type(features_selected))

(12690, 5)
               TITLE                                    CUISINES     CITY  \
0      CASUAL DINING                 Malwani, Goan, North Indian    Thane   
1  CASUAL DINING,BAR              Asian, Modern Indian, Japanese  Chennai   
2      CASUAL DINING  North Indian, Chinese, Biryani, Hyderabadi  Chennai   
3        QUICK BITES                            Tibetan, Chinese   Mumbai   
4     DESSERT PARLOR                                    Desserts   Mumbai   

  RATING  VOTES  
0    3.6   49.0  
1    4.2   30.0  
2    3.8  221.0  
3    4.1   24.0  
4    3.8  165.0  
NaN       1204
44.0        71
29.0        66
28.0        66
38.0        65
35.0        64
26.0        63
24.0        63
22.0        62
25.0        60
33.0        60
37.0        58
12.0        58
15.0        58
31.0        58
6.0         57
27.0        57
23.0        56
39.0        56
48.0        56
32.0        56
57.0        56
54.0        55
41.0        55
21.0        55
19.0        55
13.0        54
20.0        5

In [18]:
## TEST DATA

print(features_test.shape)
print(features_test.head())
print(features_test['VOTES'].value_counts(dropna=False))
print(features_test['RATING'].value_counts(dropna=False))



features_selected_test= features_test[['VOTES','RATING']]

print(type(features_selected_test))

(4231, 5)
           TITLE                                      CUISINES       CITY  \
0  CASUAL DINING         North Indian, Chinese, Mughlai, Kebab      Noida   
1    QUICK BITES  South Indian, Fast Food, Pizza, North Indian     Mumbai   
2  CASUAL DINING       North Indian, Seafood, Biryani, Chinese     Mumbai   
3           None                                       Biryani  Faridabad   
4    QUICK BITES                          South Indian, Kerala      Kochi   

  RATING   VOTES  
0    4.3   564.0  
1    4.2    61.0  
2    3.8   350.0  
3    3.8  1445.0  
4    3.6    23.0  
NaN       402
6.0        30
58.0       26
22.0       26
9.0        24
42.0       24
4.0        24
23.0       24
55.0       22
34.0       21
44.0       21
13.0       20
14.0       20
66.0       19
130.0      19
57.0       19
36.0       19
25.0       19
56.0       18
19.0       18
52.0       18
24.0       18
32.0       18
47.0       18
54.0       18
46.0       18
62.0       18
26.0       18
11.0       18
50.0   

In [19]:
from sklearn.preprocessing import Imputer

features_selected = features[['VOTES','RATING']]

print(features_selected.shape)

imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer.fit(features_selected) 
features_selected = imputer.transform(features_selected)

(12690, 2)


In [20]:
## To remove NaN for TEST DATA

from sklearn.preprocessing import Imputer

features_selected_test = features_test[['VOTES','RATING']]

print(features_selected_test.shape)

imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer.fit(features_selected_test) 
features_selected_test = imputer.transform(features_selected_test)

(4231, 2)


In [21]:
## To check whether the NaN replaced or not

features_selected.shape
import math
for i in range(0,len(features_selected)):
    
    bool_val = math.isnan(features_selected[i][1])
    if bool_val == True:
        print(bool_val)


In [22]:
## To check whether the NaN replaced or not - TEST DATA

features_selected_test.shape
import math
for i in range(0,len(features_selected_test)):
    
    bool_val = math.isnan(features_selected_test[i][1])
    if bool_val == True:
        print(bool_val)

In [23]:
df_value = pd.DataFrame(data=features_selected,columns = ['VOTES','RATING'])

In [24]:
# TEST DATA
df_value_test = pd.DataFrame(data=features_selected_test,columns = ['VOTES','RATING'])

In [25]:
print(df_value.shape)
print(df_value_test.shape)

(12690, 2)
(4231, 2)


In [26]:
print(df_value.head())
print(df_value_test.head())


   VOTES  RATING
0   49.0     3.6
1   30.0     4.2
2  221.0     3.8
3   24.0     4.1
4  165.0     3.8
    VOTES  RATING
0   564.0     4.3
1    61.0     4.2
2   350.0     3.8
3  1445.0     3.8
4    23.0     3.6


In [27]:
features['VOTES'] = df_value['VOTES']
features['RATING']=df_value["RATING"]

In [28]:
## TEST DATA

features_test['VOTES'] = df_value_test['VOTES']
features_test['RATING']=df_value_test["RATING"]

In [29]:
print(features.shape)
print(features.head())

print(features_test.shape)
print(features_test.head())

(12690, 5)
               TITLE                                    CUISINES     CITY  \
0      CASUAL DINING                 Malwani, Goan, North Indian    Thane   
1  CASUAL DINING,BAR              Asian, Modern Indian, Japanese  Chennai   
2      CASUAL DINING  North Indian, Chinese, Biryani, Hyderabadi  Chennai   
3        QUICK BITES                            Tibetan, Chinese   Mumbai   
4     DESSERT PARLOR                                    Desserts   Mumbai   

   RATING  VOTES  
0     3.6   49.0  
1     4.2   30.0  
2     3.8  221.0  
3     4.1   24.0  
4     3.8  165.0  
(4231, 5)
           TITLE                                      CUISINES       CITY  \
0  CASUAL DINING         North Indian, Chinese, Mughlai, Kebab      Noida   
1    QUICK BITES  South Indian, Fast Food, Pizza, North Indian     Mumbai   
2  CASUAL DINING       North Indian, Seafood, Biryani, Chinese     Mumbai   
3           None                                       Biryani  Faridabad   
4    QUICK BITES 

In [30]:
for column in feature_names_ref:
    print(column)
    print(features[column].value_counts(dropna=False))

TITLE
QUICK BITES                     4218
CASUAL DINING                   3652
None                            1003
CAFÉ                             607
DESSERT PARLOR                   382
CASUAL DINING,BAR                349
FINE DINING                      308
BAKERY                           302
BEVERAGE SHOP                    156
BAR,CASUAL DINING                128
LOUNGE                            98
BAKERY,QUICK BITES                92
BAR                               91
FOOD COURT                        78
CASUAL DINING,CAFÉ                56
MESS                              53
KIOSK                             53
BAKERY,DESSERT PARLOR             51
PUB                               50
SWEET SHOP,QUICK BITES            48
DESSERT PARLOR,QUICK BITES        48
DHABA                             45
SWEET SHOP                        44
QUICK BITES,SWEET SHOP            35
CAFÉ,CASUAL DINING                35
BEVERAGE SHOP,QUICK BITES         33
FINE DINING,BAR                 

In [31]:
## TEST DATA

for column in feature_names_ref:
    print(column)
    print(features[column].value_counts(dropna=False))

TITLE
QUICK BITES                     4218
CASUAL DINING                   3652
None                            1003
CAFÉ                             607
DESSERT PARLOR                   382
CASUAL DINING,BAR                349
FINE DINING                      308
BAKERY                           302
BEVERAGE SHOP                    156
BAR,CASUAL DINING                128
LOUNGE                            98
BAKERY,QUICK BITES                92
BAR                               91
FOOD COURT                        78
CASUAL DINING,CAFÉ                56
MESS                              53
KIOSK                             53
BAKERY,DESSERT PARLOR             51
PUB                               50
SWEET SHOP,QUICK BITES            48
DESSERT PARLOR,QUICK BITES        48
DHABA                             45
SWEET SHOP                        44
QUICK BITES,SWEET SHOP            35
CAFÉ,CASUAL DINING                35
BEVERAGE SHOP,QUICK BITES         33
FINE DINING,BAR                 

In [34]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(features['TITLE'])
label_title = label_encoder.transform(features['TITLE'])
print(label_title)


[31 33 31 ... 71 31 31]


In [32]:
## TEST DATA

print(features_test['TITLE'].value_counts(dropna=False))
features_test["CITY"].isnull().values.any()

features_test["CITY"].fillna("No CITY", inplace = True) 

QUICK BITES                   1416
CASUAL DINING                 1258
None                           284
CAFÉ                           223
DESSERT PARLOR                 126
CASUAL DINING,BAR              114
FINE DINING                     98
BAKERY                          90
BEVERAGE SHOP                   60
BAR,CASUAL DINING               47
LOUNGE                          38
BAKERY,QUICK BITES              34
BAR                             33
FOOD COURT                      32
SWEET SHOP                      23
SWEET SHOP,QUICK BITES          22
KIOSK                           20
PUB                             17
DHABA                           16
MESS                            15
CAFÉ,BAKERY                     14
QUICK BITES,SWEET SHOP          14
CAFÉ,QUICK BITES                12
BEVERAGE SHOP,QUICK BITES       12
PUB,CASUAL DINING               12
CAFÉ,CASUAL DINING              12
QUICK BITES,BAKERY              11
BAKERY,DESSERT PARLOR           10
QUICK BITES,DESSERT 

In [35]:
## TEST DATA
from sklearn import preprocessing
label_encoder_test = preprocessing.LabelEncoder()
label_encoder_test.fit(features_test['TITLE'])
label_title_test = label_encoder_test.transform(features_test['TITLE'])
print(label_title_test)

[23 75 23 ... 75 75 75]


In [36]:
label_encoder1 = preprocessing.LabelEncoder()
label_encoder1.fit(features['CUISINES'])
label_cuisine = label_encoder1.transform(features['CUISINES'])
print(label_cuisine)


[2627  289 2976 ... 2088 3506 2946]


In [37]:
## TEST DATA
label_encoder1_test = preprocessing.LabelEncoder()
label_encoder1_test.fit(features_test['CUISINES'])
label_cuisine_test= label_encoder1_test.transform(features_test['CUISINES'])
print(label_cuisine_test)

[1293 1632 1455 ...  679 1541 1145]


In [38]:
label_encoder2 = preprocessing.LabelEncoder()
label_encoder2.fit(features['CITY'])
label_city = label_encoder2.transform(features['CITY'])
print(label_city)

[321  67  67 ... 239  67 131]


In [39]:
## TEST DATA

label_encoder2_test = preprocessing.LabelEncoder()
label_encoder2_test.fit(features_test['CITY'])
label_city_test = label_encoder2_test.transform(features_test['CITY'])
print(label_city_test)

[111  96  96 ...  41  78   8]


In [33]:
print(features['TITLE'].value_counts(dropna=False))
features["CITY"].isnull().values.any()

features["CITY"].fillna("No CITY", inplace = True) 

QUICK BITES                     4218
CASUAL DINING                   3652
None                            1003
CAFÉ                             607
DESSERT PARLOR                   382
CASUAL DINING,BAR                349
FINE DINING                      308
BAKERY                           302
BEVERAGE SHOP                    156
BAR,CASUAL DINING                128
LOUNGE                            98
BAKERY,QUICK BITES                92
BAR                               91
FOOD COURT                        78
CASUAL DINING,CAFÉ                56
MESS                              53
KIOSK                             53
BAKERY,DESSERT PARLOR             51
PUB                               50
SWEET SHOP,QUICK BITES            48
DESSERT PARLOR,QUICK BITES        48
DHABA                             45
SWEET SHOP                        44
QUICK BITES,SWEET SHOP            35
CAFÉ,CASUAL DINING                35
BEVERAGE SHOP,QUICK BITES         33
FINE DINING,BAR                   32
C

In [40]:
features_final = features

In [41]:
## TEST DATA
features_final_test = features_test

In [42]:
features_final['TITLE'] = label_title

In [43]:
features_final['CITY'] = label_city

In [44]:
features_final['CUISINES'] = label_cuisine

In [45]:
## TEST DATA

features_final_test['TITLE'] = label_title_test
features_final_test['CITY'] = label_city_test
features_final_test['CUISINES'] = label_cuisine_test

In [46]:
print(features_final.shape)
print(features_final.head())

print(features_final_test.shape)
print(features_final_test.head())

(12690, 5)
   TITLE  CUISINES  CITY  RATING  VOTES
0     31      2627   321     3.6   49.0
1     33       289    67     4.2   30.0
2     31      2976    67     3.8  221.0
3     95      4130   222     4.1   24.0
4     50      1766   222     3.8  165.0
(4231, 5)
   TITLE  CUISINES  CITY  RATING   VOTES
0     23      1293   111     4.3   564.0
1     75      1632    96     4.2    61.0
2     23      1455    96     3.8   350.0
3     66       208    41     3.8  1445.0
4     75      1637    78     3.6    23.0


In [47]:
print(features.isnull().values.any())
print(features_test.isnull().values.any())

False
False


In [48]:
print(features_final.shape)
print(features_final.head())
print(labels.shape)
print(labels.head())

(12690, 5)
   TITLE  CUISINES  CITY  RATING  VOTES
0     31      2627   321     3.6   49.0
1     33       289    67     4.2   30.0
2     31      2976    67     3.8  221.0
3     95      4130   222     4.1   24.0
4     50      1766   222     3.8  165.0
(12690,)
0    1200
1    1500
2     800
3     800
4     300
Name: COST, dtype: int64


In [49]:
## Check with converting to int

features_final_check1 = features_final
features_final_check1["TITLE"] = features_final_check1["TITLE"].astype(int)
features_final_check1["RATING"] = features_final_check1["RATING"].astype(int)
features_final_check1["VOTES"] = features_final_check1["VOTES"].astype(int)
features_final_check1["CUISINES"] = features_final_check1["CUISINES"].astype(int)
features_final_check1["CITY"] = features_final_check1["CITY"].astype(int)

In [171]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_final, labels, test_size=0.2, random_state=123)

In [50]:
## CHECKING

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_final_check1, labels, test_size=0.2, random_state=123)

In [51]:

from sklearn.ensemble import RandomForestRegressor
rf_exp = RandomForestRegressor(n_estimators= 1000, random_state=100)
rf_exp.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

In [55]:
predictions = rf_exp.predict(X_test)
# Performance metrics
errors = abs(predictions - y_test)
print('Metrics for Random Forest Trained on Expanded Data')
print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')

Metrics for Random Forest Trained on Expanded Data
Average absolute error: 188.68 degrees.


In [56]:
# Calculate mean absolute percentage error (MAPE)
mape = np.mean(100 * (errors / y_test))

# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 64.26 %.


In [None]:
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)

In [147]:
scores

[0.718999615267601,
 0.6876612118811363,
 0.7190646415697068,
 0.7143750340469461,
 0.7116763722608168,
 0.7107206017948537,
 0.7118265506398005,
 0.7169198147213343,
 0.7128082332704118,
 0.7173533193769854,
 0.7142842765614923,
 0.7141772107271604,
 0.7114773203061922,
 0.7100486905187411,
 0.7104432584531596,
 0.7149809249503328,
 0.7198063317600876,
 0.7166395172371443,
 0.7119788486403493]

In [52]:
X_test_features = features_final_test
predictions_test = rf_exp.predict(X_test_features)

In [53]:
## Check with int
features_final_test_check1 = features_final_test
features_final_test_check1["TITLE"] = features_final_test_check1["TITLE"].astype(int)
features_final_test_check1["RATING"] = features_final_test_check1["RATING"].astype(int)
features_final_test_check1["VOTES"] = features_final_test_check1["VOTES"].astype(int)
features_final_test_check1["CUISINES"] = features_final_test_check1["CUISINES"].astype(int)
features_final_test_check1["CITY"] = features_final_test_check1["CITY"].astype(int)
X_test_features_check1 = features_final_test_check1
predictions_test = rf_exp.predict(X_test_features_check1)

In [54]:
print(predictions_test.shape)

print(X_test_features.shape)
print(X_test.shape)
print(y_test.shape)

(4231,)
(4231, 5)
(2538, 5)
(2538,)


In [55]:
print(df_test.shape)

(4231, 8)


In [56]:
print(predictions_test)

[ 814.95 2153.5   630.6  ... 2611.85 1651.05 1891.  ]


In [63]:
result_cost = pd.DataFrame(data=predictions_test,columns=["COST"])

result_cost['COST'] = result_cost['COST'].astype(int)

In [64]:
print(result_cost.shape)
print(result_cost.head())

(4231, 1)
   COST
0   814
1  2153
2   630
3   581
4  2213


In [65]:
print(features_final_test.head())

   TITLE  CUISINES  CITY  RATING  VOTES
0     23      1293   111       4    564
1     75      1632    96       4     61
2     23      1455    96       3    350
3     66       208    41       3   1445
4     75      1637    78       3     23


In [212]:
print(features_final_test.isnull().values.any())

False


In [213]:
result_cost.to_csv("./Results_check_v4.csv", sep=',',index=False)

In [214]:
#####################################

In [51]:
#for custom transformer
from sklearn.base import BaseEstimator, TransformerMixin

# for creating pipeline
from sklearn.pipeline import Pipeline, FeatureUnion

# # for cross validation
# from sklearn.cross_validation import train_test_split
# from sklearn.cross_validation import train_test_split
# # from sklearn import cross_validation
# # from sklearn.model_selection import cross_val_predict, cross_val_score

# for various metrics and reporting
from sklearn import metrics 
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# feature selection
from sklearn.feature_selection import SelectFromModel

# xgboost library
from xgboost import XGBRegressor

from sklearn import ensemble
# plot feature importance
from xgboost import plot_importance, plot_tree

In [52]:

# fit model no training data
# model = XGBRegressor(n_estimators = 1000)
model = XGBRegressor(colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 7, alpha = 10, n_estimators = 100)

model.fit(X_train, y_train)


XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [52]:

model1 = ensemble.GradientBoostingRegressor(n_estimators = 100)

model1.fit(X_train, y_train)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [53]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [54]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 0.16%


In [55]:
# Calculate mean absolute percentage error (MAPE)
errors = abs(predictions - y_test)
print('Metrics for Random Forest Trained on Expanded Data')
print('Average absolute error:', round(np.mean(errors), 2), 'degrees.')

mape = np.mean(100 * (errors / y_test))

# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

Metrics for Random Forest Trained on Expanded Data
Average absolute error: 177.0 degrees.
Accuracy: 65.55 %.


In [57]:
X_test_features_xgb = features_final_test
predictions_test = model.predict(X_test_features_xgb)

In [58]:
result_cost = pd.DataFrame(data=predictions_test,columns=["COST"])

result_cost['COST'] = result_cost['COST'].astype(int)

In [59]:
result_cost.to_csv("./Results_check_v15_xgb.csv", sep=',',index=False)