In [1]:
import pandas as pd
import numpy as np
import itertools
import scipy
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel('Data_Test.xlsx')

In [3]:
#combining the train and test data
combined_data = train.append(test, ignore_index=True,sort=False)
combined_data.drop('Delivery_Time', inplace=True,axis=1)

In [4]:
combined_data.head()

Unnamed: 0,Restaurant,Location,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews
0,ID_6321,"FTI College, Law College Road, Pune","Fast Food, Rolls, Burger, Salad, Wraps",₹200,₹50,3.5,12,4
1,ID_2882,"Sector 3, Marathalli","Ice Cream, Desserts",₹100,₹50,3.5,11,4
2,ID_1595,Mumbai Central,"Italian, Street Food, Fast Food",₹150,₹50,3.6,99,30
3,ID_5929,"Sector 1, Noida","Mughlai, North Indian, Chinese",₹250,₹99,3.7,176,95
4,ID_6123,"Rmz Centennial, I Gate, Whitefield","Cafe, Beverages",₹200,₹99,3.2,521,235


In [5]:
combined_data.isnull().sum()

Restaurant       0
Location         0
Cuisines         0
Average_Cost     0
Minimum_Order    0
Rating           0
Votes            0
Reviews          0
dtype: int64

## FEATURE ENGINEERING

In [6]:
#showing thr unique values of the average cost
combined_data['Average_Cost'].unique()

array(['₹200', '₹100', '₹150', '₹250', '₹650', '₹350', '₹800', '₹50',
       '₹400', '₹600', '₹300', '₹750', '₹450', '₹550', '₹1,000', '₹500',
       '₹900', '₹1,200', '₹950', '₹850', '₹700', '₹1,150', 'for',
       '₹1,100', '₹1,400', '₹2,050'], dtype=object)

In [7]:
#removing the currency sign "₹" from all the average cost values
combined_data['Average_Cost'] = combined_data['Average_Cost'].str.replace("[^0-9]","")
#removing all white spaces left using the strip function
combined_data['Average_Cost'] = combined_data['Average_Cost'].str.strip()
#converting the data type to numeric
combined_data['Average_Cost']=pd.to_numeric(combined_data['Average_Cost'])
#filling all NAN values with -999
combined_data['Average_Cost'] = combined_data['Average_Cost'].fillna(-999).astype(int)

In [8]:
combined_data['Average_Cost'].unique()

array([ 200,  100,  150,  250,  650,  350,  800,   50,  400,  600,  300,
        750,  450,  550, 1000,  500,  900, 1200,  950,  850,  700, 1150,
       -999, 1100, 1400, 2050])

In [9]:
#repeating the same process to the minimum_order as the average cost
combined_data['Minimum_Order'] = combined_data['Minimum_Order'].str.replace("[^0-9]","")
combined_data['Minimum_Order'] = combined_data['Minimum_Order'].str.strip()
combined_data['Minimum_Order']=pd.to_numeric(combined_data['Minimum_Order'])
combined_data['Minimum_Order'] = combined_data['Minimum_Order'].fillna(-999).astype(int)

In [10]:
combined_data['Rating'].unique()

array(['3.5', '3.6', '3.7', '3.2', '3.8', '4.0', '3.9', '4.2', '-', '2.8',
       '3.0', '3.3', '3.1', '4.7', '3.4', '4.1', 'NEW', '2.9', '4.6',
       '4.3', '2.6', '4.5', '4.4', '4.8', '2.4', '2.7', '2.5',
       'Opening Soon', '2.2', '4.9', '2.3', '2.1', 'Temporarily Closed'],
      dtype=object)

In [11]:
combined_data.isnull().sum()

Restaurant       0
Location         0
Cuisines         0
Average_Cost     0
Minimum_Order    0
Rating           0
Votes            0
Reviews          0
dtype: int64

In [12]:
#replacing all non-numeric values with -999
combined_data.Rating = combined_data.Rating.replace("NEW",-999)
combined_data.Rating = combined_data.Rating.replace("-",-999)
combined_data.Rating = combined_data.Rating.replace("Opening Soon",-999)
combined_data.Rating = combined_data.Rating.replace("Temporarily Closed",-999)
combined_data.Rating = combined_data.Rating.astype('float')

In [13]:
combined_data.isnull().sum()

Restaurant       0
Location         0
Cuisines         0
Average_Cost     0
Minimum_Order    0
Rating           0
Votes            0
Reviews          0
dtype: int64

In [14]:
combined_data['Rating'].unique()

array([   3.5,    3.6,    3.7,    3.2,    3.8,    4. ,    3.9,    4.2,
       -999. ,    2.8,    3. ,    3.3,    3.1,    4.7,    3.4,    4.1,
          2.9,    4.6,    4.3,    2.6,    4.5,    4.4,    4.8,    2.4,
          2.7,    2.5,    2.2,    4.9,    2.3,    2.1])

In [15]:
combined_data['Votes'].unique()

array(['12', '11', '99', ..., '518', '666', '605'], dtype=object)

In [16]:
#finding out the non-numerica characters in votes
arr1 = combined_data['Votes'].unique()
arr2 =[]
for i in range(len(arr1)):
    if(arr1[i].isnumeric() ==False):
        arr2.append(arr1[i])
print(arr2)

['-']


In [17]:
#replacing all non-numeric values in "Votes" with -999
combined_data.Votes = combined_data.Votes.replace("-",-999)
combined_data.Votes = combined_data.Votes.astype('float')

In [18]:
#replacing all non-numeric values in "Reviews" with -999
combined_data.Reviews = combined_data.Reviews.replace("-",-999)
combined_data.Reviews = combined_data.Reviews.astype('float')

In [19]:
#Creating a new feature "city" by splitting each location string into an array and getting the last element
#in the array which is the name of the city  
combined_data['city'] = combined_data['Location'].apply(lambda x : np.char.strip(x.split(','))[-1])

In [20]:
#getting all the unique city names
combined_data['city'].unique()

array(['Pune', 'Marathalli', 'Mumbai Central', 'Noida', 'Whitefield',
       'Delhi University-GTB Nagar', 'Maharashtra', 'Timarpur', 'Kolkata',
       'Mumbai CST Area', 'Pune University', 'Bangalore', 'Gurgoan',
       'Begumpet', 'Majestic', 'India Gate', 'Hyderabad', 'Gurgaon',
       'Delhi Cantt.', 'Mumbai', 'Delhi', 'Electronic City'], dtype=object)

In [21]:
#np.char.strip(combined_data['Location'][4].split(',')[-1])

In [22]:
#research about the cities were done to make sure all locations were in the appropriate city name.
actual_city = {'Noida' : 'Noida', 
               'Gurgaon' : 'Gurgoan',
               'Gurgoan' : 'Gurgoan',
               'Mumbai CST Area' : 'Mumbai',
               'Mumbai Central' : 'Mumbai',
               'Mumbai' : 'Mumbai',
               'Pune' : 'Pune',
               'Maharashtra' : 'Pune',
               'Pune University' : 'Pune',
               'Timarpur' : 'Delhi',
               'Delhi' : 'Delhi',
               'Delhi Cantt.' : 'Delhi',
               'Delhi University-GTB Nagar' : 'Delhi',
               'India Gate' : 'Delhi',
               'Whitefield' : 'Bangalore', 
               'Marathalli' : 'Bangalore',
               'Majestic' : 'Bangalore',
               'Bangalore' : 'Bangalore',
               'Electronic City' : 'Bangalore',
               'Hyderabad' : 'Hyderabad',
               'Begumpet' : 'Hyderabad',
               'Kolkata' : 'Kolkata'
               }
#using the applymap() function all the cities were correctly replaced.
combined_data['city'] = combined_data[['city']].applymap(actual_city.get)

In [23]:
#dropping the location column as it is not needed anymore.
combined_data = combined_data.drop("Location", axis=1)
combined_data.head(3)

Unnamed: 0,Restaurant,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,city
0,ID_6321,"Fast Food, Rolls, Burger, Salad, Wraps",200,50,3.5,12.0,4.0,Pune
1,ID_2882,"Ice Cream, Desserts",100,50,3.5,11.0,4.0,Bangalore
2,ID_1595,"Italian, Street Food, Fast Food",150,50,3.6,99.0,30.0,Mumbai


In [24]:
#After the mapping the city names were trimmed down to 7 cities.
combined_data['city'].unique()

array(['Pune', 'Bangalore', 'Mumbai', 'Noida', 'Delhi', 'Kolkata',
       'Gurgoan', 'Hyderabad'], dtype=object)

In [25]:
cuisines = list(combined_data.Cuisines.apply(lambda x : x.split(",")))
cuisines

[['Fast Food', ' Rolls', ' Burger', ' Salad', ' Wraps'],
 ['Ice Cream', ' Desserts'],
 ['Italian', ' Street Food', ' Fast Food'],
 ['Mughlai', ' North Indian', ' Chinese'],
 ['Cafe', ' Beverages'],
 ['South Indian', ' North Indian', ' Chinese'],
 ['Beverages', ' Fast Food'],
 ['Chinese', ' Thai', ' Asian'],
 ['Mithai', ' Street Food'],
 ['Fast Food', ' North Indian', ' Rolls', ' Chinese', ' Momos', ' Mughlai'],
 ['North Indian', ' Chinese', ' Mughlai'],
 ['Chinese', ' Thai', ' Indonesian', ' Italian'],
 ['Burger', ' Fast Food', ' Desserts', ' Beverages'],
 ['Beverages'],
 ['Fast Food'],
 ['Biryani', ' Mughlai'],
 ['Italian'],
 ['Chinese', ' North Indian', ' South Indian', ' Fast Food'],
 ['Kerala', ' Bihari'],
 ['South Indian'],
 ['Chinese', ' North Indian', ' Rolls', ' Momos'],
 ['North Indian', ' Rolls'],
 ['North Indian', ' Chinese'],
 ['Biryani', ' North Indian'],
 ['North Indian', ' Chinese', ' Fast Food'],
 ['Desserts', ' Ice Cream'],
 ['North Indian', ' Chinese'],
 ['Fast Food',

In [26]:
merged = list(itertools.chain.from_iterable(cuisines))
merged

['Fast Food',
 ' Rolls',
 ' Burger',
 ' Salad',
 ' Wraps',
 'Ice Cream',
 ' Desserts',
 'Italian',
 ' Street Food',
 ' Fast Food',
 'Mughlai',
 ' North Indian',
 ' Chinese',
 'Cafe',
 ' Beverages',
 'South Indian',
 ' North Indian',
 ' Chinese',
 'Beverages',
 ' Fast Food',
 'Chinese',
 ' Thai',
 ' Asian',
 'Mithai',
 ' Street Food',
 'Fast Food',
 ' North Indian',
 ' Rolls',
 ' Chinese',
 ' Momos',
 ' Mughlai',
 'North Indian',
 ' Chinese',
 ' Mughlai',
 'Chinese',
 ' Thai',
 ' Indonesian',
 ' Italian',
 'Burger',
 ' Fast Food',
 ' Desserts',
 ' Beverages',
 'Beverages',
 'Fast Food',
 'Biryani',
 ' Mughlai',
 'Italian',
 'Chinese',
 ' North Indian',
 ' South Indian',
 ' Fast Food',
 'Kerala',
 ' Bihari',
 'South Indian',
 'Chinese',
 ' North Indian',
 ' Rolls',
 ' Momos',
 'North Indian',
 ' Rolls',
 'North Indian',
 ' Chinese',
 'Biryani',
 ' North Indian',
 'North Indian',
 ' Chinese',
 ' Fast Food',
 'Desserts',
 ' Ice Cream',
 'North Indian',
 ' Chinese',
 'Fast Food',
 ' South I

In [27]:
merged = np.sort(np.unique(np.char.lstrip(merged)))
merged

array(['Afghan', 'African', 'American', 'Andhra', 'Arabian', 'Asian',
       'Assamese', 'Awadhi', 'BBQ', 'Bakery', 'Bangladeshi', 'Bar Food',
       'Belgian', 'Bengali', 'Beverages', 'Bihari', 'Biryani', 'Bohri',
       'Brazilian', 'Bubble Tea', 'Burger', 'Burmese', 'Cafe',
       'Cantonese', 'Charcoal Chicken', 'Chettinad', 'Chinese', 'Coffee',
       'Continental', 'Desserts', 'European', 'Fast Food', 'Finger Food',
       'French', 'Frozen Yogurt', 'German', 'Goan', 'Greek', 'Gujarati',
       'Healthy Food', 'Hot dogs', 'Hyderabadi', 'Ice Cream', 'Indian',
       'Indonesian', 'Iranian', 'Israeli', 'Italian', 'Japanese',
       'Juices', 'Kashmiri', 'Kebab', 'Kerala', 'Konkan', 'Korean',
       'Lebanese', 'Lucknowi', 'Maharashtrian', 'Malaysian', 'Malwani',
       'Mangalorean', 'Mediterranean', 'Mexican', 'Middle Eastern',
       'Mishti', 'Mithai', 'Modern Indian', 'Momos', 'Mughlai', 'Naga',
       'Nepalese', 'North Eastern', 'North Indian', 'Odia', 'Paan',
       'Parsi',

In [28]:
#total number of cuisines
len(merged)

101

In [29]:
#creating a dataframe for the cuisines with all values set to zero
cuisines_DF = pd.DataFrame(0, index=np.arange(len(combined_data)), columns = merged)
print(cuisines_DF.shape)
cuisines_DF.head(3)

(13868, 101)


Unnamed: 0,Afghan,African,American,Andhra,Arabian,Asian,Assamese,Awadhi,BBQ,Bakery,...,Street Food,Sushi,Tamil,Tea,Tex-Mex,Thai,Tibetan,Turkish,Vietnamese,Wraps
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
combined_data.Cuisines[0:(1)].apply(lambda x : np.char.strip(x.split(","))).tolist()[0]

array(['Fast Food', 'Rolls', 'Burger', 'Salad', 'Wraps'], dtype='<U9')

In [31]:
#filling the details of the cuisines dataframe by marking them with 1s 
for i in range(len(combined_data)):    
    cuisine_list = combined_data.Cuisines[i:(i+1)].apply(lambda x : np.char.strip(x.split(","))).tolist()
    cuisine_list = cuisine_list[0]
    
    for cuisine in cuisine_list:
        cuisines_DF.loc[i,cuisine] = 1

In [32]:
cuisines_DF.tail()

Unnamed: 0,Afghan,African,American,Andhra,Arabian,Asian,Assamese,Awadhi,BBQ,Bakery,...,Street Food,Sushi,Tamil,Tea,Tex-Mex,Thai,Tibetan,Turkish,Vietnamese,Wraps
13863,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13865,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13866,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13867,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [33]:
cuisines_DF.rename(columns = {'Poké' : 'Poke'}, inplace = True)

In [34]:
print(combined_data.shape)
combined_data = pd.merge(combined_data, cuisines_DF, left_index=True, right_index=True)
print(combined_data.shape)

(13868, 8)
(13868, 109)


In [35]:
combined_data["Restaurant"].value_counts()

ID_5538    26
ID_7184    25
ID_2483    24
ID_4654    23
ID_7295    23
           ..
ID_4806     1
ID_8593     1
ID_528      1
ID_6280     1
ID_1425     1
Name: Restaurant, Length: 8661, dtype: int64

In [36]:
combined_data.groupby(['Restaurant']).city.transform('count')

0         1
1         6
2         3
3         2
4         1
         ..
13863     2
13864    11
13865     2
13866     1
13867     3
Name: city, Length: 13868, dtype: int64

CREATING MORE FEATURES

In [37]:
#minimum order zero which gives a value of 1 to samples where the minimum order is 0 and 0 when the minimum order is more than 0
combined_data['Minimum_Order_Zero'] = np.where(combined_data['Minimum_Order'] == 0, 1, 0)

combined_data['Reviews_by_Votes'] = combined_data['Reviews'] / combined_data['Votes']
combined_data['Minimum_Order_to_Cost'] = combined_data['Minimum_Order'] / combined_data['Average_Cost']
#mapping the number of restaurants for each city
combined_data["num_of_restaurants_city"] = combined_data["city"].map(combined_data.groupby("city").Restaurant.nunique())
#number of restaurants for each city 
combined_data["Restaurant_branch_count"] = combined_data["Restaurant"].map(combined_data["Restaurant"].value_counts())
#combined_data['votes_review_rating'] = combined_data.Votes*combined_data.Reviews*combined_data.Rating

In [38]:
combined_data[combined_data['Minimum_Order']==0].head(3)

Unnamed: 0,Restaurant,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,city,Afghan,African,...,Thai,Tibetan,Turkish,Vietnamese,Wraps,Minimum_Order_Zero,Reviews_by_Votes,Minimum_Order_to_Cost,num_of_restaurants_city,Restaurant_branch_count
13,ID_354,Beverages,50,0,3.8,184.0,128.0,Delhi,0,0,...,0,0,0,0,0,1,0.695652,0.0,1845,1
276,ID_7497,Maharashtrian,100,0,3.3,12.0,1.0,Pune,0,0,...,0,0,0,0,0,1,0.083333,0.0,1463,3
277,ID_7057,"North Indian, Chinese",300,0,3.6,102.0,48.0,Mumbai,0,0,...,0,0,0,0,0,1,0.470588,0.0,747,3


In [39]:
#Adding average restaurant data
#Average_restaurant_rating = combined_data.loc[combined_data.Rating != -999 ,:].groupby('Restaurant')['Rating'].agg([('avg_restaurant_Rating', 'mean')
#                                                                                                                   ]).reset_index()
#Average_restaurant_Review = combined_data.loc[combined_data.Rating != -999 ,:].groupby('Restaurant')['Reviews'].agg([('avg_restaurant_Reviews', 'mean')
#                                                                                                         ]).reset_index()
#Average_restaurant_Votes = combined_data.loc[combined_data.Rating != -999 ,:].groupby('Restaurant')['Votes'].agg([('avg_restaurant_Votes', 'mean')
#                                                                                                      ]).reset_index()

In [40]:
#combined_data = pd.merge(combined_data, Average_restaurant_rating, how = 'left', left_on = 'Restaurant', right_on = 'Restaurant')
#combined_data = pd.merge(combined_data, Average_restaurant_Review, how = 'left', left_on = 'Restaurant', right_on = 'Restaurant')
#combined_data = pd.merge(combined_data, Average_restaurant_Votes, how = 'left', left_on = 'Restaurant', right_on = 'Restaurant')

#columns = ['avg_restaurant_Rating', 'avg_restaurant_Reviews', 'avg_restaurant_Votes']

In [41]:
combined_data.isnull().sum()

Restaurant                 0
Cuisines                   0
Average_Cost               0
Minimum_Order              0
Rating                     0
                          ..
Minimum_Order_Zero         0
Reviews_by_Votes           0
Minimum_Order_to_Cost      0
num_of_restaurants_city    0
Restaurant_branch_count    0
Length: 114, dtype: int64

In [42]:
combined_data.head(3)

Unnamed: 0,Restaurant,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,city,Afghan,African,...,Thai,Tibetan,Turkish,Vietnamese,Wraps,Minimum_Order_Zero,Reviews_by_Votes,Minimum_Order_to_Cost,num_of_restaurants_city,Restaurant_branch_count
0,ID_6321,"Fast Food, Rolls, Burger, Salad, Wraps",200,50,3.5,12.0,4.0,Pune,0,0,...,0,0,0,0,1,0,0.333333,0.25,1463,1
1,ID_2882,"Ice Cream, Desserts",100,50,3.5,11.0,4.0,Bangalore,0,0,...,0,0,0,0,0,0,0.363636,0.5,2049,6
2,ID_1595,"Italian, Street Food, Fast Food",150,50,3.6,99.0,30.0,Mumbai,0,0,...,0,0,0,0,0,0,0.30303,0.333333,747,3


In [43]:
target = "Delivery_Time"
train_target = train[target]

In [44]:
# Converting the classes to integer values. 
# Since this is a multi class classfication problem. The class mapping will be useful when ensembling various models.
train_target = train_target.apply(lambda x: x.split()[0]).astype(int)
train_target

0        30
1        30
2        65
3        30
4        65
         ..
11089    30
11090    30
11091    30
11092    30
11093    30
Name: Delivery_Time, Length: 11094, dtype: int32

In [45]:
class_map = {}
class_map_rev = {}
for a,b in enumerate(sorted(train_target.unique())):
    class_map[b] = a
    class_map_rev[a] = b
print("class mapping {}".format(class_map))
print("class mapping reverse {}".format(class_map_rev))

class mapping {10: 0, 20: 1, 30: 2, 45: 3, 65: 4, 80: 5, 120: 6}
class mapping reverse {0: 10, 1: 20, 2: 30, 3: 45, 4: 65, 5: 80, 6: 120}


In [46]:
train_target = train_target.map(class_map)
train_target


0        2
1        2
2        4
3        2
4        4
        ..
11089    2
11090    2
11091    2
11092    2
11093    2
Name: Delivery_Time, Length: 11094, dtype: int64

In [47]:
#combined_data.drop('Restaurant',axis=1,inplace=True)
#numerical columns
num_cols = ['Votes', 'Reviews', 'Rating', 'Average_Cost', 'Minimum_Order', 
            'Restaurant_branch_count', 'num_of_restaurants_city', 'Reviews_by_Votes', 'Minimum_Order_to_Cost']
#
cat_cols = [col for col in combined_data.columns if col not in num_cols]
features = pd.get_dummies(combined_data.drop(num_cols, axis=1), columns=cat_cols, sparse=True)

In [48]:
print(len(cat_cols))
print(len(num_cols))

105
9


In [49]:
features

Unnamed: 0,Restaurant_ID_0,Restaurant_ID_1,Restaurant_ID_10,Restaurant_ID_100,Restaurant_ID_1000,Restaurant_ID_1001,Restaurant_ID_1002,Restaurant_ID_1003,Restaurant_ID_1004,Restaurant_ID_1005,...,Tibetan_0,Tibetan_1,Turkish_0,Turkish_1,Vietnamese_0,Vietnamese_1,Wraps_0,Wraps_1,Minimum_Order_Zero_0,Minimum_Order_Zero_1
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13863,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
13864,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
13865,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
13866,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0


In [50]:
features = features.sparse.to_coo()

In [51]:
import scipy
num_features=scipy.sparse.coo_matrix(combined_data[num_cols].values)

In [52]:
features=scipy.sparse.hstack([features, num_features]).tocsr()

In [53]:
train_ohe = features[:train.shape[0], :]
test_ohe = features[train.shape[0]:, :]

print(train_ohe.shape)
print(test_ohe.shape)

(11094, 11274)
(2774, 11274)


In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_ohe, train_target, test_size=0.20, random_state=314, stratify=train_target)

In [55]:
import lightgbm as lgb

lgb_fit_params={"early_stopping_rounds":50, 
            "eval_metric" : 'multi_logloss', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'verbose':100
           }

lgb_params = {'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'verbose': 0,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'num_class': 7,
 'feature_fraction': 0.8,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'learning_rate': 0.1,
 'max_bin': 255,
 'max_depth': -1,
 'min_data_in_bin': 1,
 'min_data_in_leaf': 1,
 'num_leaves': 31}
lgb_params

{'boosting_type': 'gbdt',
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'verbose': 0,
 'bagging_fraction': 0.8,
 'bagging_freq': 1,
 'num_class': 7,
 'feature_fraction': 0.8,
 'lambda_l1': 0.01,
 'lambda_l2': 0.01,
 'learning_rate': 0.1,
 'max_bin': 255,
 'max_depth': -1,
 'min_data_in_bin': 1,
 'min_data_in_leaf': 1,
 'num_leaves': 31}

In [56]:
#light gbm classifier on the split data
clf_lgb = lgb.LGBMClassifier(n_estimators=10000, **lgb_params, random_state=123456789, n_jobs=-1)
clf_lgb.fit(X_train, y_train, **lgb_fit_params)
clf_lgb.best_iteration_

Training until validation scores don't improve for 50 rounds
[100]	valid's multi_logloss: 0.587573
[200]	valid's multi_logloss: 0.568043
[300]	valid's multi_logloss: 0.562964
Early stopping, best iteration is:
[305]	valid's multi_logloss: 0.562318


305

In [57]:
#light gbm classifier on the full data
clf_lgb_fulldata = lgb.LGBMClassifier(n_estimators=int(clf_lgb.best_iteration_*1.2), **lgb_params)
clf_lgb_fulldata.fit(train_ohe, train_target)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=1, feature_fraction=0.8,
               lambda_l1=0.01, lambda_l2=0.01, max_bin=255,
               metric='multi_logloss', min_data_in_bin=1, min_data_in_leaf=1,
               n_estimators=366, num_class=7, objective='multiclass',
               verbose=0)

In [58]:
#prediction of the lgbm model fit on the split data
pred = clf_lgb.predict_proba(X_test)
pred = np.argmax(pred,axis=1)
print(accuracy_score(pred, y_test))

0.8210905813429473


In [59]:
%%time
#random forest model with 2000 estimators fit on the full data
from sklearn.ensemble import RandomForestClassifier
clf_rf_fulldata=RandomForestClassifier(n_estimators=2000, max_features=0.1)
clf_rf_fulldata.fit(train_ohe, train_target)

Wall time: 27min 42s


RandomForestClassifier(max_features=0.1, n_estimators=2000)

In [60]:
%%time
#random forest model with 1000 estimators fit on the full data
clf_rf2_fulldata=RandomForestClassifier(n_estimators=1000, max_features=0.1)
clf_rf2_fulldata.fit(train_ohe, train_target)

Wall time: 16min 10s


RandomForestClassifier(max_features=0.1, n_estimators=1000)

In [61]:
%%time
#random forest model with 1000 estimators fit on the split data
clf_rf2=RandomForestClassifier(n_estimators=1000, max_features=0.1)
clf_rf2.fit(X_train, y_train)

Wall time: 11min 34s


RandomForestClassifier(max_features=0.1, n_estimators=1000)

In [62]:
#prediction of the randomforest model(1000 estimators) fit on the split data
pred_rf2_split = clf_rf2.predict_proba(X_test)
pred_rf2_split = np.argmax(pred_rf2_split,axis=1)
print(accuracy_score(pred_rf2_split, y_test))

0.8192879675529517


In [63]:
%%time
#random forest model with 2000 estimators fit on the split data
from sklearn.ensemble import RandomForestClassifier
clf_rf=RandomForestClassifier(n_estimators=2000, max_features=0.1)
clf_rf.fit(X_train, y_train)

Wall time: 22min 24s


RandomForestClassifier(max_features=0.1, n_estimators=2000)

In [64]:
#prediction of the randomforest model(2000 estimators) fit on the split data
pred_rf_split = clf_rf.predict_proba(X_test)
pred_rf_split = np.argmax(pred_rf_split,axis=1)
print(accuracy_score(pred_rf_split, y_test))

0.8188373141054529


In [65]:
#ensemble of the random forrest and lgbm model on the split data
pred_ensemble1 = np.mean((clf_lgb.predict_proba(X_test), 
                       clf_rf2.predict_proba(X_test)), axis=0)
pred_ensemble1 = np.argmax(pred_ensemble1, axis=1)
print(accuracy_score(pred_ensemble1, y_test))

0.8206399278954484


In [66]:
#ensemble using hmean of the random forrest and lgbm model on the split data
from scipy.stats import hmean
pred_ensemble2 = hmean((np.clip(clf_lgb.predict_proba(X_test), 0.001, 1),
                     np.clip(clf_rf2.predict_proba(X_test), 0.001, 1)), axis=0)
pred_ensemble2 = np.argmax(pred_ensemble2, axis=1)
print(accuracy_score(pred_ensemble2, y_test))

0.8210905813429473


In [67]:
#ensemble using hmean of the random forrest and lgbm model on the test data
from scipy.stats import hmean
prediction = hmean((np.clip(clf_lgb_fulldata.predict_proba(test_ohe), 0.001, 1),
                     np.clip(clf_rf2_fulldata.predict_proba(test_ohe), 0.001, 1)), axis=0)
prediction = np.argmax(prediction, axis=1)

In [68]:
submission = pd.DataFrame({target: pd.Series(prediction).map(class_map_rev).apply(lambda x: str(x)+" minutes")})
submission

Unnamed: 0,Delivery_Time
0,30 minutes
1,30 minutes
2,30 minutes
3,30 minutes
4,30 minutes
...,...
2769,30 minutes
2770,30 minutes
2771,45 minutes
2772,45 minutes


In [69]:
submission.to_excel('Result.xlsx', index=False)