## Model Training
### 1.1 Import Data and Required Packages
Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# for the yeo-johnson transformation
import scipy.stats as stats

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# to save the trained scaler class
import joblib

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

In [3]:
# load dataset
data = pd.read_csv('finalTrain.csv')

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(45584, 20)


Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min)
0,0xcdcd,DEHRES17DEL01,36.0,4.2,30.327968,78.046106,30.397968,78.116106,12-02-2022,21:55,22:10,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46
1,0xd987,KOCRES16DEL01,21.0,4.7,10.003064,76.307589,10.043064,76.347589,13-02-2022,14:55,15:05,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23
2,0x2784,PUNERES13DEL03,23.0,4.7,18.56245,73.916619,18.65245,74.006619,04-03-2022,17:30,17:40,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21
3,0xc8b6,LUDHRES15DEL02,34.0,4.3,30.899584,75.809346,30.919584,75.829346,13-02-2022,09:20,09:30,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20
4,0xdb64,KNPRES14DEL02,24.0,4.7,26.463504,80.372929,26.593504,80.502929,14-02-2022,19:50,20:05,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41


In [4]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Time_taken (min)'], axis=1), # predictive variables
    data['Time_taken (min)'], # target
    test_size=0.1, # portion of dataset to allocate to test set
    random_state=0, # we are setting the seed here
)

X_train.shape, X_test.shape

((41025, 19), (4559, 19))

In [5]:
# let's identify the categorical variables
# we will capture those of type object

cat_vars = [var for var in data.columns if data[var].dtype == 'O']

# MSSubClass is also categorical by definition, despite its numeric values
# (you can find the definitions of the variables in the data_description.txt
# file available on Kaggle, in the same website where you downloaded the data)



# cast all variables as categorical
X_train[cat_vars] = X_train[cat_vars].astype('O')
X_test[cat_vars] = X_test[cat_vars].astype('O')

# number of categorical variables
len(cat_vars)


11

In [6]:
cat_vars

['ID',
 'Delivery_person_ID',
 'Order_Date',
 'Time_Orderd',
 'Time_Order_picked',
 'Weather conditions',
 'Road_traffic_density',
 'Type_of_order',
 'Type_of_vehicle',
 'Festival',
 'City']

In [7]:
cat_vars_with_na = [
    var for var in cat_vars
    if X_train[var].isnull().sum() > 0
]

# print percentage of missing values per variable
X_train[cat_vars_with_na ].isnull().mean().sort_values(ascending=False)

Time_Orderd             0.037831
City                    0.026179
Weather conditions      0.013309
Road_traffic_density    0.012968
Festival                0.004778
dtype: float64

In [8]:

# variables to impute with the string missing
with_string_missing = [
    var for var in cat_vars_with_na if X_train[var].isnull().mean() > 0.1]

# variables to impute with the most frequent category
with_frequent_category = [
    var for var in cat_vars_with_na if X_train[var].isnull().mean() < 0.1]

In [9]:
with_string_missing

[]

In [10]:
for var in with_frequent_category:
    
    # there can be more than 1 mode in a variable
    # we take the first one with [0]    
    mode = X_train[var].mode()[0]
    
    print(var, mode)
    
    X_train[var].fillna(mode, inplace=True)
    X_test[var].fillna(mode, inplace=True)

Time_Orderd 17:55
Weather conditions Fog
Road_traffic_density Low
Festival No
City Metropolitian


In [11]:
# check that we have no missing information in the engineered variables

X_train[cat_vars_with_na].isnull().sum()

Time_Orderd             0
Weather conditions      0
Road_traffic_density    0
Festival                0
City                    0
dtype: int64

In [12]:
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

[]

In [13]:
# now let's identify the numerical variables

num_vars = [
    var for var in X_train.columns if var not in cat_vars and var != 'Time_taken (min)'
]

# number of numerical variables
len(num_vars)

8

In [14]:
# make a list with the numerical variables that contain missing values
vars_with_na = [
    var for var in num_vars
    if X_train[var].isnull().sum() > 0
]

# print percentage of missing values per variable
X_train[vars_with_na].isnull().mean()

Delivery_person_Age        0.040731
Delivery_person_Ratings    0.041950
multiple_deliveries        0.021962
dtype: float64

In [15]:
# replace missing values as we described above

for var in vars_with_na:

    # calculate the mean using the train set
    mean_val = X_train[var].mean()
    
    print(var, mean_val)

    # add binary missing indicator (in train and test)
    X_train[var + '_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var + '_na'] = np.where(X_test[var].isnull(), 1, 0)

    # replace missing values by the mean
    # (in train and test)
    X_train[var].fillna(mean_val, inplace=True)
    X_test[var].fillna(mean_val, inplace=True)

# check that we have no more missing values in the engineered variables
X_train[vars_with_na].isnull().sum()

Delivery_person_Age 29.56789652894242
Delivery_person_Ratings 4.633950742926929
multiple_deliveries 0.746261589073871


Delivery_person_Age        0
Delivery_person_Ratings    0
multiple_deliveries        0
dtype: int64

In [16]:

# check that test set does not contain null values in the engineered variables

[var for var in vars_with_na if X_test[var].isnull().sum() > 0]

[]

In [17]:
# check absence of na in the train set
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

In [18]:
cat_others = [
    var for var in cat_vars
]

len(cat_others)

11

In [19]:
def find_frequent_labels(df, var, rare_perc):
    
    # function finds the labels that are shared by more than
    # a certain % of the houses in the dataset

    df = df.copy()

    tmp = df.groupby(var)[var].count() / len(df)

    return tmp[tmp > rare_perc].index


for var in cat_others:
    
    # find the frequent categories
    frequent_ls = find_frequent_labels(X_train, var, 0.01)
    
    print(var, frequent_ls)
    print()
    
    # replace rare categories by the string "Rare"
    X_train[var] = np.where(X_train[var].isin(
        frequent_ls), X_train[var], 'Rare')
    
    X_test[var] = np.where(X_test[var].isin(
        frequent_ls), X_test[var], 'Rare')

ID Index([], dtype='object', name='ID')

Delivery_person_ID Index([], dtype='object', name='Delivery_person_ID')

Order_Date Index(['01-03-2022', '01-04-2022', '02-03-2022', '02-04-2022', '03-03-2022',
       '03-04-2022', '04-03-2022', '04-04-2022', '05-03-2022', '05-04-2022',
       '06-03-2022', '06-04-2022', '07-03-2022', '08-03-2022', '09-03-2022',
       '10-03-2022', '11-02-2022', '11-03-2022', '12-02-2022', '12-03-2022',
       '13-02-2022', '13-03-2022', '14-02-2022', '14-03-2022', '15-02-2022',
       '15-03-2022', '16-02-2022', '16-03-2022', '17-02-2022', '17-03-2022',
       '18-02-2022', '18-03-2022', '19-03-2022', '20-03-2022', '21-03-2022',
       '23-03-2022', '24-03-2022', '25-03-2022', '26-03-2022', '27-03-2022',
       '28-03-2022', '29-03-2022', '30-03-2022', '31-03-2022'],
      dtype='object', name='Order_Date')

Time_Orderd Index(['17:55', '21:55'], dtype='object', name='Time_Orderd')

Time_Order_picked Index(['18:05', '18:40', '20:50', '21:30', '21:45', '22:50']

In [20]:
# this function will assign discrete values to the strings of the variables,
# so that the smaller value corresponds to the category that shows the smaller
# mean house sale price

def replace_categories(train, test, y_train, var, target):
    
    tmp = pd.concat([X_train, y_train], axis=1)
    
    # order the categories in a variable from that with the lowest
    # house sale price, to that with the highest
    ordered_labels = tmp.groupby([var])[target].mean().sort_values().index

    # create a dictionary of ordered categories to integer values
    ordinal_label = {k: i for i, k in enumerate(ordered_labels, 0)}
    
    print(var, ordinal_label)
    print()

    # use the dictionary to replace the categorical strings by integers
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

In [21]:
for var in cat_others:
    replace_categories(X_train, X_test, y_train, var, 'Time_taken (min)')

ID {'Rare': 0}

Delivery_person_ID {'Rare': 0}

Order_Date {'24-03-2022': 0, '26-03-2022': 1, '05-03-2022': 2, '03-04-2022': 3, '07-03-2022': 4, '03-03-2022': 5, '19-03-2022': 6, '05-04-2022': 7, '17-03-2022': 8, '11-03-2022': 9, '01-03-2022': 10, '15-02-2022': 11, '13-02-2022': 12, '30-03-2022': 13, '17-02-2022': 14, '13-03-2022': 15, '09-03-2022': 16, '28-03-2022': 17, '01-04-2022': 18, '11-02-2022': 19, '15-03-2022': 20, '21-03-2022': 21, '04-04-2022': 22, '31-03-2022': 23, '04-03-2022': 24, '02-03-2022': 25, '27-03-2022': 26, '06-04-2022': 27, '18-02-2022': 28, '12-03-2022': 29, '02-04-2022': 30, '25-03-2022': 31, '16-03-2022': 32, '08-03-2022': 33, '14-02-2022': 34, '10-03-2022': 35, '06-03-2022': 36, '29-03-2022': 37, '20-03-2022': 38, '14-03-2022': 39, '23-03-2022': 40, '12-02-2022': 41, '18-03-2022': 42, '16-02-2022': 43}

Time_Orderd {'Rare': 0, '17:55': 1, '21:55': 2}

Time_Order_picked {'22:50': 0, 'Rare': 1, '18:05': 2, '18:40': 3, '21:30': 4, '21:45': 5, '20:50': 6}

Weath

In [22]:
X_train

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Delivery_person_Age_na,Delivery_person_Ratings_na,multiple_deliveries_na
1707,0,0,30.0,4.6,13.049645,80.242268,13.069645,80.262268,6,0,1,3,0,2,0,1,1.0,0,1,0,0,0
44721,0,0,22.0,4.8,17.438263,78.397865,17.568263,78.527865,32,0,1,1,3,2,0,1,1.0,0,1,0,0,0
39844,0,0,39.0,4.8,17.450851,78.379347,17.500851,78.429347,4,0,1,3,0,2,2,3,1.0,0,1,0,0,0
17703,0,0,32.0,4.7,17.433809,78.386744,17.523809,78.476744,38,0,1,2,0,0,0,3,1.0,0,1,0,0,0
25105,0,0,21.0,4.6,21.173343,72.792731,21.203343,72.822731,3,0,1,5,1,2,0,1,1.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30403,0,0,33.0,4.6,18.533811,73.899315,18.613811,73.979315,37,0,1,0,1,2,3,3,1.0,0,1,0,0,0
21243,0,0,38.0,4.6,22.745536,75.893106,22.875536,76.023106,39,0,1,4,0,0,2,3,2.0,1,1,0,0,0
42613,0,0,37.0,4.3,12.906229,77.596791,12.916229,77.606791,5,0,1,1,2,1,3,1,1.0,0,1,0,0,0
43567,0,0,30.0,4.7,18.530963,73.828972,18.540963,73.838972,7,0,1,2,2,2,2,3,1.0,0,0,0,0,0


In [23]:
X_train.isnull().sum()

ID                             0
Delivery_person_ID             0
Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Order_Date                     0
Time_Orderd                    0
Time_Order_picked              0
Weather conditions             0
Road_traffic_density           0
Vehicle_condition              0
Type_of_order                  0
Type_of_vehicle                0
multiple_deliveries            0
Festival                       0
City                           0
Delivery_person_Age_na         0
Delivery_person_Ratings_na     0
multiple_deliveries_na         0
dtype: int64

In [24]:
from sklearn.feature_selection import mutual_info_regression
# determine the mutual information
mutual_info = mutual_info_regression(X_train, y_train)
mutual_info

array([0.        , 0.        , 0.0773822 , 0.16464257, 0.00623618,
       0.00910503, 0.06465092, 0.05569989, 0.08581925, 0.        ,
       0.        , 0.04689043, 0.11736263, 0.06527494, 0.        ,
       0.02148749, 0.11397189, 0.04735154, 0.02215886, 0.00379897,
       0.        , 0.        ])

In [25]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

Delivery_person_Ratings        0.164643
Road_traffic_density           0.117363
multiple_deliveries            0.113972
Order_Date                     0.085819
Delivery_person_Age            0.077382
Vehicle_condition              0.065275
Delivery_location_latitude     0.064651
Delivery_location_longitude    0.055700
Festival                       0.047352
Weather conditions             0.046890
City                           0.022159
Type_of_vehicle                0.021487
Restaurant_longitude           0.009105
Restaurant_latitude            0.006236
Delivery_person_Age_na         0.003799
Delivery_person_Ratings_na     0.000000
ID                             0.000000
Type_of_order                  0.000000
Delivery_person_ID             0.000000
Time_Order_picked              0.000000
Time_Orderd                    0.000000
multiple_deliveries_na         0.000000
dtype: float64

In [29]:
X_train.to_csv('xtrain.csv', index=False)
X_test.to_csv('xtest.csv', index=False)

y_train.to_csv('ytrain.csv', index=False)
y_test.to_csv('ytest.csv', index=False)

In [None]:
pd.Series(selected_feats).to_csv('selected_features.csv', index=False)
