In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# show all the columns when a dataframe is displayed (None = no limit);
# call set_option through the public `pd` namespace instead of relying on
# the incidental `pd.pandas` attribute
pd.set_option('display.max_columns', None)

In [2]:
# Read the engineered feature matrices for the train and test splits.
#
# Both CSV files are produced by the previous notebook of this series;
# run that notebook first if 'xtrain.csv' / 'xtest.csv' do not exist yet.

X_train, X_test = (
    pd.read_csv(csv_path) for csv_path in ('xtrain.csv', 'xtest.csv')
)

# preview the first rows of the training features
X_train.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weather conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Delivery_person_Age_na,Delivery_person_Ratings_na,multiple_deliveries_na
0,0,0,30.0,4.6,13.049645,80.242268,13.069645,80.262268,6,0,1,3,0,2,0,1,1.0,0,1,0,0,0
1,0,0,22.0,4.8,17.438263,78.397865,17.568263,78.527865,32,0,1,1,3,2,0,1,1.0,0,1,0,0,0
2,0,0,39.0,4.8,17.450851,78.379347,17.500851,78.429347,4,0,1,3,0,2,2,3,1.0,0,1,0,0,0
3,0,0,32.0,4.7,17.433809,78.386744,17.523809,78.476744,38,0,1,2,0,0,0,3,1.0,0,1,0,0,0
4,0,0,21.0,4.6,21.173343,72.792731,21.203343,72.822731,3,0,1,5,1,2,0,1,1.0,0,0,0,0,0


In [3]:
# Read the target for the train and test splits.
# NOTE(review): the preview of 'Time_taken (min)' shows raw-looking minute
# values (e.g. 22, 23, 29), so the target does not appear to be log
# transformed -- confirm against the notebook that wrote these files.
y_train, y_test = (
    pd.read_csv(csv_path) for csv_path in ('ytrain.csv', 'ytest.csv')
)

# preview the first rows of the training target
y_train.head()

Unnamed: 0,Time_taken (min)
0,22
1,23
2,29
3,22
4,19


In [4]:
# Fit the model and select the features in a single step.
#
# Lasso applies an L1 penalty whose strength is set by alpha: the larger
# the alpha, the more coefficients are pushed to zero and the fewer
# features are kept.
#
# SelectFromModel wraps the estimator and, once fitted, keeps the features
# whose coefficients are (effectively) non-zero.
#
# random_state fixes the seed.
# NOTE(review): per the scikit-learn docs the seed is only used when Lasso
# is created with selection='random' -- confirm whether it matters here.
sel_ = SelectFromModel(
    estimator=Lasso(alpha=0.001, random_state=0),
)

# fit the Lasso on the training data and record which features it retains
sel_.fit(X_train, y_train)

In [5]:
# number of features retained by the selector (True entries in the mask)
np.sum(sel_.get_support())

19

In [6]:
# boolean mask over the columns of X_train: True = feature was selected
sel_.get_support(indices=False)

array([False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True])

In [7]:
# Summarise the outcome of the selection.

# boolean mask of the retained columns -> names of the selected features
support_mask = sel_.get_support()
selected_feats = X_train.columns[support_mask]

# figures to report
n_total_feats = X_train.shape[1]
n_selected_feats = len(selected_feats)
# counts the coefficients of the fitted Lasso that are exactly 0
n_zero_coefs = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(n_total_feats))
print('selected features: {}'.format(n_selected_feats))
print('features with coefficients shrank to zero: {}'.format(n_zero_coefs))

total features: 22
selected features: 19
features with coefficients shrank to zero: 3


In [8]:
# display the names of the features retained by the selector
selected_feats

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Restaurant_latitude',
       'Restaurant_longitude', 'Delivery_location_latitude',
       'Delivery_location_longitude', 'Order_Date', 'Time_Orderd',
       'Time_Order_picked', 'Weather conditions', 'Road_traffic_density',
       'Vehicle_condition', 'Type_of_order', 'Type_of_vehicle',
       'multiple_deliveries', 'Festival', 'City', 'Delivery_person_Age_na',
       'multiple_deliveries_na'],
      dtype='object')