# Machine Learning Pipeline - Feature Selection (important: we will build a model with the features selected by Lasso, and use all features for random forest and XGB)

1. Data Analysis
2. Feature Engineering
3. **Feature Selection**
4. Model Training

In [17]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warning raised while the
# model is being fitted -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Show all the columns when a dataframe is displayed (None removes the limit).
# Use the public pd.set_option entry point rather than going through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [18]:
# Read the engineered, scaled feature matrices for the train and test splits
# (train file first, then test) and preview the first rows of the train set.
X_train, X_test = map(pd.read_csv, ('xtrain_scaled.csv', 'xtest_scaled.csv'))
X_train.head()

Unnamed: 0,order_amount,downpayment_amount,principal_amount,num_instalments_initial,test_ab_version,pm_bincode,pm_card_bank,pm_card_country_code,pm_card_level,pm_card_type,postcode,region_cp,province_cp,locality_cp,pm_is_prepaid,pm_has_bank,pm_is_national,ba_postcode,cust_phone_country,cust_phone_prefix,checkout_user_agent_browser_family,checkout_user_agent_browser_version,checkout_user_agent_device_family,checkout_user_agent_device_brand,checkout_user_agent_device_model,checkout_user_agent_os_family,checkout_user_agent_os_version,checkout_user_agent_is_pc,checkout_user_agent_is_mobile,checkout_user_agent_is_tablet,checkout_user_agent_is_touch_capable,n_ips,n_distinct_ips,total_price,device_screen,device_browser_version,device_browser_language,device_browser_type,device_cookies_enabled,real_ip_parent_organization,real_ip_city,real_ip_region,real_ip_isp,providerZ_score,ip_userType,ip_reputation,log_providerA_score,ip_address.traits.user_type,providerY_score,payment_method_card_type,bank_countrycode,creditcard_level,creditcard_type,birthday_year,n_item,pm_bincode_na,cust_phone_prefix_na,checkout_user_agent_is_pc_na,checkout_user_agent_is_mobile_na,checkout_user_agent_is_tablet_na,checkout_user_agent_is_touch_capable_na,device_cookies_enabled_na,providerZ_score_na,ip_reputation_na,log_providerA_score_na,providerY_score_na
0,0.931634,0.825557,0.927385,0.37931,0.0,0.549267,0.466667,0.5,0.111111,1.0,0.0,0.928571,0.740741,0.285714,0.0,1.0,1.0,1.0,1.0,0.0,0.636364,0.785714,0.4,0.0,0.4,0.5,0.75,0.0,1.0,0.0,1.0,0.102041,0.0,0.898705,0.8,0.666667,0.5,0.142857,1.0,0.55,0.625,0.714286,0.888889,0.0,0.666667,1.0,0.0,0.0,0.4,0.666667,0.5,0.285714,1.0,0.708333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.699385,0.574441,0.699721,0.551724,0.0,0.631802,0.866667,0.5,0.111111,1.0,0.0,0.857143,0.296296,0.285714,0.0,1.0,1.0,0.0,1.0,0.0,0.181818,0.928571,0.6,0.571429,0.6,0.0,0.75,1.0,0.0,0.0,0.0,0.040816,0.0,0.546072,0.25,0.962963,0.5,0.857143,1.0,0.55,0.625,0.047619,0.444444,0.110772,0.666667,1.0,0.0,0.0,0.6,0.666667,0.5,0.285714,1.0,0.902778,0.040816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.833097,0.659888,0.834303,0.965517,0.0,0.549273,0.466667,0.5,0.111111,1.0,0.0,0.857143,0.481481,0.285714,0.0,1.0,1.0,0.0,1.0,0.0,0.636364,0.214286,0.4,0.428571,0.4,0.5,0.75,0.0,1.0,0.0,1.0,0.081633,0.0,0.751263,0.8,0.62963,0.375,0.142857,1.0,0.55,0.625,0.380952,0.333333,0.496951,1.0,1.0,0.652363,0.5,0.6,0.666667,0.5,0.285714,1.0,0.472222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.713044,0.550666,0.715117,0.965517,0.0,0.822985,0.733333,0.5,0.888889,1.0,0.0,0.857143,0.851852,0.285714,1.0,1.0,1.0,0.0,1.0,0.0,0.636364,0.214286,0.4,0.714286,0.4,0.5,0.75,0.0,1.0,0.0,1.0,0.040816,0.0,0.562842,0.8,0.62963,0.5,0.142857,1.0,0.3,0.625,0.142857,0.333333,0.0,0.666667,1.0,0.0,0.0,0.4,1.0,0.5,0.571429,1.0,0.513889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.973092,0.871215,0.967814,0.344828,0.0,0.613849,0.8,0.0,0.0,0.0,0.0,0.428571,0.259259,0.285714,0.0,1.0,0.0,0.0,1.0,0.0,0.636364,0.142857,0.4,0.428571,0.4,0.5,0.75,0.0,1.0,0.0,1.0,0.183673,0.0,0.960739,0.15,0.962963,0.125,0.142857,1.0,0.55,0.625,0.666667,0.666667,0.218496,0.666667,1.0,0.63031,0.0,0.4,0.333333,0.0,0.142857,0.0,0.847222,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Read the target files for the train and test splits (train first, then
# test) and preview the first rows of the train target.
y_train, y_test = (
    pd.read_csv(target_csv) for target_csv in ('ytrain.csv', 'ytest.csv')
)
y_train.head()

Unnamed: 0,target
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


### Feature Selection
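Let's go ahead and select a subset of the most predictive features. We set `random_state` on the Lasso so that the run is reproducible; note that scikit-learn only uses this seed when the Lasso is fitted with `selection='random'` (the default `'cyclic'` coordinate descent is deterministic).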

In [20]:
# Wrap an L1-penalised linear regression (alpha=0.001) in a feature selector
# and fit it on the training data. random_state is pinned for reproducibility;
# in scikit-learn it only takes effect when Lasso uses selection='random'.
sel_ = SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))
sel_.fit(X=X_train, y=y_train)

In [23]:
# Boolean mask over the feature columns of the fitted selector:
# True marks a feature that was selected, False one that was not.

sel_.get_support(indices=False)

array([False, False, False,  True,  True, False,  True, False, False,
       False, False, False,  True, False,  True,  True, False, False,
       False, False,  True,  True, False,  True,  True, False,  True,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [22]:
# count the selected features (number of True entries in the support mask)
sel_.get_support().sum()

24

In [24]:
# names of the columns the fitted selector marked as selected
selected_feats = X_train.columns[sel_.get_support()]

# gather the summary numbers first, then report them
n_total_feats = X_train.shape[1]
n_selected_feats = len(selected_feats)
# coefficients of the fitted estimator that are exactly zero
n_zero_coefs = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(n_total_feats))
print('selected features: {}'.format(n_selected_feats))
print('features with coefficients shrank to zero: {}'.format(n_zero_coefs))

total features: 66
selected features: 24
features with coefficients shrank to zero: 42


In [25]:
# display the names of the selected features (the column Index built above)
selected_feats

Index(['num_instalments_initial', 'test_ab_version', 'pm_card_bank',
       'province_cp', 'pm_is_prepaid', 'pm_has_bank',
       'checkout_user_agent_browser_family',
       'checkout_user_agent_browser_version',
       'checkout_user_agent_device_brand', 'checkout_user_agent_device_model',
       'checkout_user_agent_os_version', 'device_screen',
       'device_browser_version', 'real_ip_parent_organization', 'real_ip_city',
       'real_ip_region', 'real_ip_isp', 'providerZ_score',
       'log_providerA_score', 'ip_address.traits.user_type', 'providerY_score',
       'payment_method_card_type', 'creditcard_level', 'creditcard_type'],
      dtype='object')

In [27]:
# Persist the selected feature names, one per row, without writing an index
# column, so that later steps can reload the same feature list.
selected_feats.to_series().to_csv('selected_features.csv', index=False)