In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

%matplotlib inline

In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap).
# Uses the public pd.set_option entry point rather than the undocumented
# `pd.pandas` self-reference.
pd.set_option("display.max_columns", None)

In [3]:
# Directory holding xtrain.csv / xtest.csv. Defaults to the original author's
# location; set the DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(
    os.environ.get(
        "DATA_DIR",
        "/Users/bajajn2/Nakul/Projects/deployMachineLearningModels",
    )
)

# Both frames are loaded with 'SalePrice' and 'Id' still present; the next
# cell separates the target and drops those columns.
X_train = pd.read_csv(DATA_DIR / "xtrain.csv")
X_test = pd.read_csv(DATA_DIR / "xtest.csv")

In [4]:
# Pull the target out of each frame before removing it from the predictors.
Y_train = X_train['SalePrice']
Y_test = X_test['SalePrice']

# Keep only predictor columns. `columns=` replaces the positional axis
# argument (which pandas 2.x rejects), and reassigning the result avoids
# in-place mutation of the loaded frames.
X_train = X_train.drop(columns=['SalePrice', 'Id'])
X_test = X_test.drop(columns=['SalePrice', 'Id'])

In [5]:
# Display the training target ('SalePrice') taken from X_train in the previous cell.
Y_train

0       12.211060
1       11.887931
2       12.675764
3       12.278393
4       12.103486
          ...    
1309    12.727838
1310    11.759786
1311    11.626254
1312    12.363076
1313    12.305918
Name: SalePrice, Length: 1314, dtype: float64

# Feature Selection

In [6]:
# Seed passed as random_state to the Lasso estimators created below.
seed = 0

In [7]:
# Stand-alone Lasso regressor; fitted below so its coefficients can be
# inspected directly.
model = Lasso(
    alpha=0.005,
    random_state=seed,
)

In [8]:
# Fit on the training predictors and target; the fitted estimator is the
# cell's displayed value.
model.fit(X=X_train, y=Y_train)

Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
# Column names whose fitted Lasso coefficient is non-zero (boolean mask over
# model.coef_ applied to the column index).
X_train.columns[model.coef_ != 0]

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'PavedDrive'],
      dtype='object')

In [10]:
# Selector wrapping a new, unfitted Lasso built with the same alpha and
# random_state as `model`; SelectFromModel fits this estimator itself.
sel = SelectFromModel(
    estimator=Lasso(alpha=0.005, random_state=seed),
)

In [11]:
# Fit the selector (and its wrapped Lasso) on the training data; the fitted
# selector is the cell's displayed value.
sel.fit(X=X_train, y=Y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [12]:
# Boolean mask, one entry per column of X_train: True where the selector keeps the feature.
sel.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False, False,  True, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [13]:
# Names of the columns the fitted selector keeps.
selected_feats = X_train.columns[sel.get_support()]

# Summary counts: all predictors, those kept by the selector, and those whose
# coefficient in the selector's fitted Lasso is exactly zero.
print('\n'.join([
    'total features: {}'.format(X_train.shape[1]),
    'selected features: {}'.format(selected_feats.size),
    'features with coefficients shrank to zero: {}'.format(
        (sel.estimator_.coef_ == 0).sum()
    ),
]))

total features: 82
selected features: 22
features with coefficients shrank to zero: 60


In [14]:
# Plain Python list of the column names with a non-zero coefficient in `model`.
# The Index is already one-dimensional, so it is converted directly; the
# previous intermediate .ravel() added nothing.
X_train.columns[model.coef_ != 0].tolist()

['MSSubClass',
 'MSZoning',
 'Neighborhood',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'RoofStyle',
 'MasVnrType',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenQual',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'PavedDrive']

In [15]:
# Second route to the feature list: keep the columns whose coefficient in the
# selector's fitted Lasso is non-zero. This rebinds `selected_feats`, which is
# the value written to disk in the final cell.
# NOTE(review): SelectFromModel documents a 1e-5 default threshold for
# L1-penalised estimators, so this mask could in principle differ from
# sel.get_support() — confirm the two agree before relying on either.
selected_feats = X_train.columns[sel.estimator_.coef_ != 0]

selected_feats

Index(['MSSubClass', 'MSZoning', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
       'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageCars', 'PavedDrive'],
      dtype='object')

In [16]:
# Persist the selected feature names as a one-column CSV (no index column).
# The directory defaults to the original author's location; set the DATA_DIR
# environment variable to write somewhere else.
# NOTE(review): Series.to_csv's default for `header` has changed between pandas
# releases — pass header= explicitly once the reader's expectation is confirmed.
features_path = Path(
    os.environ.get(
        "DATA_DIR",
        "/Users/bajajn2/Nakul/Projects/deployMachineLearningModels",
    )
) / "selected_features.csv"
pd.Series(selected_feats).to_csv(features_path, index=False)