In [1]:
# to handle the datasets
import pandas as pd
from pandas import DataFrame
import numpy as np
# show every column when a DataFrame is displayed (None = no column limit);
# call the documented top-level pd.set_option rather than going through pd.pandas
pd.set_option('display.max_columns', None)

# to plot graphs
import matplotlib.pyplot as plt
import seaborn as sns

# to build models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [2]:
# read the train and test sets from CSV files in the working directory
X_train, X_test = (
    pd.read_csv(csv_name) for csv_name in ('xtrain.csv', 'xtest.csv')
)

In [3]:
# preview the first rows of the training set (the target 'points' is still a column here)
X_train.head()

Unnamed: 0,points,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,playerName,oppositionTeam
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,10.6,4.3,0.0,1.5,0.0,60330.0,5.0,11,14
1,2.0,83.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.2,0.1,0.0,0.1,0.0,5393.0,5.5,197,5
2,0.0,90.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-44155.0,347655.0,5.5,291,0
3,2.0,84.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,15.3,6.0,2.3,0.0,132947.0,8.0,158,7
4,4.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2064.0,5.0,72,18


In [4]:
# separate the target ('points') from the predictors.
# DataFrame.pop returns the column and removes it from the frame in place,
# so after this cell X_train / X_test hold only the features.
# NOTE: like the in-place drop it replaces, this cell cannot be re-run
# without reloading the data first (the 'points' column is gone after one run).
y_train = X_train.pop('points')
y_test = X_test.pop('points')

## Feature selection
We fit the model and select the features together, in a few lines of code.

In [5]:
# First we specify the Lasso regression model, then we choose a suitable
# alpha (the strength of the penalty): the bigger the alpha, the fewer
# features will be selected.
#
# SelectFromModel wraps the estimator and is used here to keep the features
# whose fitted coefficients are non-zero.
# NOTE(review): the head() output above suggests the features are not scaled,
# and playerName / oppositionTeam look like integer codes -- confirm that
# feeding them to a penalised linear model as-is is intended.
sel_ = SelectFromModel(Lasso(alpha=0.005, random_state=0)).fit(X_train, y_train)

# boolean mask over the columns of X_train: True marks a selected feature
sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True, False, False,
       False,  True,  True,  True])

In [6]:
# column names of the features the selector kept
support_mask = sel_.get_support()
selected_feats = X_train.columns[support_mask]

# summary counts
n_total = X_train.shape[1]
n_selected = len(selected_feats)
# number of coefficients that are exactly 0 in the fitted Lasso.
# NOTE(review): in the recorded output 22 - 17 = 5 features were discarded
# but only 3 coefficients are exactly zero -- presumably the selector also
# drops coefficients below a small threshold; confirm against the
# SelectFromModel documentation.
n_zero_coefs = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(n_total))
print('selected features: {}'.format(n_selected))
print('features with coefficients shrank to zero: {}'.format(n_zero_coefs))

total features: 22
selected features: 17
features with coefficients shrank to zero: 3


In [7]:
# display the names of the selected features (a pandas Index of column labels)
selected_feats

Index(['minsPlayed', 'goalsScored', 'assists', 'cleanSheets', 'goalsConceded',
       'ownGoals', 'penSaved', 'yelCards', 'redCards', 'saves', 'bonus',
       'influence', 'creativity', 'threat', 'costGBP', 'playerName',
       'oppositionTeam'],
      dtype='object')

In [8]:
# show the full feature frame again (target already removed): any column listed
# here that is absent from selected_feats was discarded by the selector
X_train.head()

Unnamed: 0,minsPlayed,goalsScored,assists,cleanSheets,goalsConceded,ownGoals,penSaved,penMissed,yelCards,redCards,saves,bonus,bonusPointSystem,influence,creativity,threat,ictIndex,netTransfers,selectedBy,costGBP,playerName,oppositionTeam
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,10.6,4.3,0.0,1.5,0.0,60330.0,5.0,11,14
1,83.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.2,0.1,0.0,0.1,0.0,5393.0,5.5,197,5
2,90.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-44155.0,347655.0,5.5,291,0
3,84.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,15.3,6.0,2.3,0.0,132947.0,8.0,158,7
4,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2064.0,5.0,72,18


In [9]:
# persist the selected feature names, one per row, without the index column
# NOTE(review): whether a header row is written depends on the pandas
# version's default for Series.to_csv -- confirm the code that reads this
# file back handles it.
features_to_save = pd.Series(selected_feats)
features_to_save.to_csv('selected_features.csv', index=False)