## Feature selection

### Use the diabetes dataset

In [1]:
import pandas as pd

In [2]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [3]:
df_diabetes = pd.read_csv('diabetes.csv', names = names)

In [4]:
df_diabetes.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
X = df_diabetes[['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']]

In [6]:
Y = df_diabetes['class']

## 1) Remove features with low variance

In [32]:
from sklearn.feature_selection import VarianceThreshold

In [33]:
var = VarianceThreshold(threshold=0.3)
var = var.fit(X,Y)

In [37]:
cols = var.get_support(indices=True)
cols

array([0, 1, 2, 3, 4, 5, 7], dtype=int64)

In [38]:
features = df_diabetes.columns[cols]
features

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'age'], dtype='object')

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## KBest Fit

In [9]:
BestFeatures = SelectKBest(score_func = chi2, k = 4)

In [10]:
KBestFit = BestFeatures.fit(X,Y)

### Scores and p values for all features

In [11]:
print(KBestFit.scores_)

[ 111.51969064 1411.88704064   17.60537322   53.10803984 2175.56527292
  127.66934333    5.39268155  181.30368904]


In [12]:
print(KBestFit.pvalues_)

[4.55261043e-026 5.48728628e-309 2.71819252e-005 3.15697650e-013
 0.00000000e+000 1.32590849e-029 2.02213728e-002 2.51638830e-041]


### Get the best fit feature Matrix

In [13]:
features = KBestFit.transform(X)

In [14]:
features

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

### Get the names of the best fit features

In [15]:
cols = KBestFit.get_support(indices=True)

In [16]:
cols

array([1, 4, 5, 7], dtype=int64)

In [17]:
col_names = df_diabetes.columns[cols]

In [18]:
df_KBestFit = df_diabetes[col_names]

In [19]:
df_KBestFit.head()

Unnamed: 0,plas,test,mass,age
0,148,0,33.6,50
1,85,0,26.6,31
2,183,0,23.3,32
3,89,94,28.1,21
4,137,168,43.1,33


## Recursive Feature Elimination

In [20]:
from sklearn.feature_selection import RFE

In [21]:
estimator = xgboost.XGBClassifier()

NameError: name 'xgboost' is not defined

In [None]:
rfe = RFE(estimator=estimator, n_features_to_select=4, step=1)

In [None]:
RFeatures = rfe.fit(X, Y)

In [None]:
rfe.ranking_

In [None]:
cols = RFeatures.get_support(indices=True)

In [None]:
col_names = df_diabetes.columns[cols]

In [None]:
df_RFE = df_diabetes[col_names]

In [None]:
df_RFE.head()

## Recursive Feature Elimination with Cross Validation

### With XGBoost

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
estimator = xgboost.XGBClassifier()

In [None]:
rfecv = RFECV(estimator, cv=5, n_jobs=-1, scoring='accuracy', min_features_to_select = 4)
rfecv = rfecv.fit(X, Y)

In [None]:
rfecv.ranking_

In [None]:
rfecv.n_features_

In [None]:
rfecv.grid_scores_
# These are for the 5 cross validation grids

In [None]:
## Best features
cols = rfecv.get_support(indices=True)

In [None]:
cols

In [None]:
col_names = df_diabetes.columns[cols]

In [None]:
df_RFECV = df_diabetes[col_names]

In [None]:
df_RFECV.head()

### With Random Forests

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
estimator = RandomForestClassifier()

In [None]:
rfecv = RFECV(estimator, cv=5, n_jobs=-1, scoring='accuracy', min_features_to_select = 4)
rfecv = rfecv.fit(X, Y)

In [None]:
rfecv.ranking_

In [None]:
rfecv.n_features_

In [None]:
## Best features
cols = rfecv.get_support(indices=True)

In [None]:
col_names = df_diabetes.columns[cols]

In [None]:
df_RFECV = df_diabetes[col_names]

In [None]:
df_RFECV.head()

##### Probably XGBoost is not the best estimator for RFE or XGBoost here doesn't require feature elimination

## Prinicipal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=3)

In [None]:
fit = pca.fit(X)

In [None]:
fit.components_

In [None]:
feature_matrix = pca.fit_transform(X)

In [None]:
pd.DataFrame(feature_matrix).head()

##  Tree-based feature selection: Feature Importance With ExtraTreesClassifier (Note Bagged Decision Trees could also be used)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
estimator = ExtraTreesClassifier(n_estimators=10)
estimator = estimator.fit(X, Y)

In [None]:
estimator.feature_importances_

In [None]:
model = SelectFromModel(estimator, prefit=True)

In [None]:
X_new = model.transform(X)

In [None]:
X_new

In [None]:
X_new.shape 

In [None]:
cols = model.get_support(indices = True)

In [None]:
cols

In [None]:
col_names = df_diabetes.columns[cols]

In [None]:
features_selected = col_names

In [None]:
features_selected 

In [None]:
df_features_selected = df_diabetes[col_names]
df_features_selected.head()

In [None]:
df_feature_matrix = pd.DataFrame(X_new) 

In [None]:
df_feature_matrix.head()

# Grid Search

### Randomised Grid Search

### Random Forests

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 20, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
# Create the random grid
# Note: The Grid is only used for Grid Search, for Randomized Search CV we use parameter distribution instead. 
# Here, the parameter grid could also be used as specifying the grid distribution 
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions= random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

In [None]:
rf_random.best_score_

In [None]:
rf_random.best_params_

### By default, the Randomized Grid Search returns the best scoring model, which we could use for predictions

In [None]:
estimator = rf_random

### Testing accuracy

In [None]:
estimator.score(x_test, y_test)