### Installing mlxtend library

In [1]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.3-py3-none-any.whl (1.4 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.3


You should consider upgrading via the 'C:\Users\julien\workspace\building-first-scikit-learn-solution\venv\Scripts\python.exe -m pip install --upgrade pip' command.


### Importing libraries

In [2]:
import pandas as pd
import numpy as np

In [3]:
diabetes_data = pd.read_csv('../datasets/diabetes.csv')

diabetes_data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [4]:
X = diabetes_data.drop('Outcome', axis=1)

Y = diabetes_data['Outcome']

### Recursive Feature Elimination
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

In [5]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [6]:
model = LogisticRegression(solver='liblinear')

rfe = RFE(model, n_features_to_select=4)

In [7]:
fit = rfe.fit(X, Y)

In [8]:
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

Num Features:  4
Selected Features:  [ True  True False False False  True  True False]
Feature Ranking:  [1 1 2 4 5 1 1 3]


In [9]:
feature_rank = pd.DataFrame({'columns': X.columns, 
                             'ranking': fit.ranking_, 
                             'selected': fit.support_})

feature_rank

Unnamed: 0,columns,ranking,selected
0,Pregnancies,1,True
1,Glucose,1,True
2,BloodPressure,2,False
3,SkinThickness,4,False
4,Insulin,5,False
5,BMI,1,True
6,DiabetesPedigreeFunction,1,True
7,Age,3,False


In [10]:
recursive_feature_names = feature_rank.loc[feature_rank['selected'] == True]

recursive_feature_names

Unnamed: 0,columns,ranking,selected
0,Pregnancies,1,True
1,Glucose,1,True
5,BMI,1,True
6,DiabetesPedigreeFunction,1,True


In [11]:
X[recursive_feature_names['columns'].values].head()

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction
0,6,148,33.6,0.627
1,1,85,26.6,0.351
2,8,183,23.3,0.672
3,1,89,28.1,0.167
4,0,137,43.1,2.288


In [12]:
recursive_features = X[recursive_feature_names['columns'].values]

### Forward Elimination

#### SequentialFeatureSelector
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/

In [13]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [14]:
feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10),
                                             k_features=4,
                                             forward=True,
                                             scoring='accuracy',
                                             cv=4)

features = feature_selector.fit(np.array(X), Y)

In [15]:
forward_elimination_feature_names = list(X.columns[list(features.k_feature_idx_)])

forward_elimination_feature_names

['Pregnancies', 'Glucose', 'Insulin', 'BMI']

In [16]:
forward_elimination_features = X[forward_elimination_feature_names]

### Backward Elimination 

In [17]:
feature_selector = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10),
                                             k_features=4,
                                             forward=False,
                                             scoring='accuracy',
                                             cv=4)

features = feature_selector.fit(np.array(X), Y)

In [18]:
back_elimination_feature_names = list(X.columns[list(features.k_feature_idx_)])

back_elimination_feature_names 

['Pregnancies', 'Glucose', 'Insulin', 'BMI']

In [19]:
back_elimination_features = X[back_elimination_feature_names]

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score

In [21]:
def build_model(X, Y, test_frac):
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
    
    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    print("Test_score : ", accuracy_score(y_test, y_pred))

In [22]:
build_model(X, Y, 0.2)

Test_score :  0.8116883116883117


In [23]:
build_model(recursive_features, Y, 0.2)

Test_score :  0.7597402597402597


In [24]:
build_model(forward_elimination_features, Y, 0.2)

Test_score :  0.7012987012987013


In [25]:
build_model(back_elimination_features, Y, 0.2)

Test_score :  0.7597402597402597
