In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the Pima dataset; the CSV's first column is used as the row index.
df = pd.read_csv(filepath_or_buffer='Pima.csv', index_col=0)

# Show the loaded frame as the cell output.
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
# Features are every column except the last; the last column ('class') is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [11]:
# Univariate selection: score every feature against the target with the
# chi-squared statistic and keep the five highest-scoring ones.
test = SelectKBest(chi2, k=5)

# fit() hands back the fitted selector itself, so `chi` and `test` are the same object.
chi = test.fit(X=x, y=y)

In [12]:
# Chi-squared score of each feature, in the same order as the columns of x.
chi.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [13]:
# Feature names, needed to label the unnamed score array shown above.
x.columns

Index(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age'], dtype='object')

In [14]:
# One-row table of chi-squared scores, with one column per feature name.
chidf = pd.DataFrame([chi.scores_], columns=x.columns)

In [15]:
# Display the chi-squared scores labelled by feature name.
chidf

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,111.519691,1411.887041,17.605373,53.10804,2175.565273,127.669343,5.392682,181.303689


# RFE

In [16]:
# Recursive feature elimination around a logistic-regression estimator,
# configured to keep five features.
# NOTE(review): x is passed in unscaled and warnings are silenced globally at the
# top of the notebook, so a solver convergence warning would go unnoticed here —
# confirm the ranking is stable (e.g. with standardised features or a higher max_iter).
lg = LogisticRegression()
rfe = RFE(estimator=lg, n_features_to_select=5)

# fit() returns the fitted selector, which is what this cell displays.
rfe.fit(X=x, y=y)

RFE(estimator=LogisticRegression(), n_features_to_select=5)

In [17]:
# Rank given to each feature (same order as x.columns); rank 1 marks the features RFE kept.
rfe.ranking_

array([1, 1, 2, 3, 4, 1, 1, 1])

In [18]:
# Boolean mask over the features (same order as x.columns): True where RFE kept the feature.
rfe.support_

array([ True,  True, False, False, False,  True,  True,  True])

In [19]:
# One-row table of RFE ranks, with one column per feature name.
rfe_df = pd.DataFrame([rfe.ranking_], columns=x.columns)

In [20]:
# Display the RFE ranks labelled by feature name.
rfe_df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,1,1,2,3,4,1,1,1


# Tree Based Approach

In [21]:
# Entropy-based decision tree, used here only for its feature importances.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently on each run, so the importances would not be reproducible.
# The seed (1) matches the one used for the KFold split in the cross-validation section.
dt = DecisionTreeClassifier(criterion='entropy', random_state=1)

In [22]:
# Train the tree on all features; the fitted estimator is the cell's displayed value.
dt.fit(X=x, y=y)

DecisionTreeClassifier(criterion='entropy')

In [23]:
# One-row table of the tree's feature importances, with one column per feature name.
dt_df = pd.DataFrame([dt.feature_importances_], columns=x.columns)

In [24]:
# Display the tree's feature importances labelled by feature name.
dt_df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,0.064835,0.239686,0.093859,0.035919,0.035431,0.255643,0.125806,0.148822


- univariate selection:
     - test, plas, age, mass
- RFE:
     - preg, plas, mass, pedi, age
- Tree:
     - plas, mass, pedi, age

# Cross Validation

In [25]:
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

In [26]:
# 10-fold cross-validation; rows are shuffled with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

# One score per fold for a freshly constructed logistic regression on all features.
score = cross_val_score(estimator=LogisticRegression(), X=x, y=y, cv=kf)

In [27]:
# Average the per-fold scores into a single 10-fold cross-validation estimate.
score.mean()

0.7720608339029391

In [28]:
# Leave one out

In [29]:
# Row and column count of the loaded frame; leave-one-out below uses one split per row.
df.shape

(768, 9)

In [30]:
# Leave-one-out cross-validation: each split holds out a single row for testing.
le = LeaveOneOut()

# One score per held-out row for a freshly constructed logistic regression.
le_score = cross_val_score(estimator=LogisticRegression(), X=x, y=y, cv=le)

In [31]:
# Per-row leave-one-out scores; the values shown are 1.0 or 0.0, one per held-out row.
# NOTE(review): this prints the whole array — consider showing a slice such as le_score[:10].
le_score

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [32]:
# Mean of the 0/1 per-row scores, i.e. the fraction of held-out rows scored 1.0.
le_score.mean()

0.7786458333333334