In [1]:
import pandas as pd

from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df[df< 0].value_counts() # No Negative Values

Series([], Name: count, dtype: int64)

In [4]:
X = df.drop("Outcome", axis=1)
y = df['Outcome']

### Chi-Squared statistical test

In [5]:
selector = SelectKBest(score_func=chi2, k=4)

In [6]:
X_new = selector.fit_transform(X, y)
X_new

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

In [7]:
X_new.shape  # 4 Features

(768, 4)

In [8]:
print("Selected features indices:", selector.get_support(indices=True))
print("Selected features names:", selector.get_feature_names_out())

Selected features indices: [1 4 5 7]
Selected features names: ['Glucose' 'Insulin' 'BMI' 'Age']


##### 4 best features ['Glucose' 'Insulin' 'BMI' 'Age']

### Recursive Feature Elimination

In [9]:
estimator = LogisticRegression(max_iter=800)

In [10]:
selector = RFE(
    estimator=estimator,
    n_features_to_select=3,
)

In [11]:
selector = selector.fit(X, y)

In [12]:
X_new = selector.transform(X)
X_new

array([[ 6.   , 33.6  ,  0.627],
       [ 1.   , 26.6  ,  0.351],
       [ 8.   , 23.3  ,  0.672],
       ...,
       [ 5.   , 26.2  ,  0.245],
       [ 1.   , 30.1  ,  0.349],
       [ 1.   , 30.4  ,  0.315]])

In [13]:
X_new.shape  # 3 Features

(768, 3)

In [14]:
print("Feature ranking:", selector.ranking_)
print("Selected features:", selector.support_)
print("Selected features indices:", selector.get_support(indices=True))
print("Selected features names:", selector.get_feature_names_out())

Feature ranking: [1 2 4 6 5 1 1 3]
Selected features: [ True False False False False  True  True False]
Selected features indices: [0 5 6]
Selected features names: ['Pregnancies' 'BMI' 'DiabetesPedigreeFunction']


##### Top 3 features ['Pregnancies' 'BMI' 'DiabetesPedigreeFunction']