In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

In [14]:
# Read the imputed data set from the CSV next to the notebook (relative path,
# so the kernel's working directory must contain the file) and preview it.
mimic_complete = pd.read_csv(filepath_or_buffer="./impute_mimic.csv")
mimic_complete.head(n=10)

Unnamed: 0.1,Unnamed: 0,id,mort_28,age,sex,weight,height,pf_ratio,po2,pco2,...,fio2,hco3,heart_rate,minute_volume,peep,plateau_pressure,respiratory_rate,syst_blood_pressure,diastolic_blood_pressure,peep_regime
0,0,32128372,0.0,75.0,0.0,93.0,168.0,241.944444,130.666667,28.111111,...,50.0,12.0,115.408163,8.241111,8.777778,24.333333,18.943662,100.6,68.2,0.0
1,1,34100191,0.0,47.0,1.0,127.0,183.0,166.356061,113.076923,34.461538,...,68.5,12.75,115.407407,16.628571,7.375,17.5,19.857143,99.0,68.0,0.0
2,2,38292466,0.0,85.0,1.0,77.5,170.0,224.0,112.0,38.0,...,51.818182,23.0,70.041667,9.433333,5.0,22.333333,15.833333,102.0,46.333333,0.0
3,3,32743332,0.0,85.0,1.0,90.7,170.0,205.714286,72.0,38.0,...,36.875,27.0,70.48,7.478571,5.0,24.0,18.2,108.083333,46.208333,0.0
4,4,35009126,1.0,64.0,1.0,70.0,183.0,173.777778,94.666667,38.666667,...,52.5,21.666667,73.955556,7.05,5.428571,14.328571,15.016667,103.415094,57.377358,0.0
5,5,38740124,0.0,38.0,1.0,84.1,178.0,345.0,199.0,44.0,...,60.0,25.5,102.909091,7.725,5.75,18.275128,16.230769,126.181818,78.909091,0.0
6,6,32359580,1.0,69.0,1.0,86.2,185.0,354.72381,213.5,32.0,...,66.25,9.888889,93.5,10.5,8.333333,25.5,23.852941,91.0,56.363636,0.0
7,7,39880770,0.0,28.0,0.0,120.0,170.0,770.0,308.0,35.0,...,51.111111,25.0,95.193548,7.842857,5.625,15.0,19.526316,112.62069,51.206897,0.0
8,8,37153661,1.0,82.0,0.0,46.3,157.0,140.071429,109.0,36.0,...,60.0,17.5,96.090909,7.9625,6.25,19.0,21.292683,100.0,46.235294,0.0
9,9,33630048,0.0,70.0,0.0,63.7,157.0,310.0,126.0,29.666667,...,50.0,16.0,70.72,6.516667,5.0,16.5,16.451613,100.705882,54.470588,0.0


In [15]:
# Column names used throughout the notebook:
#   y - outcome column
#   T - treatment column
#   X - candidate feature columns (order matters: it fixes the row indices
#       shown in the score tables further down)
y = "mort_28"
T = "peep_regime"
X = [
    "age",
    "sex",
    "weight",
    "height",
    "pf_ratio",
    "po2",
    "pco2",
    "ph",
    "driving_pressure",
    "lung_compliance",
    "map",
    "bilirubin",
    "creatinine",
    "platelets",
    "urea",
    "fio2",
    "hco3",
    "heart_rate",
    "minute_volume",
    "peep",
    "plateau_pressure",
    "respiratory_rate",
    "syst_blood_pressure",
    "diastolic_blood_pressure",
]

In [16]:
# Hold out 10% of the rows as a test set; the remaining 90% form `train`.
# The split is seeded with a fixed random_state so that it is identical on
# every Restart & Run All; without a seed each run shuffles the rows
# differently and everything computed from `train` changes with it.
train, test = train_test_split(mimic_complete, test_size=0.1, random_state=42)
train[X].shape

(3546, 24)

### Correlation

In [17]:
# Rank the candidate features by how strongly they correlate with the outcome.
# DataFrame.corr() is called with its defaults (Pearson correlation).
correlations = train[[*X, y]].corr()

# Take the outcome's column of the matrix and drop its self-correlation entry.
correlation_with_target = correlations.loc[:, y].drop(labels=y)

# The sign is irrelevant for ranking, so sort by absolute value, strongest first.
sorted_correlation_with_target = (
    correlation_with_target
    .abs()
    .sort_values(ascending=False)
)
print(sorted_correlation_with_target)

age                         0.152554
urea                        0.128442
weight                      0.095732
hco3                        0.092184
respiratory_rate            0.080510
po2                         0.064724
creatinine                  0.064595
bilirubin                   0.060566
pco2                        0.058861
lung_compliance             0.058737
ph                          0.053391
height                      0.038754
minute_volume               0.032690
heart_rate                  0.026476
fio2                        0.024717
sex                         0.024282
diastolic_blood_pressure    0.021993
plateau_pressure            0.021573
platelets                   0.021555
map                         0.016772
driving_pressure            0.014916
pf_ratio                    0.011666
syst_blood_pressure         0.009317
peep                        0.003975
Name: mort_28, dtype: float64


In [18]:
# A larger absolute correlation with the outcome (y) is taken as a sign that
# the feature is more predictive.
# Keep the features whose absolute correlation is strictly above the threshold.
threshold = 0.06
above_threshold = sorted_correlation_with_target > threshold
selected_features_correlation = (
    sorted_correlation_with_target[above_threshold].index.tolist()
)
print(selected_features_correlation)

['age', 'urea', 'weight', 'hco3', 'respiratory_rate', 'po2', 'creatinine', 'bilirubin']


### Univariate feature selection

In [19]:
# Univariate feature selection: each feature is scored on its own against the
# outcome with f_classif (F-test). k='all' keeps every feature, so the selector
# is used here only as a way to obtain the per-feature scores.
selector = SelectKBest(score_func=f_classif, k='all').fit(train[X], train[y])

# One row per feature, highest score first. The original positional index is
# kept, so it shows each feature's position in X.
scores_univariate = (
    pd.DataFrame({'Feature': train[X].columns, 'Scores': selector.scores_})
    .sort_values(by='Scores', ascending=False)
)
print(scores_univariate)

                     Feature     Scores
0                        age  84.443339
14                      urea  59.447519
2                     weight  32.779576
16                      hco3  30.374696
21          respiratory_rate  23.121569
5                        po2  14.909193
12                creatinine  14.849424
11                 bilirubin  13.048181
6                       pco2  12.321206
9            lung_compliance  12.269456
7                         ph  10.131559
3                     height   5.330771
18             minute_volume   3.791267
17                heart_rate   2.486079
15                      fio2   2.166501
1                        sex   2.090827
23  diastolic_blood_pressure   1.715010
20          plateau_pressure   1.650136
13                 platelets   1.647412
10                       map   0.997224
8           driving_pressure   0.788618
4                   pf_ratio   0.482380
22       syst_blood_pressure   0.307665
19                      peep   0.055993


In [20]:
# Keep the features whose F-test score is strictly greater than the threshold.
threshold = 10
passes_f_test = scores_univariate['Scores'] > threshold
selected_features_univariate = scores_univariate.loc[passes_f_test, 'Feature'].tolist()
print(selected_features_univariate)

['age', 'urea', 'weight', 'hco3', 'respiratory_rate', 'po2', 'creatinine', 'bilirubin', 'pco2', 'lung_compliance', 'ph']


### Recursive feature elimination (with cross-validation)

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation (RFECV): starting from
# all features, repeatedly fit the estimator, drop the weakest feature
# (step=1), and use cross-validated accuracy to choose how many features to keep.

# Standardise the features before fitting the logistic regression.
# NOTE(review): the scaler is fit on the whole training set before the CV
# folds are formed, so each validation fold has contributed to the scaling
# statistics - consider wrapping scaler + estimator in a Pipeline.
scaler = StandardScaler()
X_train = scaler.fit_transform(train[X])

# Search configuration: allow elimination down to a single feature, score
# candidates with a default LogisticRegression, validate on 5 stratified folds.
min_features_to_select = 1
clf = LogisticRegression()
cv = StratifiedKFold(n_splits=5)

rfecv = RFECV(
    clf,
    step=1,
    min_features_to_select=min_features_to_select,
    cv=cv,
    scoring="accuracy",
    n_jobs=2,
)
rfecv.fit(X_train, train[y])

# X_train is a bare array, so map the boolean support mask back onto the
# feature names (the columns of train[X] are in the same order as X).
selected_features_recursive = [
    name for name, kept in zip(train[X].columns, rfecv.support_) if kept
]
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Selected features: {selected_features_recursive}")

Optimal number of features: 10
Selected features: ['age', 'weight', 'pf_ratio', 'po2', 'driving_pressure', 'lung_compliance', 'bilirubin', 'urea', 'fio2', 'minute_volume']


### Tree-based feature selection

In [22]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

# Tree-based feature ranking: fit an ensemble of 50 extremely randomized trees
# on the training features and read off each feature's importance.
# The forest is randomized, so it is seeded with a fixed random_state to make
# the importances identical on every Restart & Run All. Without a seed they
# differ between runs, and any cut-off applied to them selects a different set.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
clf = clf.fit(train[X], train[y])
importances = clf.feature_importances_

# One row per feature, most important first. The original positional index is
# kept, so it shows each feature's position in X.
feature_importances = pd.DataFrame({'Feature': train[X].columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

                     Feature  Importance
0                        age    0.056777
14                      urea    0.050576
2                     weight    0.045784
11                 bilirubin    0.045729
7                         ph    0.044195
17                heart_rate    0.043668
16                      hco3    0.043123
18             minute_volume    0.043111
12                creatinine    0.043021
13                 platelets    0.042491
15                      fio2    0.042342
6                       pco2    0.042009
5                        po2    0.041790
21          respiratory_rate    0.041652
22       syst_blood_pressure    0.041556
10                       map    0.041356
23  diastolic_blood_pressure    0.041178
4                   pf_ratio    0.041007
20          plateau_pressure    0.040420
9            lung_compliance    0.040350
8           driving_pressure    0.038567
3                     height    0.038061
19                      peep    0.035504
1               

In [23]:
# Keep the features whose tree-based importance is strictly above the threshold.
threshold = 0.042
is_important = feature_importances['Importance'] > threshold
selected_features_tree = feature_importances.loc[is_important, 'Feature'].tolist()
print(selected_features_tree)

['age', 'urea', 'weight', 'bilirubin', 'ph', 'heart_rate', 'hco3', 'minute_volume', 'creatinine', 'platelets', 'fio2', 'pco2']


### Combining different feature selectors

In [24]:
from collections import Counter

# Combine the four selectors by voting: pool every selected feature name, count
# how many selectors chose each one, and keep those chosen by more than one.
all_selected_features = [
    *selected_features_correlation,
    *selected_features_univariate,
    *selected_features_recursive,
    *selected_features_tree,
]
feature_votes = Counter(all_selected_features)

# Iterating the Counter yields names in first-seen order, so the result keeps
# the order in which features first appear in the pooled list.
selected_features_voting = [
    name for name in feature_votes if feature_votes[name] > 1
]
print(selected_features_voting)

['age', 'urea', 'weight', 'hco3', 'respiratory_rate', 'po2', 'creatinine', 'bilirubin', 'pco2', 'lung_compliance', 'ph', 'fio2', 'minute_volume']
