In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
# load the xs and y and data
data = pd.read_csv("cleaned_heart_data.csv")
X=data.drop(columns=['num'])
X_mm = np.load("X_mm.npy")
X_scaled = np.load("X_scaled.npy")
Y = np.load("y.npy")
#feature importance (random forest)
RF=RandomForestClassifier(random_state=42)
RF.fit(X,Y)
importances=pd.Series(RF.feature_importances_,index=X.columns).sort_values(ascending=False)
print(importances)

thalch                      0.154784
chol                        0.132067
age                         0.129143
oldpeak                     0.120599
cp_atypical angina          0.095059
trestbps                    0.092523
exang                       0.081038
cp_non-anginal              0.046338
sex_Male                    0.046044
slope_upsloping             0.019217
fbs                         0.018856
restecg_normal              0.018530
restecg_st-t abnormality    0.016467
slope_flat                  0.014789
cp_typical angina           0.014545
dtype: float64


In [3]:
#RFE
model=LogisticRegression(max_iter=2000)
rfe=RFE(model,n_features_to_select=5)
rfe.fit(X,Y)
selected_feature=X.columns[rfe.support_]
print(selected_feature)

Index(['exang', 'sex_Male', 'cp_atypical angina', 'cp_non-anginal',
       'cp_typical angina'],
      dtype='object')


In [4]:
#chi2
chi_scores,p_values=chi2(X_mm,Y)
for feature ,score ,p in zip(X.columns,chi_scores,p_values):
    print(f'{feature}: chi2={score:.2f},p-value={p:.4f}')

age: chi2=5.22,p-value=0.0223
trestbps: chi2=0.59,p-value=0.4444
chol: chi2=0.21,p-value=0.6439
fbs: chi2=9.13,p-value=0.0025
thalch: chi2=7.68,p-value=0.0056
exang: chi2=109.61,p-value=0.0000
oldpeak: chi2=26.97,p-value=0.0000
sex_Male: chi2=18.32,p-value=0.0000
cp_atypical angina: chi2=121.44,p-value=0.0000
cp_non-anginal: chi2=31.52,p-value=0.0000
cp_typical angina: chi2=2.61,p-value=0.1060
restecg_normal: chi2=3.21,p-value=0.0731
restecg_st-t abnormality: chi2=8.13,p-value=0.0044
slope_flat: chi2=2.52,p-value=0.1127
slope_upsloping: chi2=23.46,p-value=0.0000


In [7]:
# selecting the final features
# selection
top_rf=importances.sort_values(ascending=False).head(5).index.tolist()
RFE_selection=selected_feature.tolist()
chi2_selection=[feature for feature, p in zip(X.columns, p_values) if p < 0.05]

#final feature
final_feature=list(set(top_rf + RFE_selection + chi2_selection))
print(f'final feature are: {final_feature}')

X_selected=X[final_feature] # for modelling
#saving
np.save("X_selected.npy", X_selected)
np.save("final_feature.npy", final_feature)


final feature are: ['thalch', 'age', 'oldpeak', 'chol', 'sex_Male', 'cp_non-anginal', 'fbs', 'cp_typical angina', 'restecg_st-t abnormality', 'exang', 'slope_upsloping', 'cp_atypical angina']
