In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset paths used below
# all live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Read the cleaned heart-disease CSV from the mounted Drive folder.
cleaned_csv_path = "/content/drive/MyDrive/Heart_Disease_Project/data/heart_disease_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

# Separate the label column ("target") from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])


# Random Forest feature importance
# Fit a forest with a fixed seed, then rank every column by the score the
# fitted model exposes in `feature_importances_`.
rf = RandomForestClassifier(random_state=42).fit(X, y)

importances = pd.Series(data=rf.feature_importances_, index=X.columns)
ranked_by_rf = importances.sort_values(ascending=False)

# Keep the names of the 8 highest-scoring columns.
top_rf = ranked_by_rf.index[:8].tolist()
print("Top features by RF:", top_rf)


# RFE (Logistic Regression)
# Recursive feature elimination wrapped around a logistic-regression model,
# configured to stop once 8 columns remain.
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=log_reg, n_features_to_select=8).fit(X, y)

# `support_` is a boolean mask over X's columns marking the survivors.
rfe_mask = rfe.support_
top_rfe = X.columns[rfe_mask].tolist()
print("Top features by RFE:", top_rfe)

# Chi-Square (requires non-negative data)
# Min-max scale a copy of the predictors first so the chi2 scorer is never
# handed a negative value; X itself is left untouched.
scaler_mm = MinMaxScaler()
X_mm = pd.DataFrame(scaler_mm.fit_transform(X), columns=X.columns)

# Score every scaled column against the label and keep the best 8.
chi2_selector = SelectKBest(score_func=chi2, k=8).fit(X_mm, y)

chi2_mask = chi2_selector.get_support()
top_chi2 = X.columns[chi2_mask].tolist()
print("Top features by Chi-Square:", top_chi2)

# Combine selected features
# Take the union of the three top-8 lists. dict.fromkeys() removes duplicates
# while keeping first-seen order, so the resulting list (and therefore the
# column order of the saved CSV) is identical on every run. A plain set() was
# used here before, but the iteration order of a set of strings can change
# between kernel sessions because string hashing is randomized per process.
final_features = list(dict.fromkeys(top_rf + top_rfe + top_chi2))
print("\n Final selected features:", final_features)

# create reduced dataset: the selected predictors followed by the label column
df_reduced = df[final_features + ["target"]]

# save reduced dataset (index=False keeps the row index out of the file)
df_reduced.to_csv("/content/drive/MyDrive/Heart_Disease_Project/data/heart_disease_selected.csv", index=False)
print("Reduced dataset saved:", df_reduced.shape)


Top features by RF: ['ca', 'thalach', 'oldpeak', 'cp_4.0', 'thal_7.0', 'age', 'chol', 'trestbps']
Top features by RFE: ['sex', 'exang', 'oldpeak', 'ca', 'cp_2.0', 'cp_4.0', 'slope_2.0', 'thal_7.0']
Top features by Chi-Square: ['exang', 'oldpeak', 'ca', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'slope_2.0', 'thal_7.0']

 Final selected features: ['chol', 'exang', 'thalach', 'cp_4.0', 'trestbps', 'age', 'cp_3.0', 'ca', 'oldpeak', 'slope_2.0', 'thal_7.0', 'sex', 'cp_2.0']
Reduced dataset saved: (297, 14)
