<a href="https://colab.research.google.com/github/Mansi06Salar/Coronary-Artery-Disease-Detection-using-AI/blob/main/Feature_Engineering_Hybrid_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the preprocessed dataset and swap every space in a column name for an
# underscore so the names are easier to reference downstream.
df = pd.read_csv("PreProcessed_Dataset_MinMax.csv").rename(
    columns=lambda name: name.replace(" ", "_")
)

In [None]:
# Separate the 'CAD' target column from the predictor columns.
y = df['CAD']
X = df.drop(['CAD'], axis=1)

In [None]:
# Report (rows, columns) of the predictor matrix before any feature is removed,
# as a baseline for the "After Feature Selection" shape printed later.
print(f"Before Feature Selection: {X.shape}")

Before Feature Selection: (303, 53)


In [None]:
# XGBoost Feature Importance
# Fit a seeded XGBoost classifier on all predictors, then tabulate the
# importance it assigns to each column (one row per feature).
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss').fit(X, y)
xgb_importance = pd.DataFrame(
    {
        "Feature": X.columns,
        "XGB_Importance": xgb_model.feature_importances_,
    }
)

In [None]:
# Recursive Feature Elimination (RFE) with RandomForest
# A seeded 100-tree random forest drives the elimination, which keeps
# 15 features; the boolean support mask is mapped back to column names.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rfe = RFE(estimator=rf_model, n_features_to_select=15).fit(X, y)
rfe_features = X.columns[rfe.get_support()]

In [None]:
# Mutual Information Scores
# The estimator is stochastic unless `random_state` is given, so seed it with
# the same value used for the XGBoost and RandomForest models. Without the
# seed the scores -- and therefore the hybrid top-15 -- can differ between runs.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_importance = pd.DataFrame({"Feature": X.columns, "MI_Score": mi_scores})

In [None]:
# Hybrid Feature Selection
# Join the XGBoost importances and mutual-information scores per feature and
# blend them into one ranking score.
feature_scores = xgb_importance.merge(mi_importance, on="Feature")
# NOTE(review): the weights (0.5 / 0.3) do not sum to 1. Only the ranking is
# used below, so the overall scale is irrelevant, but the two inputs are not
# normalised to a common range -- confirm this blend is intended.
feature_scores["Final_Score"] = (feature_scores["XGB_Importance"] * 0.5 + feature_scores["MI_Score"] * 0.3)
top_features = feature_scores.nlargest(15, "Final_Score")["Feature"].tolist()
# Union of the hybrid top-15 and the RFE picks. dict.fromkeys de-duplicates
# while preserving first-seen order (hybrid ranking first, then RFE-only
# features); list(set(...)) produced an order that varies between kernel
# restarts because string hashing is randomised per process, which made the
# saved CSV's column order non-reproducible.
final_features = list(dict.fromkeys(top_features + list(rfe_features)))

In [None]:
# Build the reduced frame from the selected columns. The explicit copy makes
# it independent of X, so adding the target column cannot trigger a
# SettingWithCopyWarning.
X_reduced = X.loc[:, final_features].copy()
X_reduced['CAD'] = y
# Persist the selected features together with the target, without the index.
X_reduced.to_csv("Reduced_CAD_Hybrid.csv", index=False)

In [None]:
# Summarise the result: the shape of the saved frame (selected features plus
# the CAD column) and the number of selected features.
print(
    f"After Feature Selection: {X_reduced.shape}",
    f"Hybrid Feature Selection Done. Final {len(final_features)} features saved.",
    sep="\n",
)

After Feature Selection: (303, 22)
Hybrid Feature Selection Done. Final 21 features saved.


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# ---------------------------------------------------------------------------
# Hybrid feature selection with a multicollinearity pre-filter.
#
# Pipeline: load data -> drop one feature from each highly correlated pair ->
# score the remaining features with XGBoost importance and mutual information
# -> take the top-N by blended score -> union with the RFE (RandomForest)
# picks -> save the selected features plus the 'CAD' target.
#
# NOTE(review): every selector here is fitted on all rows. If a train/test
# split is made downstream, the selection has already seen the test rows --
# confirm this is acceptable for the evaluation protocol.
# ---------------------------------------------------------------------------

# Tunable settings, gathered in one place.
SEED = 42                 # shared seed for every stochastic step
CORR_THRESHOLD = 0.9      # |correlation| above which a pair counts as redundant
N_SELECT = 15             # features kept by RFE and by the hybrid ranking
XGB_WEIGHT = 0.5          # weight of XGBoost importance in the blended score
MI_WEIGHT = 0.3           # weight of mutual information in the blended score

df = pd.read_csv("PreProcessed_Dataset_MinMax.csv")

# Replace spaces with underscores in column names
df.columns = [col.replace(" ", "_") for col in df.columns]

# Separate the target variable 'CAD' from the features
X = df.drop(columns=['CAD'])
y = df['CAD']

print(f"Before Feature Selection: {X.shape}")

# Correlation Check
corr_matrix = X.corr()

# Keep only the strict upper triangle so each unordered pair appears once and
# the diagonal (self-correlation) is excluded.
high_corr_mask = np.triu(corr_matrix.abs().to_numpy() > CORR_THRESHOLD, k=1)

# Create a list of pairs of highly correlated features
high_corr_pairs = [
    (X.columns[i], X.columns[j]) for i, j in zip(*np.nonzero(high_corr_mask))
]

print(f"Highly correlated feature pairs (> {CORR_THRESHOLD}):")
for pair in high_corr_pairs:
    print(pair)

# Drop one feature from each highly correlated pair to avoid multicollinearity.
# A pair is skipped when either member has already been dropped: the pair is
# then already resolved, and dropping the second member as well would discard
# a feature unnecessarily (e.g. A~B and B~C should drop only B, not B and C).
features_to_drop = []
for first, second in high_corr_pairs:
    if first in features_to_drop or second in features_to_drop:
        continue
    features_to_drop.append(second)
X = X.drop(columns=features_to_drop)

print(f"After Correlation Removal: {X.shape}")

# XGBoost Feature Importance
xgb_model = XGBClassifier(eval_metric='logloss', random_state=SEED)
xgb_model.fit(X, y)
xgb_importance = pd.DataFrame({"Feature": X.columns, "XGB_Importance": xgb_model.feature_importances_})

# Recursive Feature Elimination (RFE) with RandomForest
rf_model = RandomForestClassifier(n_estimators=100, random_state=SEED)
rfe = RFE(rf_model, n_features_to_select=N_SELECT)
rfe.fit(X, y)
rfe_features = X.columns[rfe.support_]

# Mutual Information Scores
# Seeded: the estimator is stochastic without random_state, which would make
# the scores (and the final selection) vary from run to run.
mi_scores = mutual_info_classif(X, y, random_state=SEED)
mi_importance = pd.DataFrame({"Feature": X.columns, "MI_Score": mi_scores})

# Hybrid Feature Selection
feature_scores = xgb_importance.merge(mi_importance, on="Feature")
# NOTE(review): the weights do not sum to 1. Only the ranking is used, so the
# overall scale is irrelevant, but the two inputs are not normalised to a
# common range -- confirm this blend is intended.
feature_scores["Final_Score"] = (
    feature_scores["XGB_Importance"] * XGB_WEIGHT
    + feature_scores["MI_Score"] * MI_WEIGHT
)

# Select the top features based on hybrid score
top_features = feature_scores.nlargest(N_SELECT, "Final_Score")["Feature"].tolist()

# Combine the hybrid top features with the RFE picks. dict.fromkeys removes
# duplicates while keeping first-seen order; list(set(...)) gave an order that
# changes between kernel restarts (string hashing is randomised per process),
# so the saved CSV's column order was not reproducible.
final_features = list(dict.fromkeys(top_features + list(rfe_features)))

# Final reduced dataset
X_reduced = X[final_features].copy()
X_reduced.loc[:, 'CAD'] = y  # write into the explicit copy; avoids SettingWithCopyWarning

# Save the reduced dataset
# NOTE: this is the same path the earlier save cell writes to, so running this
# cell overwrites that file.
X_reduced.to_csv("Reduced_CAD_Hybrid.csv", index=False)

print(f"After Feature Selection: {X_reduced.shape}")
print(f"Hybrid Feature Selection Done. Final {len(final_features)} features saved.")


Before Feature Selection: (303, 53)
Highly correlated feature pairs (> 0.9):
('Lymph', 'Neut')
After Correlation Removal: (303, 52)
After Feature Selection: (303, 23)
Hybrid Feature Selection Done. Final 22 features saved.
