<a href="https://colab.research.google.com/github/Mansi06Salar/Coronary-Artery-Disease-Detection-using-AI/blob/main/CAD_using_AI_Pre_processing_Dataset_(Min_Max_scaling)%2C_Base_models%2C_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
```

**Basic Pre-processing**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [None]:
df = pd.read_csv("CAD.csv")

print("Initial Dataset Shape:", df.shape)
print("Missing Values Before Processing:\n", df.isnull().sum())

Initial Dataset Shape: (303, 55)
Missing Values Before Processing:
 Age                      0
Weight                   0
Length                   0
Sex                      0
BMI                      0
DM                       0
HTN                      0
Current Smoker           0
EX-Smoker                0
FH                       0
Obesity                  0
CRF                      0
CVA                      0
Airway disease           0
Thyroid Disease          0
CHF                      0
DLP                      0
BP                       0
PR                       0
Edema                    0
Weak Peripheral Pulse    0
Lung rales               0
Systolic Murmur          0
Diastolic Murmur         0
Typical Chest Pain       0
Dyspnea                  0
Function Class           0
Atypical                 0
Nonanginal               0
Exertional CP            0
LowTH Ang                0
Q Wave                   0
St Elevation             0
St Depression            0
Tinversion    

In [None]:
df.drop(columns=["Exertional CP"], inplace=True)
print("Dropped column: Exertional CP")

df.rename(columns={"Length": "Height", "Cath": "CAD"}, inplace=True)
print("Renamed columns: Length → Height, Cath → CAD")

Dropped column: Exertional CP
Renamed columns: Length → Height, Cath → CAD


In [None]:
df['Sex'] = df['Sex'].replace({'Fmale': 'Female'})
df['Sex'] = df['Sex'].map({'Male': 1, 'Female': 0})

df['CAD'] = df['CAD'].map({'Cad': 1, 'Normal': 0})
print("Encoded Target Variable CAD")

Encoded Target Variable CAD


In [None]:
binary_columns = [
    'Obesity', 'CRF', 'CVA', 'Airway disease', 'Thyroid Disease',
    'CHF', 'DLP', 'Weak Peripheral Pulse', 'Lung rales', 'Systolic Murmur', 'Diastolic Murmur',
    'Dyspnea', 'Atypical', 'Nonanginal', 'LowTH Ang', 'LVH', 'Poor R Progression'
]

In [None]:
label_encoder = LabelEncoder()
for col in binary_columns:
    df[col] = label_encoder.fit_transform(df[col])
    print(f"Label Encoded Column: {col}")

df['VHD'] = df['VHD'].map({'N': 0, 'mild': 1, 'Moderate': 2, 'Severe': 3})
print("Applied Ordinal Encoding to VHD")

Label Encoded Column: Obesity
Label Encoded Column: CRF
Label Encoded Column: CVA
Label Encoded Column: Airway disease
Label Encoded Column: Thyroid Disease
Label Encoded Column: CHF
Label Encoded Column: DLP
Label Encoded Column: Weak Peripheral Pulse
Label Encoded Column: Lung rales
Label Encoded Column: Systolic Murmur
Label Encoded Column: Diastolic Murmur
Label Encoded Column: Dyspnea
Label Encoded Column: Atypical
Label Encoded Column: Nonanginal
Label Encoded Column: LowTH Ang
Label Encoded Column: LVH
Label Encoded Column: Poor R Progression
Applied Ordinal Encoding to VHD


In [None]:
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
num_cols.remove("CAD")
num_cols.remove("VHD")

In [None]:
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
print("Min-Max Scaling applied to:", num_cols)

print("Final Dataset Shape:", df.shape)
print("Missing Values After Processing:\n", df.isnull().sum())


Min-Max Scaling applied to: ['Age', 'Weight', 'Height', 'Sex', 'BMI', 'DM', 'HTN', 'Current Smoker', 'EX-Smoker', 'FH', 'Obesity', 'CRF', 'CVA', 'Airway disease', 'Thyroid Disease', 'CHF', 'DLP', 'BP', 'PR', 'Edema', 'Weak Peripheral Pulse', 'Lung rales', 'Systolic Murmur', 'Diastolic Murmur', 'Typical Chest Pain', 'Dyspnea', 'Function Class', 'Atypical', 'Nonanginal', 'LowTH Ang', 'Q Wave', 'St Elevation', 'St Depression', 'Tinversion', 'LVH', 'Poor R Progression', 'FBS', 'CR', 'TG', 'LDL', 'HDL', 'BUN', 'ESR', 'HB', 'K', 'Na', 'WBC', 'Lymph', 'Neut', 'PLT', 'EF-TTE', 'Region RWMA']
Final Dataset Shape: (303, 54)
Missing Values After Processing:
 Age                      0
Weight                   0
Height                   0
Sex                      0
BMI                      0
DM                       0
HTN                      0
Current Smoker           0
EX-Smoker                0
FH                       0
Obesity                  0
CRF                      0
CVA                 

In [None]:
df.to_csv("PreProcessed_Dataset_MinMax.csv", index=False)
print("Preprocessing Complete.")

Preprocessing Complete.


-------------------------------------------------------------------------------



```
# This is formatted as code
```

**Applying 10 ML model**

--------------------------------------------------------------------------------

In [None]:
!pip install catboost



In [None]:
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report
)
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
df = pd.read_csv("PreProcessed_Dataset_MinMax.csv")
print(df.head())

        Age    Weight    Height  Sex       BMI   DM  HTN  Current Smoker  \
0  0.410714  0.583333  0.729167  1.0  0.494721  0.0  1.0             1.0   
1  0.660714  0.305556  0.354167  0.0  0.451314  0.0  1.0             0.0   
2  0.428571  0.083333  0.500000  1.0  0.086105  0.0  0.0             1.0   
3  0.642857  0.263889  0.375000  0.0  0.382846  0.0  1.0             0.0   
4  0.357143  0.541667  0.270833  0.0  0.836058  0.0  1.0             0.0   

   EX-Smoker   FH  ...         K        Na       WBC     Lymph      Neut  \
0        0.0  0.0  ...  0.472222  0.464286  0.139860  0.603774  0.350877   
1        0.0  0.0  ...  0.472222  1.000000  0.279720  0.584906  0.403509   
2        0.0  0.0  ...  0.472222  0.392857  0.258741  0.584906  0.491228   
3        0.0  0.0  ...  0.388889  0.500000  0.650350  0.207547  0.701754   
4        0.0  0.0  ...  0.277778  0.428571  0.384615  0.905660  0.122807   

        PLT    EF-TTE  Region RWMA  VHD  CAD  
0  0.329149  0.777778          0.0    0

In [None]:
#Fix LightGBM warning: Remove spaces from feature names
df.columns = [col.replace(" ", "_") for col in df.columns]

In [None]:
X = df.drop(columns=['CAD'])
y = df['CAD']

In [None]:
print("Class Distribution BEFORE SMOTE:", Counter(y))

Class Distribution BEFORE SMOTE: Counter({1: 216, 0: 87})


In [None]:
smote_applied = False
imbalance_threshold = 0.6  #SMOTE if any class is <40% of the largest class

class_counts = Counter(y)
minority_class = min(class_counts, key=class_counts.get)
majority_class = max(class_counts, key=class_counts.get)
imbalance_ratio = class_counts[minority_class] / class_counts[majority_class]

if imbalance_ratio < imbalance_threshold:
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    smote_applied = True

print("Class Distribution AFTER SMOTE:", Counter(y) if smote_applied else "SMOTE Not Applied")

Class Distribution AFTER SMOTE: Counter({1: 216, 0: 216})


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    "LightGBM": LGBMClassifier(max_depth=10, min_data_in_leaf=5),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=50),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naïve Bayes": GaussianNB()
}

In [None]:
for name, model in models.items():
    try:
        print(f"\n================== {name} ==================")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time

        y_pred = model.predict(X_test)

        # Check if predict_proba is available
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="binary", zero_division=1)
        recall = recall_score(y_test, y_pred, average="binary")
        f1 = f1_score(y_test, y_pred, average="binary")
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

        results.append([name, accuracy, precision, recall, f1, auc, train_time])

        # Print detailed classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=1))

    except Exception as e:
        print(f"Error training {name}: {e}")



Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        44
           1       0.90      0.88      0.89        43

    accuracy                           0.90        87
   macro avg       0.90      0.90      0.90        87
weighted avg       0.90      0.90      0.90        87



Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87        44
           1       0.85      0.91      0.88        43

    accuracy                           0.87        87
   macro avg       0.88      0.87      0.87        87
weighted avg       0.88      0.87      0.87        87



Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        44
           1       0.95      0.88      0.92        43

    accuracy                           0.92        87
   macro avg       0.92      0.92      0.92        87

Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        44
           1       0.90      0.81      0.85        43

    accuracy                           0.86        87
   macro avg       0.87      0.86      0.86        87
weighted avg       0.87      0.86      0.86        87



Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        44
           1       0.93      0.88      0.90        43

    accuracy                           0.91        87
   macro avg       0.91      0.91      0.91        87
weighted avg       0.91      0.91      0.91        87



Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88        44
           1       0.90      0.84      0.87        43

    accuracy                           0.87        87
   macro avg       0.88      0.87      0.87        87


In [None]:
# Create DataFrame for summary
metrics_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "AUROC", "Train Time"]
)

In [None]:
# Sort by Accuracy descending
metrics_df = metrics_df.sort_values(by="Accuracy", ascending=False)

# Remove duplicates based on Model name, keep the first (highest accuracy) one
metrics_df = metrics_df.drop_duplicates(subset=["Model"], keep="first").reset_index(drop=True)

# Limit to top 10 models
metrics_df = metrics_df.head(10)

In [None]:
print("\n================== Model Performance Summary ==================")
print(metrics_df.to_string(index=False))


              Model  Accuracy  Precision   Recall  F1 Score    AUROC  Train Time
            XGBoost  0.919540   0.928571 0.906977  0.917647 0.979387    1.576789
      Random Forest  0.919540   0.950000 0.883721  0.915663 0.978594    0.251024
                SVM  0.908046   0.926829 0.883721  0.904762 0.956131    0.025559
           CatBoost  0.908046   0.926829 0.883721  0.904762 0.974101    5.065156
Logistic Regression  0.896552   0.904762 0.883721  0.894118 0.949789    0.095062
      Decision Tree  0.896552   0.869565 0.930233  0.898876 0.894556    0.029436
           AdaBoost  0.873563   0.900000 0.837209  0.867470 0.958245    0.150244
           LightGBM  0.862069   0.897436 0.813953  0.853659 0.969345    0.127442
                KNN  0.850575   0.968750 0.720930  0.826667 0.869450    0.002869
        Naïve Bayes  0.597701   0.900000 0.209302  0.339623 0.887685    0.003727


----------------------------------------------------------------------------------------

*Feature* Engineering

------------------------------------------------------------------------------

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = pd.read_csv("PreProcessed_Dataset_MinMax.csv")
df.columns = [col.replace(" ", "_") for col in df.columns]

In [None]:
X = df.drop(columns=['CAD'])
y = df['CAD']

In [None]:
print(f"Before Feature Selection: {X.shape}")

Before Feature Selection: (303, 53)


In [None]:
# XGBoost Feature Importance
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X, y)
xgb_importance = pd.DataFrame({"Feature": X.columns, "XGB_Importance": xgb_model.feature_importances_})

In [None]:
# Recursive Feature Elimination (RFE) with RandomForest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(rf_model, n_features_to_select=15)
rfe.fit(X, y)
rfe_features = X.columns[rfe.support_]

In [None]:
# Mutual Information Scores
mi_scores = mutual_info_classif(X, y)
mi_importance = pd.DataFrame({"Feature": X.columns, "MI_Score": mi_scores})

In [None]:
# Hybrid Feature Selection
feature_scores = xgb_importance.merge(mi_importance, on="Feature")
feature_scores["Final_Score"] = (feature_scores["XGB_Importance"] * 0.5 + feature_scores["MI_Score"] * 0.3)
top_features = feature_scores.nlargest(15, "Final_Score")["Feature"].tolist()
final_features = list(set(top_features + list(rfe_features)))

In [None]:
X_reduced = X[final_features].copy()
X_reduced.loc[:, 'CAD'] = y  #Fixing the SettingWithCopyWarning
X_reduced.to_csv("Reduced_Dataset_Hybrid_Approach", index=False)

In [None]:
print(f"After Feature Selection: {X_reduced.shape}")
print(f"Hybrid Feature Selection Done. Final {len(final_features)} features saved.")

After Feature Selection: (303, 24)
Hybrid Feature Selection Done. Final 23 features saved.


In [None]:
df = pd.read_csv("Reduced_Dataset_Hybrid_Approach") #NEW REDUCED DATASET OBTAINED

Now applying 10 ML classifiers to the generated reduced dataset

In [None]:
df.columns = [col.replace(" ", "_") for col in df.columns]

X = df.drop(columns=['CAD'])
y = df['CAD']
print("Class Distribution BEFORE SMOTE:", Counter(y))

smote_applied = False
imbalance_threshold = 0.6  #SMOTE if any class is <40% of the largest class

class_counts = Counter(y)
minority_class = min(class_counts, key=class_counts.get)
majority_class = max(class_counts, key=class_counts.get)
imbalance_ratio = class_counts[minority_class] / class_counts[majority_class]

if imbalance_ratio < imbalance_threshold:
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    smote_applied = True

print("Class Distribution AFTER SMOTE:", Counter(y) if smote_applied else "SMOTE Not Applied")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Class Distribution BEFORE SMOTE: Counter({1: 216, 0: 87})
Class Distribution AFTER SMOTE: Counter({1: 216, 0: 216})


In [None]:
results = []
for name, model in models.items():
    try:
        print(f"\n================== {name} ==================")
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time

        y_pred = model.predict(X_test)

        # Check if predict_proba is available
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="binary", zero_division=1)
        recall = recall_score(y_test, y_pred, average="binary")
        f1 = f1_score(y_test, y_pred, average="binary")
        auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan

        results.append([name, accuracy, precision, recall, f1, auc, train_time])

        # Print detailed classification report
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, zero_division=1))

    except Exception as e:
        print(f"Error training {name}: {e}")



Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        44
           1       0.88      0.88      0.88        43

    accuracy                           0.89        87
   macro avg       0.89      0.89      0.89        87
weighted avg       0.89      0.89      0.89        87



Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83        44
           1       0.86      0.74      0.80        43

    accuracy                           0.82        87
   macro avg       0.82      0.82      0.81        87
weighted avg       0.82      0.82      0.82        87



Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        44
           1       0.93      0.86      0.89        43

    accuracy                           0.90        87
   macro avg       0.90      0.90      0.90        87

Parameters: { "use_label_encoder" } are not used.




Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        44
           1       0.95      0.93      0.94        43

    accuracy                           0.94        87
   macro avg       0.94      0.94      0.94        87
weighted avg       0.94      0.94      0.94        87


[LightGBM] [Info] Number of positive: 173, number of negative: 172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 345, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501449 -> initscore=0.005797
[LightGBM] [Info] Start training from score 0.005797

Classification Report:
              precision    recall  f1-score   support

           0       0

In [None]:
# Create DataFrame for summary
metrics_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "AUROC", "Train Time"]
)

In [None]:
# Sort by Accuracy descending
metrics_df = metrics_df.sort_values(by="Accuracy", ascending=False)

# Remove duplicates based on Model name, keep the first (highest accuracy) one
metrics_df = metrics_df.drop_duplicates(subset=["Model"], keep="first").reset_index(drop=True)

# Limit to top 10 models
metrics_df = metrics_df.head(10)

In [None]:
print("\n================== Model Performance Summary ==================")
print(metrics_df.to_string(index=False))


              Model  Accuracy  Precision   Recall  F1 Score    AUROC  Train Time
            XGBoost  0.942529   0.952381 0.930233  0.941176 0.979915    0.752237
           AdaBoost  0.931034   0.930233 0.930233  0.930233 0.969873    0.287196
           CatBoost  0.919540   0.928571 0.906977  0.917647 0.974101    7.325948
      Random Forest  0.896552   0.925000 0.860465  0.891566 0.970402    0.470626
           LightGBM  0.896552   0.925000 0.860465  0.891566 0.964059    0.176407
Logistic Regression  0.885057   0.883721 0.883721  0.883721 0.936575    0.013854
        Naïve Bayes  0.885057   0.883721 0.883721  0.883721 0.950317    0.008999
                SVM  0.862069   0.844444 0.883721  0.863636 0.939746    0.034225
                KNN  0.827586   0.868421 0.767442  0.814815 0.914376    0.004976
      Decision Tree  0.816092   0.864865 0.744186  0.800000 0.815275    0.018064
