In [7]:
# --- Setup (paths, imports) ---
BASE_DIR  = r"E:\AIML"
DATA_PATH = rf"E:\AIML\dataset\heart_attack_prediction_dataset.csv"
EDA_DIR   = rf"E:\AIML\results\eda_visualizations"
OUT_DIR   = rf"E:\AIML\results\outputs"

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt, warnings
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA

try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE = None  # Only needed for Member F

warnings.filterwarnings("ignore")
os.makedirs(EDA_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

TARGET = "Heart Attack Risk"
RANDOM_STATE = 42


In [8]:
# --- Load & Split (before any fitting) ---
df = pd.read_csv(DATA_PATH)
assert TARGET in df.columns, f"TARGET '{TARGET}' not found. Columns: {list(df.columns)}"

# optional: drop ID-like columns
id_like = {"patient id","id","record id"}
drop_ids = [c for c in df.columns if c.strip().lower() in id_like]
if drop_ids:
    df = df.drop(columns=drop_ids)

X = df.drop(columns=[TARGET]).copy()
y = df[TARGET].values
strat = y if pd.Series(y).nunique() <= 20 else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=strat
)
print("Train/Test:", X_train.shape, X_test.shape)

Train/Test: (7010, 24) (1753, 24)


In [9]:
# --- Detect categorical vs numeric ---
LOW_CARD_AS_CAT = 12
categorical_cols = []
for c in X_train.columns:
    if X_train[c].dtype == "object":
        categorical_cols.append(c)
    else:
        if X_train[c].nunique(dropna=True) <= LOW_CARD_AS_CAT:
            categorical_cols.append(c)
numeric_cols = [c for c in X_train.select_dtypes(include="number").columns if c not in categorical_cols]
print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

Categorical: ['Sex', 'Blood Pressure', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Diet', 'Previous Heart Problems', 'Medication Use', 'Stress Level', 'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country', 'Continent', 'Hemisphere']
Numeric: ['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides']


In [10]:
# --- Pre-req: Impute (load if available; else compute) ---
import os

train_imp_path = os.path.join(OUT_DIR, "X_train_imputed.csv")
test_imp_path  = os.path.join(OUT_DIR, "X_test_imputed.csv")

if os.path.exists(train_imp_path) and os.path.exists(test_imp_path):
    print("Loading imputed splits from outputs/...")
    X_train_imp = pd.read_csv(train_imp_path).drop(columns=[TARGET], errors="ignore")
    X_test_imp  = pd.read_csv(test_imp_path).drop(columns=[TARGET], errors="ignore")
else:
    print("Imputing locally (median numeric, most_frequent categorical)...")
    num_imputer = SimpleImputer(strategy="median")
    cat_imputer = SimpleImputer(strategy="most_frequent")
    X_train_num = num_imputer.fit_transform(X_train[numeric_cols]) if numeric_cols else None
    X_test_num  = num_imputer.transform(X_test[numeric_cols])      if numeric_cols else None
    X_train_cat = cat_imputer.fit_transform(X_train[categorical_cols]) if categorical_cols else None
    X_test_cat  = cat_imputer.transform(X_test[categorical_cols])      if categorical_cols else None
    tr_parts, te_parts = [], []
    if X_train_num is not None:
        tr_parts.append(pd.DataFrame(X_train_num, columns=numeric_cols, index=X_train.index))
        te_parts.append(pd.DataFrame(X_test_num,  columns=numeric_cols, index=X_test.index))
    if X_train_cat is not None:
        tr_parts.append(pd.DataFrame(X_train_cat, columns=categorical_cols, index=X_train.index))
        te_parts.append(pd.DataFrame(X_test_cat,  columns=categorical_cols, index=X_test.index))
    X_train_imp = pd.concat(tr_parts, axis=1) if tr_parts else pd.DataFrame(index=X_train.index)
    X_test_imp  = pd.concat(te_parts, axis=1) if te_parts else pd.DataFrame(index=X_test.index)

print("Imputed shapes:", X_train_imp.shape, X_test_imp.shape)

Imputing locally (median numeric, most_frequent categorical)...
Imputed shapes: (7010, 24) (1753, 24)


In [11]:
# --- E1: Ensure numeric-only matrix (OHE non-numerics if any) ---
# One-hot the non-numeric columns to make MI/PCA numeric-only
non_num_cols = [c for c in X_train_imp.columns if c not in X_train.select_dtypes(include="number").columns]
if non_num_cols:
    try:
        ohe_fe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe_fe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    Xtr_ohe = ohe_fe.fit_transform(X_train_imp[non_num_cols].astype("category"))
    Xte_ohe = ohe_fe.transform(X_test_imp[non_num_cols].astype("category"))
    try:
        ohe_cols = list(ohe_fe.get_feature_names_out(non_num_cols))
    except AttributeError:
        ohe_cols = [f"{c}_{v}" for c, cats in zip(non_num_cols, ohe_fe.categories_) for v in cats]
    df_tr_ohe = pd.DataFrame(Xtr_ohe, columns=ohe_cols, index=X_train_imp.index)
    df_te_ohe = pd.DataFrame(Xte_ohe, columns=ohe_cols, index=X_test_imp.index)
else:
    df_tr_ohe = pd.DataFrame(index=X_train_imp.index)
    df_te_ohe = pd.DataFrame(index=X_test_imp.index)

Xtr_num = X_train_imp.select_dtypes(include="number").copy()
Xte_num = X_test_imp.select_dtypes(include="number").copy()

X_train_FE = pd.concat([Xtr_num, df_tr_ohe], axis=1)
X_test_FE  = pd.concat([Xte_num, df_te_ohe], axis=1)
print("Numeric-only matrices -> TRAIN:", X_train_FE.shape, " TEST:", X_test_FE.shape)

Numeric-only matrices -> TRAIN: (7010, 3646)  TEST: (1753, 3646)


In [None]:
# --- E2: Mutual Information (Top-K bar) ---
TOP_K_MI = 15
mi_scores = mutual_info_classif(X_train_FE.values, y_train, discrete_features=False, random_state=RANDOM_STATE)
mi_df = pd.DataFrame({"feature": X_train_FE.columns, "mi": mi_scores}).sort_values("mi", ascending=False)

k = min(TOP_K_MI, len(mi_df))
topk = mi_df.head(k).iloc[::-1]

plt.figure(figsize=(8,6))
plt.barh(topk["feature"], topk["mi"])
plt.title(f"Top {k} Features by Mutual Information")
plt.xlabel("MI score"); plt.ylabel("Feature")
plt.tight_layout(); plt.savefig(os.path.join(EDA_DIR, f"E_mi_top_{k}.png")); plt.show()

mi_df.to_csv(os.path.join(OUT_DIR, "E_feature_importance_MI.csv"), index=False)



In [None]:
# --- E3: PCA (fit on TRAIN) + plots + transform to ~95% variance ---
pca = PCA(n_components=None, random_state=RANDOM_STATE).fit(X_train_FE.values)
exp = pca.explained_variance_ratio_; cum = np.cumsum(exp)

plt.figure(); plt.plot(range(1, len(exp)+1), exp, marker="o")
plt.title("PCA — Explained Variance Ratio per Component")
plt.xlabel("Principal Component"); plt.ylabel("Explained variance ratio")
plt.tight_layout(); plt.savefig(os.path.join(EDA_DIR, "E_pca_explained_variance.png")); plt.show()

plt.figure(); plt.plot(range(1, len(cum)+1), cum, marker="o"); plt.axhline(0.95, linestyle="--")
plt.title("PCA — Cumulative Explained Variance")
plt.xlabel("Number of Components"); plt.ylabel("Cumulative explained variance")
plt.tight_layout(); plt.savefig(os.path.join(EDA_DIR, "E_pca_cumulative_variance.png")); plt.show()

n95 = int(np.argmax(cum >= 0.95) + 1)
print(f"~95% variance retained with {n95} PCs.")

pca95 = PCA(n_components=n95, random_state=RANDOM_STATE).fit(X_train_FE.values)
X_train_pca = pca95.transform(X_train_FE.values)
X_test_pca  = pca95.transform(X_test_FE.values)

pd.DataFrame(X_train_pca, columns=[f"PC{i}" for i in range(1, n95+1)])\
  .to_csv(os.path.join(OUT_DIR, f"E_X_train_PCA_{n95}.csv"), index=False)
pd.DataFrame(X_test_pca,  columns=[f"PC{i}" for i in range(1, n95+1)])\
  .to_csv(os.path.join(OUT_DIR,  f"E_X_test_PCA_{n95}.csv"),  index=False)
print("Saved PCA CSVs.")