# EM23_RedClave_Task1_HeartAttack.ipynb


In [None]:
# === Config - change these before running ===
# Team identifiers; they are interpolated into the output file names below.
TEAM_CODE = "TC01"                # Replace with your TeamCode (e.g., TC01)
TEAM_NAME = "TeamRapidAid"        # Replace with your TeamName (no spaces preferred)
TRAIN_PATH = "Heart_Attack_training_dataset.csv"   # path to training CSV (upload to Colab)
# NOTE(review): "Hear_Attack" may be a typo for "Heart_Attack" - confirm against
# the real name of the uploaded evaluation file before running.
TEST_PATH = "Hear_Attack_evaluation_dataset.csv"   # path to test CSV (upload to Colab)

# Output artifact names, all prefixed with "<TEAM_CODE>_<TEAM_NAME>_Task1_".
OUTPUT_NOTEBOOK = f"{TEAM_CODE}_{TEAM_NAME}_Task1_HeartAttack.ipynb"
METRICS_PNG = f"{TEAM_CODE}_{TEAM_NAME}_Task1_Metrics.png"
PRED_CSV = f"{TEAM_CODE}_{TEAM_NAME}_Task1_Predictions.csv"

# Echo the two file names so a wrong team code/name is spotted early.
print("Output files will be:", METRICS_PNG, "and", PRED_CSV)


In [None]:
# Install extras if running on a fresh Colab environment
try:
    import xgboost
except Exception as e:
    print('xgboost not found. Installing...')
    !pip install -q xgboost
    import xgboost
print('Required libs available.')


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

print('Libraries imported')

## 1) Load data and quick EDA



In [None]:
# Load the training and evaluation CSVs from the configured paths
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print('Train shape:', train.shape)
print('Test shape:', test.shape)
display(train.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
train.info()
# Transposed summary: one row per column, covering numeric and non-numeric columns
display(train.describe(include='all').T)


## 2) Preprocessing plan

- Parse `bp` (systolic/diastolic) into two numeric columns: `bp_sys`, `bp_dia`.
- Handle missing values: numeric -> median, categorical -> most frequent.
- Encode categorical features with one-hot encoding (categories not seen during fitting are ignored at prediction time).
- Scale numeric features using StandardScaler.

We will create a preprocessing pipeline and then train models inside a pipeline to avoid leakage.

In [None]:
# Preprocessing and feature engineering
def preprocess_dfs(df):
    """Return a copy of ``df`` with the ``bp`` column split into numeric parts.

    When a ``bp`` column is present, its values are converted to strings and
    split on ``/``. The first piece becomes ``bp_sys`` and the second becomes
    ``bp_dia``; both go through ``pd.to_numeric(..., errors='coerce')``, so
    missing or unparseable pieces become NaN. The original ``bp`` column is
    then dropped. A frame without a ``bp`` column is returned as a plain copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified in place.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    df = df.copy()
    if 'bp' not in df.columns:
        return df

    # If no value contains '/', the expanded split has only column 0 and
    # indexing column 1 would raise KeyError. reindex guarantees that both
    # columns exist (missing ones are filled with NaN).
    bp_split = (
        df['bp'].astype(str)
        .str.split('/', expand=True)
        .reindex(columns=[0, 1])
    )
    df['bp_sys'] = pd.to_numeric(bp_split[0], errors='coerce')
    df['bp_dia'] = pd.to_numeric(bp_split[1], errors='coerce')
    return df.drop(columns=['bp'])

# Run the shared feature engineering on both datasets
train = preprocess_dfs(train)
test = preprocess_dfs(test)

# Column roles: every training column except the label and the row id is a feature
target = 'heart_attack_risk'
id_col = 'patient_id'
features = [col for col in train.columns if col not in (target, id_col)]

# Features stored as int64/float64 are handled as numeric; every other
# feature is handled as categorical
numeric_view = train[features].select_dtypes(include=['int64', 'float64'])
num_cols = list(numeric_view.columns)
cat_cols = [col for col in features if col not in num_cols]

print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)


In [None]:
# Numeric branch: fill gaps with the column median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

## 3) Train/Test split and model comparison

We'll train three candidate models: Logistic Regression, Random Forest, and XGBoost. We will prioritize Recall (since competition scoring uses Recall). We'll use stratified split and report multiple metrics.

In [None]:
# Split training data into train/validation (stratified on the target so both
# parts keep the same class ratio)
X = train[features].copy()
y = train[target].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

print('Shapes:', X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Build pipelines for models.
# A Pipeline without caching fits the step objects it was given in place, so
# handing the same `preprocessor` instance to every pipeline would make them
# share one fitted transformer: refitting any pipeline (e.g. on the full data)
# would silently change the preprocessing of the others. Each pipeline gets its
# own unfitted copy instead; the fitted one is reachable via its 'pre' step.
from sklearn.base import clone

pipelines = {
    'logreg': Pipeline(steps=[
        ('pre', clone(preprocessor)),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
    ]),
    'rf': Pipeline(steps=[
        ('pre', clone(preprocessor)),
        ('clf', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)),
    ]),
    'xgb': Pipeline(steps=[
        ('pre', clone(preprocessor)),
        ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ]),
}

# Quick train and evaluate function
def evaluate_model(pipeline, X_tr, y_tr, X_v, y_v, name='model'):
    """Fit ``pipeline`` on the training split and score it on the validation split.

    The pipeline is fitted in place on ``(X_tr, y_tr)``. Accuracy, precision
    (0 when undefined), recall and F1 are computed from the predictions on
    ``X_v``. ROC-AUC is computed from the second ``predict_proba`` column, or
    reported as NaN when the pipeline has no ``predict_proba``. A one-line
    summary is printed.

    Returns a dict with keys ``name``, ``accuracy``, ``precision``, ``recall``,
    ``f1``, ``roc_auc``, ``pipeline`` (the fitted pipeline) and ``preds``
    (the validation predictions).
    """
    pipeline.fit(X_tr, y_tr)
    preds = pipeline.predict(X_v)

    # Scores are only available when the final estimator exposes predict_proba
    if hasattr(pipeline, 'predict_proba'):
        probs = pipeline.predict_proba(X_v)[:, 1]
    else:
        probs = None

    acc = accuracy_score(y_v, preds)
    prec = precision_score(y_v, preds, zero_division=0)
    rec = recall_score(y_v, preds)
    f1 = f1_score(y_v, preds)
    if probs is None:
        roc = np.nan
    else:
        roc = roc_auc_score(y_v, probs)

    print(f"=== {name} ===\nAccuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, ROC-AUC: {roc:.4f}\n")
    return dict(
        name=name,
        accuracy=acc,
        precision=prec,
        recall=rec,
        f1=f1,
        roc_auc=roc,
        pipeline=pipeline,
        preds=preds,
    )

# Fit and score every baseline pipeline, keeping one result dict per model
results = []
for name, pipe in pipelines.items():
    print('Training', name)
    results.append(evaluate_model(pipe, X_train, y_train, X_val, y_val, name=name))


In [None]:
# Collect the validation scores of all baseline models in one table,
# highest Recall first
metric_cols = ['Accuracy','Precision','Recall','F1','ROC-AUC']
metrics_df = (
    pd.DataFrame([
        {'Model':r['name'],'Accuracy':r['accuracy'],'Precision':r['precision'],'Recall':r['recall'],'F1':r['f1'],'ROC-AUC':r['roc_auc']}
        for r in results
    ])
    .sort_values('Recall', ascending=False)
    .reset_index(drop=True)
)
display(metrics_df.style.format({col: '{:.4f}' for col in metric_cols}))

# Render the same table as an image file (screenshot requirement)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8, 2 + 0.6*len(metrics_df)))
ax.axis('off')  # only the table should be visible, not the axes frame
tbl = ax.table(
    cellText=np.round(metrics_df[metric_cols].values, 4),
    rowLabels=metrics_df['Model'].values,
    colLabels=metric_cols,
    cellLoc='center',
    loc='center',
)
# Fix the font size instead of letting matplotlib auto-scale it
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
ax.set_title('Model evaluation metrics (validation)')
fig.tight_layout()
fig.savefig(METRICS_PNG, dpi=200)
print('Saved metrics image to', METRICS_PNG)
plt.show()


## 4) Hyperparameter tuning (optional but recommended)

Because Recall is most important, we tune models using Recall as the scoring metric. We'll demonstrate GridSearchCV for RandomForest and XGBoost with stratified CV.

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# 4-fold stratified CV shared by both searches
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Search grids; the 'clf__' prefix targets the classifier step of each pipeline
rf_params = {'clf__n_estimators':[100,200], 'clf__max_depth':[None,10,20]}
xgb_params = {'clf__n_estimators':[100,200], 'clf__max_depth':[3,6], 'clf__scale_pos_weight':[1,5]}


def _recall_grid_search(base_pipeline, param_grid, start_msg, best_msg):
    """Grid-search ``base_pipeline`` on the training split, scored by recall.

    Prints ``start_msg`` before fitting and ``best_msg`` followed by the best
    parameters and best CV recall afterwards; returns the fitted GridSearchCV.
    """
    print(start_msg)
    search = GridSearchCV(base_pipeline, param_grid, scoring='recall', cv=cv, n_jobs=-1, verbose=1)
    search.fit(X_train, y_train)
    print(best_msg, search.best_params_, 'best recall:', search.best_score_)
    return search


best_pipelines = {}

gs_rf = _recall_grid_search(pipelines['rf'], rf_params, 'Tuning RandomForest...', 'Best RF params:')
best_pipelines['rf'] = gs_rf.best_estimator_

gs_xgb = _recall_grid_search(pipelines['xgb'], xgb_params, '\nTuning XGBoost...', 'Best XGB params:')
best_pipelines['xgb'] = gs_xgb.best_estimator_

# Score each tuned pipeline on the held-out validation split
tuned_results = [
    evaluate_model(tuned_pipe, X_train, y_train, X_val, y_val, name='tuned_'+tuned_name)
    for tuned_name, tuned_pipe in best_pipelines.items()
]

# Table column -> key in the result dicts returned by evaluate_model
column_to_key = {'Model':'name','Accuracy':'accuracy','Precision':'precision','Recall':'recall','F1':'f1','ROC-AUC':'roc_auc'}
tuned_df = pd.DataFrame([
    {column: r[key] for column, key in column_to_key.items()}
    for r in tuned_results
])
display(tuned_df)


## 5) Final model training

Choose the best model (based on Recall on validation). Retrain it on the full training dataset and generate predictions for the test set. Then save predictions CSV with required format.

In [None]:
# Select the best model by validation recall from tuned + baseline results
# (tuned_results only exists if the optional tuning cell was run)
all_results = results + tuned_results if 'tuned_results' in globals() else results
best = max(all_results, key=lambda x: x['recall'])
print('Selected model for final predictions:', best['name'])

# Retrain selected model on full training data
final_pipeline = best['pipeline']
final_pipeline.fit(X, y)  # train on full training set

# Use exactly the training feature columns, in training order. Selecting
# "every column except the id" would forward any extra test column to the
# pipeline; selecting by `features` also fails right here, with the missing
# column name, if the test file lacks a feature.
X_test = test[features].copy()

# Predict
test_preds = final_pipeline.predict(X_test)
# Boolean labels are written as 0/1; any other label dtype is left unchanged
if test_preds.dtype == bool:
    test_preds = test_preds.astype(int)

# Build submission DataFrame: one row per test patient id with its predicted label
submission = pd.DataFrame({id_col: test[id_col], 'heart_attack_risk': test_preds})
submission.to_csv(PRED_CSV, index=False)
print('Saved predictions to', PRED_CSV)
display(submission.head())
