# AML Project 3

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC

# Engineer Features

In [2]:
def engineer_features(df):
    df = df.copy()
    
    df['age_group'] = pd.cut(
        df['Age'], bins=[0, 18, 30, 45, 60, 75, 120],
        labels=['Child', 'YoungAdult', 'Adult', 'Midlife', 'Senior', 'Elder']
    )
    
    df['bmi_category'] = pd.cut(
        df['BMI'], bins=[0, 18.5, 25, 30, 35, 100],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese', 'SeverelyObese']
    )

    df['hypertension'] = ((df['Systolic'] > 130) | (df['Diastolic'] > 80)).astype(int)
    df['hypertension_cat'] = pd.cut(
        df['Systolic'], bins=[0, 120, 130, 140, 180],
        labels=['Normal', 'Elevated', 'Stage1', 'Stage2']
    )

    df['chol_ratio'] = df['HDL'] / (df['TCHOL'] + 1e-6)
    df['ldl_hdl_ratio'] = df['LDL'] / (df['HDL'] + 1e-6)
    df['tchol_hdl_ratio'] = df['TCHOL'] / (df['HDL'] + 1e-6)
    df['non_hdl_chol'] = df['TCHOL'] - df['HDL']

    df['pulse_pressure'] = df['Systolic'] - df['Diastolic']

    df['diabetes_bin'] = (df['Diabetes'] == 1).astype(int)
    df['diabetes_cat'] = df['Diabetes'].map({0: 'No', 1: 'Yes', 2: 'Borderline'}).astype('category')

    df['smoker_bin'] = df['CurrentSmoker'].map({0: 0, 1: 1}).fillna(0).astype(int)
    df['smoker_cat'] = df['CurrentSmoker'].map({0: 'No', 1: 'Yes', 2: 'Unknown'}).astype('category')

    if 'Insurance' in df.columns:
        df['Insurance'] = df['Insurance'].astype('category')
    if 'Race' in df.columns:
        df['Race'] = df['Race'].astype('category')

    df['age_bmi'] = df['Age'] * df['BMI']
    df['bmi_ldl'] = df['BMI'] * df['LDL']
    df['bp_bmi'] = (df['Systolic'] + df['Diastolic']) * df['BMI']
    df['hdl_ldl_ratio'] = df['HDL'] / (df['LDL'] + 1e-6)
    df['income_edu'] = df['Income'] * df['Edu']
    df['age_systolic'] = df['Age'] * df['Systolic']
    df['bmi_hdl'] = df['BMI'] * df['HDL']
    df['pulse_bmi'] = df['Pulse'] * df['BMI']
    df['age_tchol'] = df['Age'] * df['TCHOL']
    df['bmi_tchol'] = df['BMI'] * df['TCHOL']
    df['age_diastolic'] = df['Age'] * df['Diastolic']
    df['bmi_diastolic'] = df['BMI'] * df['Diastolic']
    df['age_pulse'] = df['Age'] * df['Pulse']
    df['bmi_pulse'] = df['BMI'] * df['Pulse']
    df['systolic_diastolic_ratio'] = df['Systolic'] / (df['Diastolic'] + 1e-6)
    df['tchol_ldl_ratio'] = df['TCHOL'] / (df['LDL'] + 1e-6)
    df['trig_hdl_ratio'] = df['Trig'] / (df['HDL'] + 1e-6)
    df['trig_ldl_ratio'] = df['Trig'] / (df['LDL'] + 1e-6)
    df['trig_tchol_ratio'] = df['Trig'] / (df['TCHOL'] + 1e-6)

    # Missing value indicators
    for col in ['Systolic', 'Diastolic', 'HDL', 'LDL', 'TCHOL']:
        df[f'is_{col.lower()}_missing'] = df[col].isnull().astype(int)

    numeric_cols = ['BMI', 'HDL', 'LDL', 'TCHOL', 'Trig', 'Pulse', 'eGFP',
                    'Systolic', 'Diastolic', 'Age', 'Income']
    for col in numeric_cols:
        if col in df.columns:
            df[col + '_norm'] = (df[col] - df[col].mean()) / (df[col].std() + 1e-6)

    # Clip extreme ratios
    ratio_cols = [c for c in df.columns if "ratio" in c]
    df[ratio_cols] = df[ratio_cols].clip(-10, 10)
    
    return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return engineer_features(X)

class Preprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        missing_ratio = X.isnull().mean()
        self.cols_to_drop_ = missing_ratio[missing_ratio > 0.3].index.tolist()

        X_tmp = X.drop(columns=self.cols_to_drop_, errors="ignore")
        numeric_cols = X_tmp.select_dtypes(include=np.number).columns
        categorical_cols = X_tmp.select_dtypes(include="object").columns

        self.numeric_means_ = X_tmp[numeric_cols].mean()
        self.categorical_modes_ = X_tmp[categorical_cols].mode().iloc[0] if len(categorical_cols) > 0 else pd.Series(dtype="object")

        return self

    def transform(self, X):
        X = X.drop(columns=self.cols_to_drop_, errors="ignore").copy()
        X = X.drop(columns=["ID"], errors="ignore")

        # Fill numeric missing values
        for col, mean in self.numeric_means_.items():
            if col in X.columns:
                X[col] = X[col].fillna(mean)

        # Fill categorical missing values
        for col, mode in self.categorical_modes_.items():
            if col in X.columns:
                X[col] = X[col].fillna(mode)

        return pd.get_dummies(X, drop_first=True)

# Load Data

In [3]:
df = pd.read_csv("P2_data_stroke_train.csv")
df = df.dropna(subset=["stroke"]).copy()
df["stroke"] = df["stroke"].replace({2: 0}).astype(int)

X = df.drop(columns=["stroke"])
y = df["stroke"]

# Train/Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

fe = FeatureEngineer()
X_train_fe = fe.fit_transform(X_train)
X_test_fe = fe.transform(X_test)

numeric_cols = X_train_fe.select_dtypes(include=np.number).columns
for col in numeric_cols:
    X_train_fe[col] = X_train_fe[col].fillna(X_train_fe[col].mean())
    X_test_fe[col] = X_test_fe[col].fillna(X_train_fe[col].mean())

# Preprocessing And Pipeline

In [5]:
common_steps = [
    ("feature_engineer", FeatureEngineer()),
    ("preprocessor", Preprocessor()),
    ("scaler", MinMaxScaler()),
    ("selector", SelectKBest(score_func=f_classif, k=40))
]

# Logistic Regression

In [6]:


log_reg_pipeline = ImbPipeline(
    steps=common_steps + [

        ("classifier", LogisticRegression(
            max_iter=1000, 
            solver='liblinear', 
            class_weight='balanced',
            penalty='l1',
            C=0.4
        ))
    ]
)

print("Training Logistic Regression")
log_reg_pipeline.fit(X_train, y_train)
y_pred_log = log_reg_pipeline.predict(X_test)
y_prob_log = log_reg_pipeline.predict_proba(X_test)[:, 1]

print("\nLogistic Regression Results")
print(classification_report(y_test, y_pred_log))
print("Log Loss:", log_loss(y_test, y_prob_log))
print(confusion_matrix(y_test, y_pred_log))

Training Logistic Regression

Logistic Regression Results
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      2632
           1       0.11      0.88      0.20       108

    accuracy                           0.72      2740
   macro avg       0.55      0.80      0.52      2740
weighted avg       0.96      0.72      0.81      2740

Log Loss: 0.5305291100220426
[[1882  750]
 [  13   95]]


# Random Forest

In [7]:
rf_pipeline = ImbPipeline(
    steps=common_steps + [
        ("classifier", RandomForestClassifier(
            n_estimators=300,
            max_depth=6,
            min_samples_split=20,
            min_samples_leaf=10,
            class_weight='balanced',
            random_state=42,
        ))
    ]
)

print("\nTraining Random Forest")
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

print("\nRandom Forest Results")
print(classification_report(y_test, y_pred_rf))
print("Log Loss:", log_loss(y_test, y_prob_rf))
print(confusion_matrix(y_test, y_pred_rf))


Training Random Forest

Random Forest Results
              precision    recall  f1-score   support

           0       0.99      0.74      0.85      2632
           1       0.11      0.78      0.19       108

    accuracy                           0.74      2740
   macro avg       0.55      0.76      0.52      2740
weighted avg       0.95      0.74      0.82      2740

Log Loss: 0.4269119998222432
[[1950  682]
 [  24   84]]


# SVM

In [8]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE
svm = SVC(
        kernel='poly',
        C=0.05,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=42
)

calibrated_svm = CalibratedClassifierCV(svm, method='sigmoid', cv=5)

svm_pipeline = ImbPipeline(steps=common_steps + [("smote", SMOTE(random_state=42)), ("classifier", calibrated_svm)])

print("\nTraining SVM")
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)
y_prob_svm = svm_pipeline.predict_proba(X_test)[:, 1]

print("\nSVM Results")
print(classification_report(y_test, y_pred_svm))
print("Log Loss:", log_loss(y_test, y_prob_svm))
print(confusion_matrix(y_test, y_pred_svm))


Training SVM

SVM Results
              precision    recall  f1-score   support

           0       0.99      0.75      0.85      2632
           1       0.11      0.75      0.19       108

    accuracy                           0.75      2740
   macro avg       0.55      0.75      0.52      2740
weighted avg       0.95      0.75      0.82      2740

Log Loss: 0.46949616079651163
[[1968  664]
 [  27   81]]


# Test Data Prediction

In [9]:
test_df = pd.read_csv("P2_data_stroke_test.csv")
test_ids = test_df["Participant ID"].copy()

print("\nGenerating stroke probabilities for test data")

if 'stroke' in test_df.columns:
    test_df = test_df.drop('stroke', axis=1)

y_prob_log_test = log_reg_pipeline.predict_proba(test_df)[:, 1]
y_prob_rf_test = rf_pipeline.predict_proba(test_df)[:, 1]
y_prob_svm_test = svm_pipeline.predict_proba(test_df)[:, 1]

output = pd.DataFrame({
    "Participant ID": test_ids,
    "Logistic regression prediction": y_prob_log_test,
    "Random forest prediction": y_prob_rf_test,
    "SVM prediction": y_prob_svm_test
})

# Round probabilities to 2 decimal places
output = output.round({
    "Logistic regression prediction": 2,
    "Random forest prediction": 2,
    "SVM prediction": 2
})

# Save to file
output.to_csv("stroke_probabilities.csv", index=False)

print("\nOutput saved")
print(output.head())


Generating stroke probabilities for test data

Output saved
   Participant ID  Logistic regression prediction  Random forest prediction  \
0             101                            0.37                      0.49   
1             102                            0.49                      0.57   
2             103                            0.34                      0.26   
3             104                            0.11                      0.08   
4             105                            0.69                      0.71   

   SVM prediction  
0            0.65  
1            0.69  
2            0.22  
3            0.04  
4            0.78  
