In [8]:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipe

# ---------------------------------------------------------------------------
# RBF-kernel SVM baseline: load the pre-made split, build the preprocessing +
# SMOTE + SVC pipeline, fit on the training file and report metrics on the
# test file.
# ---------------------------------------------------------------------------

train_df = pd.read_csv("train_split_80.csv")
test_df  = pd.read_csv("test_split_20.csv")

# Single source of truth for the label and for the columns removed from the
# feature matrix, so the train and test frames cannot drift apart.
TARGET_COL = "three_year_status"
DROP_COLS = [
    "patient_id",
    TARGET_COL,
    "tobacco_smoking_history_indicator",
    "alcohol_history_documented",
]

y_train = train_df[TARGET_COL].astype(int)
X_train = train_df.drop(columns=DROP_COLS)

y_test = test_df[TARGET_COL].astype(int)
X_test = test_df.drop(columns=DROP_COLS)

# Column routing is decided from the training frame's dtypes only:
# int64/float64 columns go to the numeric branch, everything else to the
# categorical branch.
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(exclude=["int64", "float64"]).columns

numeric_branch = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc" , StandardScaler())
])
categorical_branch = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    # Categories that appear only at predict time are ignored rather than raising.
    ("oh" , OneHotEncoder(handle_unknown="ignore"))
])

prep = ColumnTransformer(
    [
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    ]
)

# probability=True is required for predict_proba below.
# NOTE(review): SMOTE already rebalances the training data before the SVC sees
# it, so class_weight="balanced" is likely redundant here -- confirm whether
# both corrections are intended.
svm = SVC(
    kernel="rbf",
    C=1.0,
    gamma="scale",
    probability=True,
    class_weight="balanced",
    random_state=42,
)

# imblearn's Pipeline (not sklearn's) is used because it accepts a sampler
# step; the sampler is applied while fitting and skipped when predicting.
model = ImbPipe([
    ("prep", prep),
    ("smote", SMOTE(random_state=42)),
    ("svm", svm)
])

model.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second entry of
# model.classes_ (assumed to be label 1 -- the metrics below treat 1 as positive).
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)   # default threshold

# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail when
# only one class shows up in y_test / y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
specificity = tn / (tn + fp) if (tn + fp) else 0.0

print("Test-set metrics ")
print(f"Precision   : {precision_score(y_test, y_pred):.3f}")
print(f"Recall      : {recall_score(y_test, y_pred):.3f}")     # sensitivity
print(f"Specificity : {specificity:.3f}")
print(f"F1 score    : {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC     : {roc_auc_score(y_test, y_prob):.3f}")


Test-set metrics 
Precision   : 0.583
Recall      : 0.389
Specificity : 0.800
F1 score    : 0.467
ROC-AUC     : 0.656


In [None]:

from sklearn.linear_model import LogisticRegression


# Elastic-net logistic regression
#
# This cell reuses objects created by an earlier cell: `prep`, `X_train`,
# `y_train`, `X_test`, `y_test`, plus the SMOTE / ImbPipe / metric imports.
# It rebinds `model`, `y_prob` and `y_pred`.
logreg = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.4,          # 0=l2, 1=l1; tweak as needed
    max_iter=5000,
    class_weight="balanced",
    random_state=42,
)

# Same preprocessing and SMOTE steps as before; only the final estimator differs.
model = ImbPipe([
    ("prep",   prep),
    ("smote",  SMOTE(random_state=42)),
    ("logreg", logreg),
])

model.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second entry of
# model.classes_ (assumed to be label 1 -- the metrics below treat 1 as positive).
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)   # default threshold

# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail when
# only one class shows up in y_test / y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
specificity = tn / (tn + fp) if (tn + fp) else 0.0

print("Test-set metrics")
print(f"Precision   : {precision_score(y_test, y_pred):.3f}")
print(f"Recall      : {recall_score(y_test, y_pred):.3f}")     # sensitivity
print(f"Specificity : {specificity:.3f}")
print(f"F1 score    : {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC     : {roc_auc_score(y_test, y_prob):.3f}")


Test-set metrics
Precision   : 0.500
Recall      : 0.611
Specificity : 0.560
F1 score    : 0.550
ROC-AUC     : 0.680
