In [11]:
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_columns', None)

df = pd.read_excel("../../datasets/raw/bone-marrow_raw.xlsx")

In [12]:
from sklearn.model_selection import train_test_split

targets = df[["survival_time", "survival_status"]]
targets = targets.rename(columns={"survival_status": "is_dead"})

X = df.loc[:, : "stem_cell_source"].copy()
y_clf = targets["is_dead"]
y_reg = targets["survival_time"]


X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(X, y_reg, y_clf, test_size=0.15, random_state=42, stratify=y_clf)

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler

bool_cols = [
    "donor_age_below_35", 
    "donor_CMV", 
    "recipient_age_below_10", 
    "recipient_gender", 
    "recipient_CMV", 
    "disease_group", 
    "gender_match", 
    "ABO_match", 
    "HLA_mismatch", 
    "risk_group",
    "stem_cell_source"
]

cat_cols = [
    "CMV_status",
    "disease",
    "HLA_group_1",
    "recipient_age_int",
]

def feature_engineering(X):
    X = X.copy()

    # Age gap
    X["age_gap"] = (X["donor_age"] - X["recipient_age"]).abs()

    # Donor age bins
    X["donor_age_bin"] = pd.cut(
        X["donor_age"],
        bins=[0, 18, 40, 60, 100],
        labels=False
    )

    # Recipient age bins
    X["recipient_age_bin"] = pd.cut(
        X["recipient_age"],
        bins=[0, 2, 5, 7, 10, 18, 22],
        labels=False
    )

    return X[["donor_age_bin", 
              "recipient_age_bin", 
              "age_gap"]]

extra_cols = Pipeline([
    ("feature_engineering", FunctionTransformer(feature_engineering, feature_names_out=lambda self, input_features: ["donor_age_bin", "recipient_age_bin", "age_gap"])),
    ("imputer", SimpleImputer(strategy="median", add_indicator=True))
])


def parse_hla_match(X):
    s = X.iloc[:, 0]
    return (
        s
        .astype(str)
        .str.split("/", expand=True)[0]
        .astype(float)
        .to_frame()
    )


hla_pipeline = Pipeline([
    ("parser", FunctionTransformer(parse_hla_match, feature_names_out="one-to-one")),
    ("imputer", SimpleImputer(strategy="median", add_indicator=True))
])

bool_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent", add_indicator=True)),
    ("encoder", OneHotEncoder(drop="if_binary"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent", add_indicator=True)),
    ("one_hot", OneHotEncoder())
])

columns_to_drop = ["donor_ABO", 
                   "recipient_ABO", 
                   "recipient_rh", 
                   "recipient_gender",
                   "recipient_age_int",
                    "recipient_age_below_10",
                   "donor_age_below_35",
                   "recipient_CMV",
                   "donor_CMV",
                   "disease_group"] 

preprocessor = ColumnTransformer(
    transformers=[
        ("extra_cols", extra_cols, ["donor_age", "recipient_age"]),
        ("column_dropper", "drop", columns_to_drop),
        ("hla", hla_pipeline, ["HLA_match"]),
        ("bool", bool_pipeline, bool_cols),
        ("one_hot", cat_pipeline, cat_cols)
    ],
    remainder=SimpleImputer(strategy="median", add_indicator=True)
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

In [None]:
from collections import Counter
from sklearn.calibration import cross_val_predict
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold

pipeline_smote = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", RandomForestClassifier())
])

param_grid = {
    'model__n_estimators': [25, 50, 75, 100, 150],
    'model__max_depth': [None, 3, 5, 7],
    'model__max_features': ["sqrt", 5, 10],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_gs_rf(pipeline, param_grid, X_train, y_clf_train):
    gs_rf = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, scoring='f1_weighted', verbose=True)
    gs_rf.fit(X_train, y_clf_train)
    return gs_rf

In [22]:
gs_rf = run_gs_rf(pipeline, param_grid, X_train, y_clf_train)

y_pred_cv = cross_val_predict(gs_rf.best_estimator_, X_train, y_clf_train, cv=cv, verbose=10, n_jobs=-1)
print(classification_report(y_clf_train, y_pred_cv))
print(gs_rf.best_params_)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

           0       0.64      0.73      0.68        86
           1       0.61      0.50      0.55        72

    accuracy                           0.63       158
   macro avg       0.62      0.62      0.62       158
weighted avg       0.62      0.63      0.62       158

{'model__max_depth': None, 'model__max_features': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [23]:
gs_rf = run_gs_rf(pipeline_smote, param_grid, X_train, y_clf_train)

y_pred_cv = cross_val_predict(gs_rf.best_estimator_, X_train, y_clf_train, cv=cv, verbose=10, n_jobs=-1)
print(classification_report(y_clf_train, y_pred_cv))
print(gs_rf.best_params_)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


              precision    recall  f1-score   support

           0       0.61      0.72      0.66        86
           1       0.57      0.44      0.50        72

    accuracy                           0.59       158
   macro avg       0.59      0.58      0.58       158
weighted avg       0.59      0.59      0.59       158

{'model__max_depth': None, 'model__max_features': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 150}


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.3s finished
