In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBClassifier

pd.set_option("display.max_columns", None)

## 1) Load Data

In [2]:
cwd = Path().resolve()

if (cwd / "notebooks").exists():
    ROOT = cwd
elif (cwd == "notebooks"):
    ROOT = cwd.parent
else:
    ROOT = cwd

train = pd.read_csv((ROOT / "data/train.csv"))
test = pd.read_csv((ROOT / "data/test.csv"))

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2) Feature Engineering

In [3]:
def engineer(df):
    out = df.copy()

    # Title
    out["Title"] = out["Name"].str.extract(r",\s*([^\.]+)\.")
    out["Title"] = out["Title"].replace({
        "Mlle":"Miss", "Ms":"Miss", "Mme":"Mrs",
        "Lady":"Rare","the Countess":"Rare","Capt":"Rare","Col":"Rare",
        "Don":"Rare","Dr":"Rare","Major":"Rare","Rev":"Rare",
        "Sir":"Rare","Jonkheer":"Rare"
    })

    # Family
    out["FamilySize"] = out["SibSp"] + out["Parch"] + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)

    # Cabin
    out["CabinFlag"] = out["Cabin"].notna().astype(int)

    # Deck - Omitted
    # out["Deck"] = out["Cabin"].str[0]
    # out["Deck"] = out["Deck"].replace({
        # "G": "Rare", "T": "Rare"
    # })
    # out["Deck"] = out["Deck"].fillna("Unknown")

    # Drop high-card cols
    drop_cols = ["Name", "Ticket", "Cabin"]
    out = out.drop(columns=[c for c in drop_cols if c in out.columns], errors="ignore")

    return out

train_fe = engineer(train)
test_fe = engineer(test)

## 3) Split

In [4]:
y = train_fe.Survived
X = train_fe.drop(columns=["Survived"])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

## 3) Scoring Format

In [5]:

# def evaluate_model(name, model, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid):
#     # Train predictions
#     y_train_pred = model.predict(X_train)
#     y_train_proba = model.predict_proba(X_train)[:, 1]

#     # Validation predictions
#     y_val_pred = model.predict(X_valid)
#     y_val_proba = model.predict_proba(X_valid)[:, 1]

#     print(f"\n=== {name} ===")

#     print("\nTraining performance:")
#     print(f"  Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
#     print(f"  ROC AUC : {roc_auc_score(y_train, y_train_proba):.4f}")

#     print("\nValidation performance:")
#     print(f"  Accuracy: {accuracy_score(y_valid, y_val_pred):.4f}")
#     print(f"  ROC AUC : {roc_auc_score(y_valid, y_val_proba):.4f}")

#     print("\nClassification Report:")
#     print(classification_report(y_valid, y_val_pred))

#     print("Confusion Matrix:")
#     print(confusion_matrix(y_valid, y_val_pred))

In [6]:

# def evaluate_model(name, model, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, cv_folds=0):
#     # Train predictions
#     y_train_pred = model.predict(X_train)
#     y_train_proba = model.predict_proba(X_train)[:, 1]

#     # Validation predictions
#     y_val_pred = model.predict(X_valid)
#     y_val_proba = model.predict_proba(X_valid)[:, 1]

#     print(f"\n=== {name} ===")

#     print("\nTraining performance:")
#     print(f"  Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
#     print(f"  ROC AUC : {roc_auc_score(y_train, y_train_proba):.4f}")

#     print("\nValidation performance:")
#     print(f"  Accuracy: {accuracy_score(y_valid, y_val_pred):.4f}")
#     print(f"  ROC AUC : {roc_auc_score(y_valid, y_val_proba):.4f}")

#     print("\nClassification Report:")
#     print(classification_report(y_valid, y_val_pred))

#     print("Confusion Matrix:")
#     print(confusion_matrix(y_valid, y_val_pred))

## 3) Split

In [7]:

def evaluate_model(name, model, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid, cv_folds=0):

    if cv_folds <= 1:

        print(f"\n=== {name} ===")

        # ---- Training performance ----
        y_train_pred = model.predict(X_train)
        y_train_proba = model.predict_proba(X_train)[:, 1]

        print("\nTraining performance:")
        print(f"  Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
        print(f"  ROC AUC : {roc_auc_score(y_train, y_train_proba):.4f}")


        # ---- Validation performance ----
        y_val_pred = model.predict(X_valid)
        y_val_proba = model.predict_proba(X_valid)[:, 1]

        print("\nValidation performance:")
        print(f"  Accuracy: {accuracy_score(y_valid, y_val_pred):.4f}")
        print(f"  ROC AUC : {roc_auc_score(y_valid, y_val_proba):.4f}")

        print("\nClassification Report:")
        print(classification_report(y_valid, y_val_pred))

        print("Confusion Matrix:")
        print(confusion_matrix(y_valid, y_val_pred))


    # ---- Optional Cross-Validation ----
    else:
        print(f"\nCross-Validation ({cv_folds}-fold) ROC AUC:")

        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

        cv_scores = cross_val_score(
            model,
            X,
            y,
            cv=cv,
            scoring="roc_auc",
            n_jobs=-1
        )

        print("  Scores:", cv_scores)
        print(f"  Mean:   {cv_scores.mean():.4f}")
        print(f"  Std:    {cv_scores.std():.4f}")

## 4) Preprocessing

In [8]:
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

numeric_tf = SimpleImputer(strategy="median")
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols)
    ]
)

## 5) Baseline model (Logistic Regression)

In [9]:
clf = LogisticRegression(max_iter=200, n_jobs=-1)
pipe = Pipeline(steps=[
    ("pre", pre),
    ("clf", clf)
])
pipe.fit(X_train, y_train)

# pred_valid = pipe.predict(X_valid)
# proba_valid = pipe.predict_proba(X_valid)[:, 1]

# print(f"VALID accuracy: {accuracy_score(y_valid, pred_valid):.4f}")
# print(f"VALID ROC AUC: {roc_auc_score(y_valid, proba_valid):.4f}")
# print(classification_report(y_valid, pred_valid, digits=4))
# confusion_matrix(y_valid, pred_valid)

evaluate_model("Logistic Regression (Baseline)", pipe)



=== Logistic Regression (Baseline) ===

Training performance:
  Accuracy: 0.8371
  ROC AUC : 0.8765

Validation performance:
  Accuracy: 0.8324
  ROC AUC : 0.8646

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       110
           1       0.80      0.75      0.78        69

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[97 13]
 [17 52]]


## 6) 5-fold Cross-Validation

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")
cv_scores.mean().round(4), cv_scores.std(),round(4), cv_scores

(np.float64(0.8283),
 np.float64(0.007958111654076347),
 4,
 array([0.84357542, 0.8258427 , 0.82022472, 0.8258427 , 0.8258427 ]))

## 7) Random Forest Baseline
- RF not well suited for this dataset

In [11]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_pipe = Pipeline([
    ("pre", pre),
    ("clf", rf_clf)
])

rf_pipe.fit(X_train, y_train)

# rf_pred = rf_pipe.predict(X_valid)
# rf_proba = rf_pipe.predict_proba(X_valid)[:, 1]

# print("Random Forest - Accuracy:", accuracy_score(y_valid, rf_pred))
# print("Random Forest - ROC AUC:", roc_auc_score(y_valid, rf_proba))
# print(classification_report(y_valid, rf_pred))
# print("Confusion matrix:\n", confusion_matrix(y_valid, rf_pred))

evaluate_model("Random Forest (Baseline)", rf_pipe)


=== Random Forest (Baseline) ===

Training performance:
  Accuracy: 1.0000
  ROC AUC : 1.0000

Validation performance:
  Accuracy: 0.8156
  ROC AUC : 0.8470

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.72      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

Confusion Matrix:
[[96 14]
 [19 50]]


## 8) Random Forest Tuning (RandomizedSearchCV)

In [12]:
rf_clf_tuned = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_pipe_tuned = Pipeline([
    ("pre", pre),
    ("clf", rf_clf_tuned)
])

param_dist = {
    "clf__n_estimators": [100, 150, 200, 250],
    "clf__max_depth": [None, 3, 5, 7, 10],
    "clf__min_samples_split": [2, 3, 5],
    "clf__min_samples_leaf": [1, 2, 3, 4],
    "clf__max_features": ["sqrt", "log2", 0.5],
    "clf__bootstrap": [True, False]
}

search = RandomizedSearchCV(
    rf_pipe_tuned,
    param_distributions=param_dist,
    n_iter=25,
    scoring="roc_auc",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


{'clf__n_estimators': 150,
 'clf__min_samples_split': 3,
 'clf__min_samples_leaf': 2,
 'clf__max_features': 'log2',
 'clf__max_depth': None,
 'clf__bootstrap': True}

#### No meaningful improvement from tuning
#### Model is slightly overfitting the training data

In [13]:
best_rf = search.best_estimator_

best_rf_train_pred = best_rf.predict(X_train)
best_rf_train_proba = best_rf.predict_proba(X_train)[:, 1]
best_rf_pred = best_rf.predict(X_valid)
best_rf_proba = best_rf.predict_proba(X_valid)[:, 1]

# print("Random Forest (Tuned):\n")
# print("Training accuracy:", accuracy_score(y_train, best_rf_train_pred))
# print("Training ROC AUC:", roc_auc_score(y_train, best_rf_train_proba), "\n")

# print("Validation accuracy:", accuracy_score(y_valid, best_rf_pred))
# print("Validation ROC AUC:", roc_auc_score(y_valid, best_rf_proba), "\n")
# print(classification_report(y_valid, best_rf_pred))
# print("Confusion matrix:\n", confusion_matrix(y_valid, best_rf_pred))

evaluate_model("Random Forest (Tuned)", best_rf)


=== Random Forest (Tuned) ===

Training performance:
  Accuracy: 0.9213
  ROC AUC : 0.9860

Validation performance:
  Accuracy: 0.8101
  ROC AUC : 0.8511

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[96 14]
 [20 49]]


## 9) XGBoost Baseline

In [37]:
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

xgb_pipe = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf)
])

xgb_pipe.fit(X_train, y_train)

# xgb_pred = xgb_pipe.predict(X_valid)
# xgb_proba = xgb_pipe.predict_proba(X_valid)[:, 1]

# print("XGB Accuracy:", accuracy_score(y_valid, xgb_pred))
# print("XGB ROC AUC:", roc_auc_score(y_valid, xgb_proba))
# print(classification_report(y_valid, xgb_pred))
# print("Confusion matrix:\n", confusion_matrix(y_valid, xgb_pred))

evaluate_model("XGBoost (Baseline)", xgb_pipe)


=== XGBoost (Baseline) ===

Training performance:
  Accuracy: 0.9129
  ROC AUC : 0.9750

Validation performance:
  Accuracy: 0.7933
  ROC AUC : 0.8493

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Confusion Matrix:
[[94 16]
 [21 48]]


## 10) XGBoost Tuning

In [15]:
xgb_clf_tuned = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

xgb_pipe_tuned = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf_tuned)
])

xgb_param_dist = {
    "clf__n_estimators": [100, 200, 300, 400],
    "clf__max_depth": [2, 3, 4],
    "clf__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "clf__subsample": [0.7, 0.9, 1.0],
    "clf__colsample_bytree": [0.7, 0.9, 1.0],
    "clf__min_child_weight": [1, 3, 5],
    "clf__gamma": [0, 0.5, 1],
    "clf__reg_lambda": [1, 2, 5],
    "clf__reg_alpha": [0, 0.1, 0.5],
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

xgb_search = RandomizedSearchCV(
    estimator=xgb_pipe_tuned,
    param_distributions=xgb_param_dist,
    n_iter=25,
    scoring="roc_auc",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

xgb_search.fit(X_train, y_train)
xgb_search.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


{'clf__subsample': 1.0,
 'clf__reg_lambda': 5,
 'clf__reg_alpha': 0,
 'clf__n_estimators': 300,
 'clf__min_child_weight': 1,
 'clf__max_depth': 3,
 'clf__learning_rate': 0.05,
 'clf__gamma': 0,
 'clf__colsample_bytree': 1.0}

In [16]:
xgb_search.best_score_

np.float64(0.8824363277498073)

In [None]:
best_xgb = xgb_search.best_estimator_

best_xgb_pred = best_xgb.predict(X_valid)
best_xgb_proba = best_xgb.predict_proba(X_valid)[:, 1]

evaluate_model("XGBoost (Tuned)", best_xgb)


=== XGBoost (Tuned) ===

Training performance:
  Accuracy: 0.9045
  ROC AUC : 0.9539

Validation performance:
  Accuracy: 0.8156
  ROC AUC : 0.8387

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

Confusion Matrix:
[[97 13]
 [20 49]]

Cross-Validation (10-fold) ROC AUC:
  Scores: [0.91116883 0.87807487 0.84358289 0.91871658 0.79411765 0.87807487
 0.88983957 0.85320856 0.88930481 0.81798942]
  Mean:   0.8674
  Std:    0.0379


In [21]:
evaluate_model("XGBoost (Tuned)", best_xgb, cv_folds=10)


Cross-Validation (10-fold) ROC AUC:
  Scores: [0.91116883 0.87807487 0.84358289 0.91871658 0.79411765 0.87807487
 0.88983957 0.85320856 0.88930481 0.81798942]
  Mean:   0.8674
  Std:    0.0379


#### Adding more regularization to generalize better

In [18]:
xgb_clf_reg = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,   # up from 1
    subsample=1,        # down from 1.0
    colsample_bytree=1, # down from 1.0
    reg_lambda=5,
    reg_alpha=6,        # add a bit of L1
    gamma=0,
    n_jobs=-1,
    random_state=42,
)

xgb_reg_pipe = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf_reg)
])

xgb_reg_pipe.fit(X_train, y_train)

evaluate_model("XGBoost (Tuned + Reg)", xgb_reg_pipe)
evaluate_model("XGBoost (Tuned + Reg)", xgb_reg_pipe, cv_folds=5)


=== XGBoost (Tuned + Reg) ===

Training performance:
  Accuracy: 0.8497
  ROC AUC : 0.9035

Validation performance:
  Accuracy: 0.8492
  ROC AUC : 0.8506

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       110
           1       0.82      0.78      0.80        69

    accuracy                           0.85       179
   macro avg       0.84      0.84      0.84       179
weighted avg       0.85      0.85      0.85       179

Confusion Matrix:
[[98 12]
 [15 54]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.89591568 0.87586898 0.84331551 0.85387701 0.87920489]
  Mean:   0.8696
  Std:    0.0188


In [19]:
xgb_clf_reg = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=4,   # up from 1
    subsample=1,        # down from 1.0
    colsample_bytree=1, # down from 1.0
    reg_lambda=5,
    reg_alpha=6,        # add a bit of L1
    gamma=0,
    n_jobs=-1,
    random_state=42,
)

xgb_reg_pipe = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf_reg)
])

xgb_reg_pipe.fit(X_train, y_train)

# evaluate_model("XGBoost (Tuned + Reg)", xgb_reg_pipe)
evaluate_model("XGBoost (Tuned + Reg)", xgb_reg_pipe, cv_folds=5)


Cross-Validation (5-fold) ROC AUC:
  Scores: [0.89380764 0.87473262 0.84338235 0.85006684 0.88432389]
  Mean:   0.8693
  Std:    0.0195


## Early Stopping

In [34]:
# Split train into train_full + valid for early stopping
X_train_full, X_valid, y_train_full, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Fit the preprocessor on train_full only
pre.fit(X_train_full)
X_train_full_proc = pre.transform(X_train_full)
X_valid_proc = pre.transform(X_valid)

# Strip 'clf__' from best_params_ to feed XGBClassifier
best_xgb_params = {
    k.replace("clf__", ""): v
    for k, v in search.best_params_.items()
}

xgb_early = XGBClassifier(
    n_estimators=2000,          # high, rely on early stopping
    tree_method="hist",
    eval_metric="auc",
    early_stopping_rounds=50,   # set in constructor (XGBoost 2.x)
    random_state=42,
    n_jobs=-1,
    **best_xgb_params,
)

xgb_early.fit(
    X_train_full_proc,
    y_train_full,
    eval_set=[(X_valid_proc, y_valid)],
    verbose=False,
)

print("Best iteration:", xgb_early.best_iteration)
print("Best AUC on eval_set:", xgb_early.best_score)


Best iteration: 84
Best AUC on eval_set: 0.8729338842975206
