In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from xgboost import XGBClassifier
from scipy.stats import randint, uniform, loguniform

pd.set_option("display.max_columns", None)

## 1) Load Data

In [2]:
cwd = Path().resolve()

if (cwd / "notebooks").exists():
    ROOT = cwd
elif (cwd == "notebooks"):
    ROOT = cwd.parent
else:
    ROOT = cwd

train = pd.read_csv((ROOT / "data/train.csv"))
test = pd.read_csv((ROOT / "data/test.csv"))

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2) Feature Engineering

In [3]:
def engineer(df):
    out = df.copy()

    # Title
    out["Title"] = out["Name"].str.extract(r",\s*([^\.]+)\.")
    out["Title"] = out["Title"].replace({
        "Mlle":"Miss", "Ms":"Miss", "Mme":"Mrs",
        "Lady":"Rare","the Countess":"Rare","Capt":"Rare","Col":"Rare",
        "Don":"Rare","Dr":"Rare","Major":"Rare","Rev":"Rare",
        "Sir":"Rare","Jonkheer":"Rare"
    })

    # Family
    out["FamilySize"] = out["SibSp"] + out["Parch"] + 1
    out["IsAlone"] = (out["FamilySize"] == 1).astype(int)

    # Cabin
    out["CabinFlag"] = out["Cabin"].notna().astype(int)

    # Deck - Omitted
    # out["Deck"] = out["Cabin"].str[0]
    # out["Deck"] = out["Deck"].replace({
        # "G": "Rare", "T": "Rare"
    # })
    # out["Deck"] = out["Deck"].fillna("Unknown")

    # Drop high-card cols
    drop_cols = ["Name", "Ticket", "Cabin"]
    out = out.drop(columns=[c for c in drop_cols if c in out.columns], errors="ignore")

    return out

train_fe = engineer(train)
test_fe = engineer(test)

## 3) Split

In [4]:
y = train_fe.Survived
X = train_fe.drop(columns=["Survived"])

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

## 4) Scoring

In [5]:

def evaluate_model(name, model, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid):

    print(f"\n=== {name} ===")

    y_train_pred = model.predict(X_train)
    y_train_proba = model.predict_proba(X_train)[:, 1]

    print("\nTraining performance:")
    print(f"  Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
    print(f"  ROC AUC : {roc_auc_score(y_train, y_train_proba):.4f}")

    y_val_pred = model.predict(X_valid)
    y_val_proba = model.predict_proba(X_valid)[:, 1]

    print("\nValidation performance:")
    print(f"  Accuracy: {accuracy_score(y_valid, y_val_pred):.4f}")
    print(f"  ROC AUC : {roc_auc_score(y_valid, y_val_proba):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_valid, y_val_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_valid, y_val_pred))

In [6]:
def cross_validate(model, cv_folds=5):

    print(f"\nCross-Validation ({cv_folds}-fold) ROC AUC:")

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    cv_scores = cross_val_score(
        model,
        X,
        y,
        cv=cv,
        scoring="roc_auc",
        n_jobs=-1
    )

    print("  Scores:", cv_scores)
    print(f"  Mean:   {cv_scores.mean():.4f}")
    print(f"  Std:    {cv_scores.std():.4f}")

## 5) Preprocessing

In [7]:
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

numeric_tf = SimpleImputer(strategy="median")
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat", OneHotEncoder(handle_unknown="ignore"))
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols)
    ]
)

## 6) Logistic Regression Baseline

In [8]:
clf = LogisticRegression(max_iter=200, n_jobs=-1)
pipe = Pipeline(steps=[
    ("pre", pre),
    ("clf", clf)
])
pipe.fit(X_train, y_train)

evaluate_model("Logistic Regression (Baseline)", pipe)
cross_validate(pipe)



=== Logistic Regression (Baseline) ===

Training performance:
  Accuracy: 0.8371
  ROC AUC : 0.8765

Validation performance:
  Accuracy: 0.8324
  ROC AUC : 0.8646

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       110
           1       0.80      0.75      0.78        69

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[97 13]
 [17 52]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.89538867 0.86804813 0.8473262  0.85614973 0.88525462]
  Mean:   0.8704
  Std:    0.0178


## 7) Logistic Regression Tuning (GridSearchCV)

In [9]:
lr_tuned_pipe = Pipeline([
    ('pre', pre),
    ('lr_clf', LogisticRegression(max_iter=200))
])

param_grid = {
    'lr_clf__C': [0.01, 0.1, 1, 10, 100],
    'lr_clf__penalty': ['l1', 'l2'],
    'lr_clf__solver': ['liblinear']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=lr_tuned_pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    verbose=1
)

grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'lr_clf__C': 1, 'lr_clf__penalty': 'l2', 'lr_clf__solver': 'liblinear'}

In [10]:
best_lr = grid_search.best_estimator_

evaluate_model("Logistic Regression (Tuned)", best_lr)
cross_validate(best_lr)


=== Logistic Regression (Tuned) ===

Training performance:
  Accuracy: 0.8357
  ROC AUC : 0.8775

Validation performance:
  Accuracy: 0.8324
  ROC AUC : 0.8688

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       110
           1       0.80      0.75      0.78        69

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[97 13]
 [17 52]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.89907773 0.86697861 0.84826203 0.8578877  0.89017418]
  Mean:   0.8725
  Std:    0.0192


## 8) Random Forest Baseline
- The model is overfitting the training data

In [11]:
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_pipe = Pipeline([
    ("pre", pre),
    ("clf", rf_clf)
])

rf_pipe.fit(X_train, y_train)


evaluate_model("Random Forest (Baseline)", rf_pipe)
cross_validate(rf_pipe)


=== Random Forest (Baseline) ===

Training performance:
  Accuracy: 1.0000
  ROC AUC : 1.0000

Validation performance:
  Accuracy: 0.8156
  ROC AUC : 0.8470

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.72      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

Confusion Matrix:
[[96 14]
 [19 50]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.89848485 0.8618984  0.83863636 0.86824866 0.89529318]
  Mean:   0.8725
  Std:    0.0222


## 9) Random Forest Tuning (RandomizedSearchCV)

In [12]:
rf_clf_tuned = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_pipe_tuned = Pipeline([
    ("pre", pre),
    ("clf", rf_clf_tuned)
])

"""
param_dist = {
    "clf__n_estimators": [100, 150, 200, 250],
    "clf__max_depth": [None, 3, 5, 7, 10],
    "clf__min_samples_split": [2, 3, 5],
    "clf__min_samples_leaf": [1, 2, 3, 4],
    "clf__max_features": ["sqrt", "log2", 0.5],
    "clf__bootstrap": [True, False]
}
 """

param_dist = {
    "clf__n_estimators": randint(50, 500),
    "clf__max_depth": randint(3, 15),
    "clf__min_samples_split": randint(2, 20),
    "clf__min_samples_leaf": randint(1, 10),
    # "clf__max_features": uniform(0.3, 0.5),
    "clf__max_features": ["sqrt", "log2", 0.5],
    'clf__criterion': ['gini', 'entropy'],
    # 'clf__max_leaf_nodes': [None] + list(range(10, 100, 10)),
    # 'clf__min_impurity_decrease': uniform(0.0, 0.1),
    'clf__class_weight': ['balanced', 'balanced_subsample', None],
    # "clf__bootstrap": [True, False]
    "clf__bootstrap": [True]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    rf_pipe_tuned,
    param_distributions=param_dist,
    n_iter=60,
    scoring="roc_auc",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

search.fit(X_train, y_train)
search.best_params_

Fitting 5 folds for each of 60 candidates, totalling 300 fits


{'clf__bootstrap': True,
 'clf__class_weight': 'balanced_subsample',
 'clf__criterion': 'entropy',
 'clf__max_depth': 14,
 'clf__max_features': 'sqrt',
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 13,
 'clf__n_estimators': 363}

In [13]:
best_rf = search.best_estimator_

evaluate_model("Random Forest (Tuned)", best_rf)
cross_validate(best_rf)


=== Random Forest (Tuned) ===

Training performance:
  Accuracy: 0.8961
  ROC AUC : 0.9712

Validation performance:
  Accuracy: 0.8268
  ROC AUC : 0.8490

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       110
           1       0.77      0.78      0.78        69

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[94 16]
 [15 54]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.91317523 0.8776738  0.8394385  0.86323529 0.88791384]
  Mean:   0.8763
  Std:    0.0246


## 10) XGBoost Baseline

In [14]:
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

xgb_pipe = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf)
])

xgb_pipe.fit(X_train, y_train)

evaluate_model("XGBoost (Baseline)", xgb_pipe)


=== XGBoost (Baseline) ===

Training performance:
  Accuracy: 0.9129
  ROC AUC : 0.9750

Validation performance:
  Accuracy: 0.7933
  ROC AUC : 0.8493

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Confusion Matrix:
[[94 16]
 [21 48]]


## 11) XGBoost Tuning

In [15]:
xgb_clf_tuned = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)

xgb_pipe_tuned = Pipeline([
    ("pre", pre),
    ("clf", xgb_clf_tuned)
])

"""
xgb_param_dist = {
    "clf__n_estimators": [100, 200, 300, 400],
    "clf__max_depth": [2, 3, 4],
    "clf__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "clf__subsample": [0.7, 0.9, 1.0],
    "clf__colsample_bytree": [0.7, 0.9, 1.0],
    "clf__min_child_weight": [1, 3, 5],
    "clf__gamma": [0, 0.5, 1],
    "clf__reg_lambda": [1, 2, 5],
    "clf__reg_alpha": [0, 0.1, 0.5],
}
 """

xgb_param_dist = {
    "clf__n_estimators": randint(200, 800),
    "clf__max_depth": randint(2, 6),
    "clf__learning_rate": loguniform(0.01, 0.2),  # ~0.01–0.2
    "clf__subsample": uniform(0.6, 0.4),  # 0.6–1.0
    "clf__colsample_bytree": uniform(0.6, 0.4),  # 0.6–1.0
    "clf__min_child_weight": randint(1, 8),
    "clf__gamma": uniform(0.0, 0.5),
    "clf__reg_lambda": loguniform(1e-2, 10),  # L2
    "clf__reg_alpha": loguniform(1e-4, 1),  # L1
}

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

xgb_search = RandomizedSearchCV(
    estimator=xgb_pipe_tuned,
    param_distributions=xgb_param_dist,
    n_iter=100,
    scoring="roc_auc",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

xgb_search.fit(X_train, y_train)
xgb_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


{'clf__colsample_bytree': np.float64(0.9442922333025374),
 'clf__gamma': np.float64(0.0034760652655953517),
 'clf__learning_rate': np.float64(0.046184639800512954),
 'clf__max_depth': 2,
 'clf__min_child_weight': 6,
 'clf__n_estimators': 424,
 'clf__reg_alpha': np.float64(0.00030162092627967796),
 'clf__reg_lambda': np.float64(0.10300196600986772),
 'clf__subsample': np.float64(0.9771638815650077)}

In [16]:
xgb_search.best_score_

np.float64(0.8851101796438787)

In [17]:
best_xgb = xgb_search.best_estimator_

best_xgb_pred = best_xgb.predict(X_valid)
best_xgb_proba = best_xgb.predict_proba(X_valid)[:, 1]

evaluate_model("XGBoost (Tuned)", best_xgb)
cross_validate(best_xgb)


=== XGBoost (Tuned) ===

Training performance:
  Accuracy: 0.8778
  ROC AUC : 0.9400

Validation performance:
  Accuracy: 0.8156
  ROC AUC : 0.8397

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

Confusion Matrix:
[[97 13]
 [20 49]]

Cross-Validation (5-fold) ROC AUC:
  Scores: [0.92108037 0.87032086 0.84665775 0.86530749 0.86983114]
  Mean:   0.8746
  Std:    0.0248


## Early Stopping

In [18]:

pipeline = Pipeline([
    ('pre', pre),
    ('xgboost', XGBClassifier(
        n_estimators=1000,  # Set high
        early_stopping_rounds=20,
        eval_metric='logloss'
    ))
])

param_distributions = {
    'xgboost__learning_rate': uniform(0.01, 0.3),
    'xgboost__max_depth': randint(3, 10),
    'xgboost__subsample': uniform(0.6, 0.4),
    'xgboost__colsample_bytree': uniform(0.6, 0.4),
    'xgboost__min_child_weight': randint(1, 10),
    'xgboost__reg_alpha': uniform(0, 1),
    'xgboost__reg_lambda': uniform(0, 2)
}

X_val_transformed = pipeline[:-1].fit_transform(X_valid)

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=50,
    cv=3,
    scoring='neg_log_loss',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

random_search.fit(
    X_train, y_train,
    xgboost__eval_set=[(X_val_transformed, y_valid)],
    xgboost__verbose=False
)

best_xgb_model = random_search.best_estimator_
print(f"Best params: {random_search.best_params_}")
print(f"Best trees used: {best_xgb_model.named_steps['xgboost'].best_iteration}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best params: {'xgboost__colsample_bytree': np.float64(0.881207583558071), 'xgboost__learning_rate': np.float64(0.11908888071378819), 'xgboost__max_depth': 3, 'xgboost__min_child_weight': 4, 'xgboost__reg_alpha': np.float64(0.2468760628386012), 'xgboost__reg_lambda': np.float64(1.3926085456795767), 'xgboost__subsample': np.float64(0.8849082359697769)}
Best trees used: 15
