# Titanic Competition - Deep Learning


In [1]:
# Setup plotting
import matplotlib.pyplot as plt

plt.style.use("seaborn-v0_8-whitegrid")
# Set Matplotlib defaults
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=18,
    titlepad=10,
)
plt.rc("animation", html="html5")

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold

from statistics import mean

import os

from utils import preprocess_data

## Preprocessing


In [2]:
# Read the data
train_data = pd.read_csv("../input/train.csv").set_index("PassengerId")
test_data = pd.read_csv("../input/test.csv").set_index("PassengerId")

X, y, X_test = preprocess_data(
    train_data,
    test_data,
    label_value="Survived",
)

## Define the model


In [3]:
import xgboost as xgb

model = xgb.XGBClassifier()

## Start testing


Key params for tuning:

- Learning Rate (learning_rate)
  - Definition: Controls the step size during each iteration of boosting.
  - Effect: A smaller learning rate makes the model more robust, but it requires more boosting rounds (n_estimators) to converge.
  - Typical Range: 0.01 to 0.3
- Number of Trees (n_estimators)
  - Definition: The number of boosting rounds or trees to build.
  - Effect: More trees generally improve accuracy but increase the risk of overfitting and computational cost.
  - Typical Range: 100 to 1000 (depending on the dataset and other parameters).
- Maximum Depth of Trees (max_depth)
  - Definition: Controls the maximum depth of each tree.
  - Effect: Deeper trees capture more complex relationships but are prone to overfitting.
  - Typical Range: 3 to 10 (higher values can lead to overfitting).
- Subsample (subsample)
  - Definition: Fraction of training data used for each tree.
  - Effect: Helps prevent overfitting by building each tree on a random subset of data. Lower values make the model more robust but can slow down convergence.
  - Typical Range: 0.5 to 1.0 (0.8 is common).

Handle imbalanced data:

- Scale Pos Weight (scale_pos_weight)
  - Definition: Balances the positive and negative classes when they are imbalanced.
  - Effect: Particularly useful for imbalanced classification problems by assigning higher weight to the minority class.
  - Typical Range: Depends on the ratio of classes, typically set as #negative examples / #positive examples.


In [4]:
# Define the K-Fold cross-validator (K=5 in this example)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# DataFrame that saves the parameters with accuracies
accuracies_df = pd.DataFrame(
    columns=[
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_child_weight",
        "gamma",
        "reg_alpha",
        "reg_lambda",
        "scale_pos_weight",
        "accuracy",
    ]
)

# Handle imbalanced data
scale_pos_weight = sum(y == 0) / sum(y == 1)

## Key parameter tuning

learning_rate, n_estimators, max_depth, subsample


In [5]:
lr_values = [0.99, 0.1, 0.11, 0.12, 0.13, 0.14]
n_est_values = [50, 100]
max_dep_values = [5, 6, 7]
subsam_values = [0.5, 0.6, 0.7, 0.8, 0.9]


for lr in lr_values:
    for n_est in n_est_values:
        for max_dep in max_dep_values:
            for subsam in subsam_values:
                acc_list = []
                for train_index, val_index in kf.split(X):
                    # Split the data into training and testing sets
                    X_train, X_val = X[train_index], X[val_index]
                    y_train, y_val = y[train_index], y[val_index]

                    # Init the model in every iteration
                    model = xgb.XGBClassifier(
                        learning_rate=lr,
                        n_estimators=n_est,
                        max_depth=max_dep,
                        subsample=subsam,
                        scale_pos_weight=scale_pos_weight,
                    )

                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_val)

                    acc_list.append(sum(y_pred == y_val) / len(y_val))

                print(
                    f"learning_rate: {lr}\t n_estimators: {n_est}\t max_depth: {max_dep}\t subsample: {subsam}\t accuracy: {mean(acc_list)}"
                )

                # Create a DataFrame for the new row
                new_row = pd.DataFrame(
                    [[lr, n_est, max_dep, subsam, scale_pos_weight, mean(acc_list)]],
                    columns=[
                        "learning_rate",
                        "n_estimators",
                        "max_depth",
                        "subsample",
                        "scale_pos_weight",
                        "accuracy",
                    ],
                )

                # Append the row using pd.concat()
                accuracies_df = pd.concat([accuracies_df, new_row], ignore_index=True)

learning_rate: 0.99	 n_estimators: 50	 max_depth: 5	 subsample: 0.5	 accuracy: 0.7991023790094783
learning_rate: 0.99	 n_estimators: 50	 max_depth: 5	 subsample: 0.6	 accuracy: 0.8058376749733225


  accuracies_df = pd.concat([accuracies_df, new_row], ignore_index=True)


learning_rate: 0.99	 n_estimators: 50	 max_depth: 5	 subsample: 0.7	 accuracy: 0.8036030381018141
learning_rate: 0.99	 n_estimators: 50	 max_depth: 5	 subsample: 0.8	 accuracy: 0.7980038917833155
learning_rate: 0.99	 n_estimators: 50	 max_depth: 5	 subsample: 0.9	 accuracy: 0.8058188437637311
learning_rate: 0.99	 n_estimators: 50	 max_depth: 6	 subsample: 0.5	 accuracy: 0.7845207457158998
learning_rate: 0.99	 n_estimators: 50	 max_depth: 6	 subsample: 0.6	 accuracy: 0.7912434875400163
learning_rate: 0.99	 n_estimators: 50	 max_depth: 6	 subsample: 0.7	 accuracy: 0.7968551879982424
learning_rate: 0.99	 n_estimators: 50	 max_depth: 6	 subsample: 0.8	 accuracy: 0.8080785889146946
learning_rate: 0.99	 n_estimators: 50	 max_depth: 6	 subsample: 0.9	 accuracy: 0.809221015629904
learning_rate: 0.99	 n_estimators: 50	 max_depth: 7	 subsample: 0.5	 accuracy: 0.7822610005649363
learning_rate: 0.99	 n_estimators: 50	 max_depth: 7	 subsample: 0.6	 accuracy: 0.7901324461741259
learning_rate: 0.99	 

In [6]:
accuracies_df.sort_values(by="accuracy", ascending=False)

Unnamed: 0,learning_rate,n_estimators,max_depth,subsample,min_child_weight,gamma,reg_alpha,reg_lambda,scale_pos_weight,accuracy
103,0.12,50,7,0.8,,,,,1.605263,0.840638
155,0.14,50,6,0.5,,,,,1.605263,0.839527
99,0.12,50,6,0.9,,,,,1.605263,0.839527
39,0.10,50,6,0.9,,,,,1.605263,0.839520
125,0.13,50,6,0.5,,,,,1.605263,0.839514
...,...,...,...,...,...,...,...,...,...,...
25,0.99,100,7,0.5,,,,,1.605263,0.785657
5,0.99,50,6,0.5,,,,,1.605263,0.784521
20,0.99,100,6,0.5,,,,,1.605263,0.784514
26,0.99,100,7,0.6,,,,,1.605263,0.783378


In [7]:
accuracies_df.to_csv("output/accuracies_tuning.csv")

Extract the parameters giving the highest value

In [8]:
accuracies_df_max_row = accuracies_df.loc[
    accuracies_df["accuracy"] == max(accuracies_df["accuracy"])
]
best_lr = accuracies_df_max_row["learning_rate"].iloc[0]
best_n_est = accuracies_df_max_row["n_estimators"].iloc[0]
best_max_dep = accuracies_df_max_row["max_depth"].iloc[0]
best_subsam = accuracies_df_max_row["subsample"].iloc[0]

## Fine-tuning

Fine-tuning:

- Minimum Child Weight (min_child_weight)
  - Definition: Minimum sum of instance weights (hessian) needed in a child node.
  - Effect: A larger value results in more conservative models, reducing the risk of overfitting by avoiding splitting nodes with insufficient instances.
  - Typical Range: 1 to 10 (increase for highly imbalanced datasets).
- Gamma (gamma)
  - Definition: Minimum loss reduction required for further splitting a node.
  - Effect: A higher value makes the algorithm more conservative, reducing the likelihood of overfitting by forcing the algorithm to focus on more significant splits.
  - Typical Range: 0 to 5 (higher for noisy datasets).
- Regularization (reg_alpha, reg_lambda)
  - Definition:
    - reg_alpha (L1 regularization): Adds L1 regularization, making some weights zero (similar to Lasso regression).
    - reg_lambda (L2 regularization): Adds L2 regularization (similar to Ridge regression).
  - Effect: Helps control model complexity and reduce overfitting, particularly in high-dimensional datasets.
  - Typical Range:
    - reg_alpha: 0 to 1 (higher values reduce complexity).
    - reg_lambda: 1 to 5 (higher values reduce overfitting).


In [9]:
# DataFrame that saves the parameters with accuracies for fine-tuning
accuracies_ft_df = pd.DataFrame(
    columns=[
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_child_weight",
        "gamma",
        "reg_alpha",
        "reg_lambda",
        "scale_pos_weight",
        "accuracy",
    ]
)

In [10]:
min_w_values = [1, 1.025, 1.05]
gam_values = [0, 0.025, 0.05]
reg_a_values = [0, 0.025, 0.05]
reg_l_values = [1, 1.05, 1.1]


for min_w in min_w_values:
    for gam in gam_values:
        for reg_a in reg_a_values:
            for reg_l in reg_l_values:
                acc_list = []
                for train_index, val_index in kf.split(X):
                    # Split the data into training and testing sets
                    X_train, X_val = X[train_index], X[val_index]
                    y_train, y_val = y[train_index], y[val_index]

                    # Init the model in every iteration
                    model = xgb.XGBClassifier(
                        learning_rate=best_lr,
                        n_estimators=best_n_est,
                        max_depth=best_max_dep,
                        subsample=best_subsam,
                        min_child_weight=min_w,
                        gamma=gam,
                        reg_alpha=reg_a,
                        reg_lambda=reg_l,
                        scale_pos_weight=scale_pos_weight,
                    )

                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_val)

                    acc_list.append(sum(y_pred == y_val) / len(y_val))

                print(
                    f"min_child_weight: {min_w}\t gamma: {gam}\t reg_alpha: {reg_a}\t reg_lambda: {reg_l}\t accuracy: {mean(acc_list)}"
                )

                # Create a DataFrame for the new row
                new_row = pd.DataFrame(
                    [
                        [
                            best_lr,
                            best_n_est,
                            best_max_dep,
                            best_subsam,
                            min_w,
                            gam,
                            reg_a,
                            reg_l,
                            scale_pos_weight,
                            mean(acc_list),
                        ]
                    ],
                    columns=[
                        "learning_rate",
                        "n_estimators",
                        "max_depth",
                        "subsample",
                        "min_child_weight",
                        "gamma",
                        "reg_alpha",
                        "reg_lambda",
                        "scale_pos_weight",
                        "accuracy",
                    ],
                )

                # Append the row using pd.concat()
                accuracies_ft_df = pd.concat(
                    [accuracies_ft_df, new_row], ignore_index=True
                )

min_child_weight: 1	 gamma: 0	 reg_alpha: 0	 reg_lambda: 1	 accuracy: 0.8406377502981608
min_child_weight: 1	 gamma: 0	 reg_alpha: 0	 reg_lambda: 1.05	 accuracy: 0.8350134957002071


  accuracies_ft_df = pd.concat(


min_child_weight: 1	 gamma: 0	 reg_alpha: 0	 reg_lambda: 1.1	 accuracy: 0.8383905592869249
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.025	 reg_lambda: 1	 accuracy: 0.8350197727700709
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.025	 reg_lambda: 1.05	 accuracy: 0.8361559224154165
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.025	 reg_lambda: 1.1	 accuracy: 0.8417738999435064
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.05	 reg_lambda: 1	 accuracy: 0.8395204318624067
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.05	 reg_lambda: 1.05	 accuracy: 0.8361496453455527
min_child_weight: 1	 gamma: 0	 reg_alpha: 0.05	 reg_lambda: 1.1	 accuracy: 0.8361433682756889
min_child_weight: 1	 gamma: 0.025	 reg_alpha: 0	 reg_lambda: 1	 accuracy: 0.837260686711443
min_child_weight: 1	 gamma: 0.025	 reg_alpha: 0	 reg_lambda: 1.05	 accuracy: 0.8350260498399347
min_child_weight: 1	 gamma: 0.025	 reg_alpha: 0	 reg_lambda: 1.1	 accuracy: 0.8395141547925429
min_child_weight: 1	 gamma: 0.025	 reg_alpha: 0.025	 reg_lamb

In [11]:
accuracies_ft_df.sort_values(by="accuracy", ascending=False)

Unnamed: 0,learning_rate,n_estimators,max_depth,subsample,min_child_weight,gamma,reg_alpha,reg_lambda,scale_pos_weight,accuracy
24,0.12,50,7,0.8,1,0.05,0.05,1,1.605263,0.845126
14,0.12,50,7,0.8,1,0.025,0.025,1.1,1.605263,0.842872
5,0.12,50,7,0.8,1,0,0.025,1.1,1.605263,0.841774
32,0.12,50,7,0.8,1.025,0,0.025,1.1,1.605263,0.841768
62,0.12,50,7,0.8,1.05,0,0.05,1.1,1.605263,0.841761
...,...,...,...,...,...,...,...,...,...,...
80,0.12,50,7,0.8,1.05,0.05,0.05,1.1,1.605263,0.831668
70,0.12,50,7,0.8,1.05,0.025,0.05,1.05,1.605263,0.830538
31,0.12,50,7,0.8,1.025,0,0.025,1.05,1.605263,0.830538
23,0.12,50,7,0.8,1,0.05,0.025,1.1,1.605263,0.829414


In [12]:
accuracies_ft_df.to_csv("output/accuracies_fine_tuning.csv")

Extract the parameters giving the highest value

In [13]:
accuracies_ft_df_max_row = accuracies_ft_df.loc[
    accuracies_ft_df["accuracy"] == max(accuracies_ft_df["accuracy"])
]
best_min_w = accuracies_ft_df_max_row["min_child_weight"].iloc[0]
best_gam = accuracies_ft_df_max_row["gamma"].iloc[0]
best_reg_a = accuracies_ft_df_max_row["reg_alpha"].iloc[0]
best_reg_l = accuracies_ft_df_max_row["reg_lambda"].iloc[0]

## Submit prediction


In [14]:
model = xgb.XGBClassifier(
    learning_rate=best_lr,
    n_estimators=best_n_est,
    max_depth=best_max_dep,
    subsample=best_subsam,
    min_child_weight=best_min_w,
    gamma=best_gam,
    reg_alpha=best_reg_a,
    reg_lambda=best_reg_l,
    scale_pos_weight=scale_pos_weight,
)

model.fit(X, y)
y_pred = model.predict(X_test)

In [15]:
predictions = y_pred

# Saving the predictions
output = pd.DataFrame({"PassengerId": test_data.index, "Survived": predictions})
if not os.path.isdir("output/"):
    os.mkdir("output/")
output.to_csv("output/submission.csv", index=False)

print("Your submission was successfully saved!")

Your submission was successfully saved!
