# Titanic Kaggle competition
https://www.kaggle.com/competitions/titanic/overview

In [40]:
from scipy.signal import qspline1d
from xgboost import XGBClassifier
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd

file_path = './data/titanic/train.csv'

def retrieve_sanitised_data_frame(file_path) -> Tuple[pd.DataFrame, pd.Series]:
    titanic_passenger_data = pd.read_csv(file_path).replace(['', ' ', '  '], np.nan)

    y = titanic_passenger_data.get('Survived', None) # handle case where it doesnt exist (test_data)

    feature_names = titanic_passenger_data.columns
    X = titanic_passenger_data[feature_names].drop(
        ['Name', 'PassengerId', 'Survived', 'Cabin'],
        axis=1,
        errors='ignore' # If we fail to drop survived we dont care
    ) # Names probs not useful

    X = X.drop(['Ticket'], axis=1) # TODO: This is temporary just to get up and running

    return X, y

X, y = retrieve_sanitised_data_frame(file_path)
print(X.columns)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

print(X[['Sex', 'Embarked']].isnull().sum())

mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

preprocessor = ColumnTransformer(
    transformers=[
        # ('mean_imputer', mean_imputer, ['Age']),
        ('median_imputer', median_imputer, ['Age']),
        ('zero_imputer', zero_imputer, ['SibSp']),
        ('categorical', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # There are some missing Embarked rows
             # List all categories explicitly. When we create fold this prevents unknown category issues
            ('encoder', OneHotEncoder(categories=[['female', 'male'], ['C', 'Q', 'S']]))
        ]), ['Sex', 'Embarked']),
    ])

xgbModel = XGBClassifier(
    learning_rate=0.01,
    max_depth=3,
    n_estimators=300,
    subsample=0.8,
    n_jobs=4, # Parallelisation - number of CPU cores to use (4 cores in this case)
)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgbModel)
                             ])

my_pipeline.fit(X_train, y_train)

preds = my_pipeline.predict(X_valid)

accuracy = accuracy_score(y_valid, preds)
print('Accuracy:', accuracy)

print('\nClassification Report:')
print(classification_report(y_valid, preds))

print('\nConfusion Matrix:')
print(confusion_matrix(y_valid, preds))

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Sex         0
Embarked    2
dtype: int64
Accuracy: 0.7847533632286996

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       128
           1       0.81      0.65      0.72        95

    accuracy                           0.78       223
   macro avg       0.79      0.77      0.77       223
weighted avg       0.79      0.78      0.78       223


Confusion Matrix:
[[113  15]
 [ 33  62]]


In [39]:
#Grid Search for Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

def print_best_params_as_code(grid_search, model_name="best_model"):
    """Print the best parameters as copy-pastable Python code"""
    params = grid_search.best_params_

    print(f"\n# Best parameters found:")
    print(f"{model_name} = XGBClassifier(")

    # Extract model parameters (remove 'model__' prefix)
    model_params = {}
    for key, value in params.items():
        if key.startswith('model__'):
            clean_key = key.replace('model__', '')
            model_params[clean_key] = value

    # Print each parameter
    for key, value in model_params.items():
        if isinstance(value, str):
            print(f"    {key}='{value}',")
        else:
            print(f"    {key}={value},")

    print(")")

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5, 6],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 0.9, 1.0]
}

# Fit the preprocessor with ALL training data. That way when we split it into folds below, we dont end up with unseen categories
preprocessor.fit(X_train)

# Then use in GridSearch
grid_search = GridSearchCV(
    Pipeline([('preprocessor', preprocessor), ('model', xgbModel)]),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all available CPU cores
    verbose=1 # Progress info
)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print_best_params_as_code(grid_search)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 300, 'model__subsample': 0.8}

# Best parameters found:
best_model = XGBClassifier(
    learning_rate=0.01,
    max_depth=3,
    n_estimators=300,
    subsample=0.8,
)


In [98]:
# Prepare submission

# Retrain model fully
my_pipeline.fit(X, y)

test_data_file_path = './data/titanic/test.csv'
test_data = pd.read_csv(test_data_file_path).replace(['', ' ', '  '], np.nan)

predictions_on_test_data = my_pipeline.predict(test_data)

# Save to CSV
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions_on_test_data})
output.to_csv('./data/titanic/submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!


# Steps

- [x] Load data
- [x] Select columns we want
- [x] Temporarily exclude cabin and ticket
- [ ] Impute age
    - [x] Mean
    - [ ] Smarter - can we work this out from name? Ticket price? Location
- [x] One-hot encode sex
- [ ] Get smart with ticket and cabin
    - [x] Cabin tuning <-- Discovered its better to just drop the cabin! As its very sparsely populated in test set
- [x] Create decision tree
    - [x] Tune decision tree
- [x] XGBoost
- [x] Try cross fold validation <-- Not doing this as we already have a test set