# Titanic Kaggle competition
https://www.kaggle.com/competitions/titanic/overview

In [75]:
from xgboost import XGBClassifier
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd

file_path = './data/titanic/train.csv'

def extract_title(name: str) -> str:
    import re
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        title = title_search.group(1)

        # Group similar titles together
        if title in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']:
            return 'Rare'
        elif title in ['Mlle', 'Ms']: # French
            return 'Miss'
        elif title == 'Mme': # French
            return 'Mrs'
        else:
            return title
    else:
        return 'Unknown'


def retrieve_sanitised_data_frame(file_path) -> Tuple[pd.DataFrame, pd.Series]:
    titanic_passenger_data = pd.read_csv(file_path).replace(['', ' ', '  '], np.nan)

    y = titanic_passenger_data.get('Survived', None) # handle case where it doesnt exist (test_data)

    # TODO: Ideally I put this in the pipeline
    # Try convert strings to int
    titanic_passenger_data = titanic_passenger_data.apply(pd.to_numeric, errors='ignore')
    titanic_passenger_data['SibSp'] = titanic_passenger_data['SibSp'].astype('float64')
    titanic_passenger_data['Parch'] = titanic_passenger_data['Parch'].astype('float64')
    titanic_passenger_data['Age'] = titanic_passenger_data['Age'].astype('float64')

    # Do feature engineering here on clean data
    titanic_passenger_data['FamilySize'] = titanic_passenger_data['SibSp'] + titanic_passenger_data['Parch']
    titanic_passenger_data['IsAlone'] = (titanic_passenger_data['FamilySize'] == 0).astype(int)
    titanic_passenger_data['Title'] = titanic_passenger_data['Name'].apply(extract_title)

    feature_names = titanic_passenger_data.columns
    X = titanic_passenger_data[feature_names].drop(
        ['Name', 'PassengerId', 'Survived', 'Cabin'],
        axis=1,
        errors='ignore' # If we fail to drop survived we dont care
    ) # Names probs not useful

    X = X.drop(['Ticket'], axis=1) # TODO: This is temporary just to get up and running

    return X, y

X, y = retrieve_sanitised_data_frame(file_path)
print(X.columns)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

preprocessor = ColumnTransformer(
    transformers=[
        ('median_imputer', median_imputer, ['Age']),
        ('zero_imputer', zero_imputer, ['SibSp']),
        ('categorical', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # There are some missing Embarked rows
             # List all categories explicitly. When we create fold this prevents unknown category issues
            ('encoder', OneHotEncoder(categories=[
                ['female', 'male'], # Sex
                ['C', 'Q', 'S'], # Embarked
                ['Mr', 'Mrs', 'Miss', 'Master', 'Rare'] # Title
            ]))
        ]), ['Sex', 'Embarked', 'Title']),
    ])

xgbModel = XGBClassifier(
    learning_rate=0.01,
    max_depth=3,
    n_estimators=300,
    subsample=0.8,
    n_jobs=4, # Parallelisation - number of CPU cores to use (4 cores in this case)
)
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgbModel)
                             ])

my_pipeline.fit(X_train, y_train)

preds = my_pipeline.predict(X_valid)

accuracy = accuracy_score(y_valid, preds)
print('Accuracy:', accuracy)

print('\nClassification Report:')
print(classification_report(y_valid, preds))

print('\nConfusion Matrix:')
print(confusion_matrix(y_valid, preds))

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
       'FamilySize', 'IsAlone', 'Title'],
      dtype='object')
Accuracy: 0.7892376681614349

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       128
           1       0.81      0.66      0.73        95

    accuracy                           0.79       223
   macro avg       0.79      0.77      0.78       223
weighted avg       0.79      0.79      0.79       223


Confusion Matrix:
[[113  15]
 [ 32  63]]


  titanic_passenger_data = titanic_passenger_data.apply(pd.to_numeric, errors='ignore')


In [72]:
#Grid Search for Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

def print_best_params_as_code(grid_search, model_name="best_model"):
    """Print the best parameters as copy-pastable Python code"""
    params = grid_search.best_params_

    print(f"\n# Best parameters found:")
    print(f"{model_name} = XGBClassifier(")

    # Extract model parameters (remove 'model__' prefix)
    model_params = {}
    for key, value in params.items():
        if key.startswith('model__'):
            clean_key = key.replace('model__', '')
            model_params[clean_key] = value

    # Print each parameter
    for key, value in model_params.items():
        if isinstance(value, str):
            print(f"    {key}='{value}',")
        else:
            print(f"    {key}={value},")

    print(")")

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 4, 5, 6],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 0.9, 1.0]
}

# Fit the preprocessor with ALL training data. That way when we split it into folds below, we dont end up with unseen categories
preprocessor.fit(X_train)

# Then use in GridSearch
grid_search = GridSearchCV(
    Pipeline([('preprocessor', preprocessor), ('model', xgbModel)]),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all available CPU cores
    verbose=1 # Progress info
)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print_best_params_as_code(grid_search)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [76]:
# Prepare submission

# Retrain model fully
my_pipeline.fit(X, y)

test_data_file_path = './data/titanic/test.csv'
X_test, _ = retrieve_sanitised_data_frame(test_data_file_path)

predictions_on_test_data = my_pipeline.predict(X_test)

# For PassengerId, read it separately since we drop it in retrieve_sanitised_data_frame()

test_data_raw = pd.read_csv(test_data_file_path)
output = pd.DataFrame({'PassengerId': test_data_raw.PassengerId, 'Survived': predictions_on_test_data})
output.to_csv('./data/titanic/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


  titanic_passenger_data = titanic_passenger_data.apply(pd.to_numeric, errors='ignore')


# Steps

- [x] Load data
- [x] Select columns we want
- [x] Temporarily exclude cabin and ticket
- [ ] Impute age
    - [x] Mean
    - [ ] Smarter - can we work this out from name? Ticket price? Location
- [x] One-hot encode sex
- [ ] Get smart with ticket and cabin
    - [x] Cabin tuning <-- Discovered its better to just drop the cabin! As its very sparsely populated in test set
- [x] Create decision tree
    - [x] Tune decision tree
- [x] XGBoost
- [x] Try cross fold validation <-- Not doing this as we already have a test set