# Titanic Kaggle competition
https://www.kaggle.com/competitions/titanic/overview

In [44]:
from numpy.f2py.symbolic import as_eq
from xgboost import XGBClassifier
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd

file_path = './data/titanic/train.csv'

def extract_title(name: str) -> str:
    """Return the normalised honorific found in a passenger name.

    The title is the first run of ASCII letters that follows a space and is
    terminated by a period (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).

    Args:
        name: Passenger name. Non-string values (``None``, ``NaN`` produced
            when blank cells are loaded as missing) are tolerated.

    Returns:
        ``'Rare'`` for uncommon honorifics, ``'Miss'``/``'Mrs'`` for their
        French equivalents, the raw title otherwise, and ``'Unknown'`` when
        no title can be found or ``name`` is not a string.
    """
    import re

    # A missing name would make re.search raise TypeError; treat it like a
    # name without any recognisable title instead of crashing the pipeline.
    if not isinstance(name, str):
        return 'Unknown'

    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search is None:
        return 'Unknown'

    title = title_search.group(1)

    # Group similar titles together
    rare_titles = {
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
    }
    if title in rare_titles:
        return 'Rare'

    # French honorifics (and 'Ms') are folded into their English counterparts.
    equivalents = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    return equivalents.get(title, title)


from xgboost import XGBClassifier
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

file_path = './data/titanic/train.csv'

def extract_title(name: str) -> str:
    """Return the normalised honorific found in a passenger name.

    The title is the first run of ASCII letters that follows a space and is
    terminated by a period (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).

    Args:
        name: Passenger name. Non-string values (``None``, ``NaN`` produced
            when blank cells are loaded as missing) are tolerated.

    Returns:
        ``'Rare'`` for uncommon honorifics, ``'Miss'``/``'Mrs'`` for their
        French equivalents, the raw title otherwise, and ``'Unknown'`` when
        no title can be found or ``name`` is not a string.
    """
    import re

    # A missing name would make re.search raise TypeError; treat it like a
    # name without any recognisable title instead of crashing the pipeline.
    if not isinstance(name, str):
        return 'Unknown'

    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search is None:
        return 'Unknown'

    title = title_search.group(1)

    # Group similar titles together
    rare_titles = {
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
    }
    if title in rare_titles:
        return 'Rare'

    # French honorifics (and 'Ms') are folded into their English counterparts.
    equivalents = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    return equivalents.get(title, title)

class FareBinTransformer(BaseEstimator, TransformerMixin):
    """Impute missing fares and add a categorical ``FareBin`` column.

    Expects a DataFrame with ``'Pclass'`` and ``'Fare'`` columns.

    Parameters:
        n_bins: Requested number of fare bins. Quantile binning may learn
            fewer bins because duplicate edges are dropped.
        strategy: ``'quantile'`` for equal-frequency bins; any other value
            selects equal-width bins.

    Attributes learned by ``fit``:
        class_medians_: dict mapping each ``Pclass`` value to its median fare.
        global_median_: median fare over all training rows, used when a row's
            class has no usable median.
        fare_bins_: array of bin edges.
        fare_bin_labels_: one label per learned bin.
    """

    # Friendly names used when at most four bins are learned; larger bin
    # counts fall back to generated 'Bin_<k>' labels.
    _NAMED_LABELS = ('Low', 'Medium', 'High', 'Very_High')

    def __init__(self, n_bins=4, strategy='quantile'):
        # Only constructor parameters are stored here. Fitted state (names
        # ending in '_') is created in fit() so "is fitted" checks stay honest.
        self.n_bins = n_bins
        self.strategy = strategy

    def _impute_fare(self, X):
        """Return X['Fare'] as float64 with gaps filled from fitted medians."""
        fare = X['Fare']
        by_class = X['Pclass'].map(self.class_medians_)
        filled = pd.Series(
            np.where(fare.isna(), by_class, fare),
            index=X.index,
            dtype='float64',
        )
        # Rows whose class was unseen (or had no fares) during fit.
        return filled.fillna(self.global_median_)

    def fit(self, X, y=None):
        """Learn class medians, the global median and the fare bin edges.

        Args:
            X: DataFrame with ``'Pclass'`` and ``'Fare'`` columns.
            y: Ignored; present for pipeline compatibility.

        Returns:
            self
        """
        # First, learn the statistics used for imputing missing fares
        self.class_medians_ = X.groupby('Pclass')['Fare'].median().to_dict()
        self.global_median_ = X['Fare'].median()

        fare = self._impute_fare(X)

        # Learn fare bins
        if self.strategy == 'quantile':
            # Quantile-based bins give roughly balanced groups
            _, edges = pd.qcut(fare, q=self.n_bins, retbins=True, duplicates='drop')
        else:
            # Equal-width bins
            _, edges = pd.cut(fare, bins=self.n_bins, retbins=True)
        self.fare_bins_ = edges

        # Labels must match the number of bins actually learned.
        n_learned = len(edges) - 1
        if n_learned <= len(self._NAMED_LABELS):
            self.fare_bin_labels_ = list(self._NAMED_LABELS[:n_learned])
        else:
            self.fare_bin_labels_ = [f'Bin_{k}' for k in range(1, n_learned + 1)]

        return self

    def transform(self, X):
        """Return a copy of X with imputed ``'Fare'`` and a new ``'FareBin'``.

        Fares below/above the range seen during ``fit`` are assigned to the
        lowest/highest bin; the ``'Fare'`` values themselves are not clipped.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(
            self,
            ['class_medians_', 'global_median_', 'fare_bins_', 'fare_bin_labels_'],
        )

        X_copy = X.copy()

        # Impute missing fares using the medians learned during fit
        X_copy['Fare'] = self._impute_fare(X_copy)

        # Clamp only for binning so unseen extremes land in an outer bin
        # instead of becoming NaN.
        clamped = X_copy['Fare'].clip(
            lower=self.fare_bins_[0],
            upper=self.fare_bins_[-1],
        )
        X_copy['FareBin'] = pd.cut(
            clamped,
            bins=self.fare_bins_,
            labels=self.fare_bin_labels_,
            include_lowest=True,
        )

        return X_copy

class TitleExtractorTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds a 'Title' column derived from 'Name'."""

    def fit(self, X, y=None):
        """Nothing is learned; return self so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of X with 'Title' computed by extract_title per row."""
        titles = X['Name'].apply(extract_title)
        enriched = X.copy()
        enriched['Title'] = titles
        return enriched

def extract_ticket_number(x):
    """Return the part of ticket string ``x`` after its last space.

    When ``x`` contains no space the whole string is returned.
    """
    return x.rpartition(' ')[2]


# Backward-compatible alias for the original, misspelled name.
extract_ticket_nuber = extract_ticket_number

def extract_ticket_item(x):
    """Return the ticket prefix (everything before the last space).

    Spaces inside the prefix are replaced by underscores. A ticket with no
    space at all has no prefix and yields the literal "NONE".
    """
    prefix, separator, _ = x.rpartition(" ")
    if not separator:
        return "NONE"
    return prefix.replace(" ", "_")

def retrieve_sanitised_data_frame(file_path) -> Tuple[pd.DataFrame, pd.Series]:
    """Load a Titanic CSV and split it into model features and target.

    Cells holding an empty string or one/two spaces are converted to NaN.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``(X, y)`` where ``X`` is the frame without 'PassengerId', 'Survived',
        'Cabin' (each dropped only if present) and 'Ticket', and ``y`` is the
        'Survived' column, or ``None`` when the file has no such column.

    Raises:
        KeyError: if the file has no 'Ticket' column.
    """
    blank_markers = ['', ' ', '  ']
    passengers = pd.read_csv(file_path).replace(blank_markers, np.nan)

    target = passengers.get('Survived', None)

    # Earlier experiments (numeric coercion, FamilySize/IsAlone, and the
    # TicketNumber/TicketItem features) are deliberately not applied here.

    optional_columns = ['PassengerId', 'Survived', 'Cabin']
    features = passengers.drop(columns=optional_columns, errors='ignore')
    features = features.drop(columns=['Ticket'])

    return features, target

# Load the training data: X holds the feature columns kept by the loader,
# y the 'Survived' target.
X, y = retrieve_sanitised_data_frame(file_path)
print(X.columns)

# Hold out a validation split. No test_size is given, so the library default
# applies; random_state fixes the shuffle so results are reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Custom feature-engineering steps (title extraction, fare binning).
# NOTE: this pipeline is built but currently unused - the
# 'feature_engineering' step in my_pipeline below is commented out.
feature_pipeline = Pipeline([
    ('title_extractor', TitleExtractorTransformer()),
    ('fare_binner', FareBinTransformer(n_bins=4, strategy='quantile')),
])

# Fit feature pipeline on training data to see resulting columns
# X_train_engineered = feature_pipeline.fit_transform(X_train)
# print(f"Columns after feature engineering: {list(X_train_engineered.columns)}")
# print(f"Data types: {X_train_engineered.dtypes}")

# Reusable imputers. NOTE: mean_imputer is not referenced by the preprocessor below.
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Per-column preprocessing. Only the columns listed here are handled
# explicitly; 'Name' is not listed (presumably dropped by ColumnTransformer's
# default remainder handling - confirm against the installed version).
preprocessor = ColumnTransformer(
    transformers=[
        # Missing ages -> median age of the data the pipeline is fitted on.
        ('median_imputer', median_imputer, ['Age']),
        # Missing sibling/spouse counts -> 0.
        ('zero_imputer', zero_imputer, ['SibSp']),
        ('categorical', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # There are some missing Embarked rows
             # Categories are listed explicitly (one list per column, in column order) so that
             # a cross-validation fold lacking a category still produces the same one-hot columns.
            ('encoder', OneHotEncoder(categories=[
                ['female', 'male'], # Sex
                ['C', 'Q', 'S'], # Embarked
                # ['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], # Title
                # ['Low', 'Medium', 'High', 'Very_High'] # FareBin
            ]))
        ]), ['Sex', 'Embarked',
             # 'Title', 'FareBin'
             ]),
        # Missing fares -> median fare.
        ('fare_imputer', SimpleImputer(strategy='median'), ['Fare']),
        # Forwarded unchanged.
        ('passthrough', 'passthrough', ['Pclass', 'Parch']),
    ])

# Gradient-boosted tree classifier. These hyperparameter values coincide with
# the best parameters reported by the grid-search cell further down.
xgbModel = XGBClassifier(
   learning_rate=0.05,
    max_depth=5,
    n_estimators=100,
    reg_alpha=0.1,
    reg_lambda=0.5,
    subsample=0.8,
    n_jobs=4, # Parallelisation - number of CPU cores to use (4 cores in this case)
    random_state=1
)
# Full pipeline: preprocessing followed by the model, so the imputers and
# encoder are fitted only on whatever data is passed to fit().
my_pipeline = Pipeline(steps=[
    # ('feature_engineering', feature_pipeline),
    ('preprocessor', preprocessor),
    ('model', xgbModel)
])

# Train on the training split only.
my_pipeline.fit(X_train, y_train)

# Evaluate on the held-out validation split.
preds = my_pipeline.predict(X_valid)

accuracy = accuracy_score(y_valid, preds)
print('Accuracy:', accuracy)

print('\nClassification Report:')
print(classification_report(y_valid, preds))

print('\nConfusion Matrix:')
print(confusion_matrix(y_valid, preds))

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Accuracy: 0.7937219730941704

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.95      0.84       128
           1       0.90      0.58      0.71        95

    accuracy                           0.79       223
   macro avg       0.83      0.77      0.77       223
weighted avg       0.82      0.79      0.78       223


Confusion Matrix:
[[122   6]
 [ 40  55]]


In [7]:
# Feature importance analysis: pair each preprocessed column name with the
# importance the fitted model assigned to it, most important first.
fitted_steps = my_pipeline.named_steps
feature_names = fitted_steps['preprocessor'].get_feature_names_out()
importances = fitted_steps['model'].feature_importances_

feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
)
feature_importance_df = feature_importance_df.sort_values(
    'importance', ascending=False
)

print('\n=== TOP 10 FEATURE IMPORTANCES ===')
print(feature_importance_df.head(10))


=== TOP 10 FEATURE IMPORTANCES ===
                   feature  importance
2  categorical__Sex_female    0.493289
7      passthrough__Pclass    0.129119
6  categorical__Embarked_S    0.102587
1      zero_imputer__SibSp    0.065006
0      median_imputer__Age    0.061901
4  categorical__Embarked_C    0.060091
5  categorical__Embarked_Q    0.056258
8       passthrough__Parch    0.031750
3    categorical__Sex_male    0.000000


In [43]:
# FIXED HYPERPARAMETER TUNING - PRESERVING YOUR SPLIT
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import clone

def print_best_params_as_code(grid_search, model_name="best_model", *,
                              estimator_name="XGBClassifier",
                              step_prefix="model__"):
    """Print the best parameters as copy-pastable Python code.

    Args:
        grid_search: Object exposing a ``best_params_`` mapping of parameter
            name to value (e.g. a fitted ``GridSearchCV``).
        model_name: Variable name used on the left-hand side of the output.
        estimator_name: Constructor name printed on the right-hand side.
        step_prefix: Pipeline-step prefix identifying the model's parameters.
            Only keys starting with it are printed, with the prefix removed.

    Returns:
        None. The code snippet is written to standard output.
    """
    print("\n# Best parameters found:")
    print(f"{model_name} = {estimator_name}(")

    for key, value in grid_search.best_params_.items():
        if not key.startswith(step_prefix):
            continue
        # Strip the leading prefix only; str.replace would also remove later
        # occurrences and corrupt nested names such as 'model__sub_model__x'.
        clean_key = key[len(step_prefix):]
        # repr() quotes and escapes strings so the output stays valid Python.
        rendered = repr(value) if isinstance(value, str) else value
        print(f"    {clean_key}={rendered},")

    print(")")

print('\n=== HYPERPARAMETER TUNING ===')

# Search space; the 'model__' prefix routes each entry to the pipeline's
# 'model' step.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[3, 4, 5],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__subsample=[0.8, 0.9],
    model__reg_alpha=[0, 0.1],
    model__reg_lambda=[0.5, 1.0],
)

# Five shuffled, stratified folds with a fixed seed.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search over a clone so the already-fitted my_pipeline is left untouched.
grid_search = GridSearchCV(
    estimator=clone(my_pipeline),
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_strategy,  # folds are carved out of the training split only
    n_jobs=-1,
    verbose=1,
)

print("Starting grid search...")

# Fit on the training split alone; the validation split stays unseen.
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print_best_params_as_code(grid_search)

# The validation split is for monitoring only - do not tune against it.
best_preds = grid_search.predict(X_valid)
best_accuracy = accuracy_score(y_valid, best_preds)
best_f1 = f1_score(y_valid, best_preds, average='macro')

print('\n=== BEST MODEL PERFORMANCE ===')
print(f'Best CV Score: {grid_search.best_score_:.4f}')
print(f'Validation Accuracy: {best_accuracy:.4f}')
print(f'Validation F1: {best_f1:.4f}')

# A large difference between CV and validation accuracy is flagged below.
cv_val_gap = abs(grid_search.best_score_ - best_accuracy)
print(f'CV-Validation Gap: {cv_val_gap:.4f}')

print(
    "⚠️  WARNING: Large gap suggests overfitting!"
    if cv_val_gap > 0.05
    else "✅ Good generalization"
)


=== HYPERPARAMETER TUNING ===
Starting grid search...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__reg_alpha': 0.1, 'model__reg_lambda': 0.5, 'model__subsample': 0.8}

# Best parameters found:
best_model = XGBClassifier(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=100,
    reg_alpha=0.1,
    reg_lambda=0.5,
    subsample=0.8,
)

=== BEST MODEL PERFORMANCE ===
Best CV Score: 0.8399
Validation Accuracy: 0.7937
Validation F1: 0.7733
CV-Validation Gap: 0.0462
✅ Good generalization


In [45]:
# Build the Kaggle submission file.

# Refit the pipeline on every labelled row before predicting.
my_pipeline.fit(X, y)

test_data_file_path = './data/titanic/test.csv'
X_test, _ = retrieve_sanitised_data_frame(test_data_file_path)
predictions_on_test_data = my_pipeline.predict(X_test)

# retrieve_sanitised_data_frame() drops PassengerId, so the raw file is read
# again to recover the ids for the submission.
test_data_raw = pd.read_csv(test_data_file_path)
submission_columns = {
    'PassengerId': test_data_raw['PassengerId'],
    'Survived': predictions_on_test_data,
}
output = pd.DataFrame(submission_columns)
output.to_csv('./data/titanic/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


# Steps

- [x] Load data
- [x] Select columns we want
- [x] Temporarily exclude cabin and ticket
- [ ] Impute age
    - [x] Mean
    - [ ] Smarter - can we work this out from name? Ticket price? Location?
- [x] One-hot encode sex
- [ ] Get smart with ticket and cabin
    - [x] Cabin tuning <-- Discovered it's better to just drop the cabin, as it's very sparsely populated in the test set
- [x] Create decision tree
    - [x] Tune decision tree
- [x] XGBoost
- [x] Try cross fold validation <-- Not doing this as we already have a test set