# Titanic Kaggle competition
https://www.kaggle.com/competitions/titanic/overview

In [93]:
from xgboost import XGBClassifier
from typing import Tuple
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_splitfd, train_test_split
import pandas as pd

# Path to the Kaggle training CSV (the file that contains the 'Survived'
# target column), relative to the notebook's working directory.
file_path = './data/titanic/train.csv'

def retrieve_sanitised_data_frame(file_path) -> "Tuple[pd.DataFrame, pd.Series | None]":
    """Load a Titanic CSV and split it into model features and the target.

    Cells that contain only '', ' ' or '  ' are converted to NaN so that the
    downstream imputers see them as missing values.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(X, y)`` tuple. ``X`` is the loaded data without the ``Name``,
        ``PassengerId``, ``Survived``, ``Cabin`` and ``Ticket`` columns.
        ``y`` is the ``Survived`` column, or ``None`` when the file has no
        such column (e.g. the competition test data).
    """
    titanic_passenger_data = pd.read_csv(file_path).replace(['', ' ', '  '], np.nan)

    # DataFrame.get returns None when the column is absent (test data).
    y = titanic_passenger_data.get('Survived')

    # Drop everything in one pass. errors='ignore' makes every column
    # optional, so a file lacking 'Survived' or 'Ticket' no longer raises.
    X = titanic_passenger_data.drop(
        columns=[
            'Name',         # Names probs not useful
            'PassengerId',
            'Survived',
            'Cabin',
            'Ticket',       # TODO: This is temporary just to get up and running
        ],
        errors='ignore',
    )

    return X, y

# Load the labelled training data and hold part of it out for validation
# (train_test_split uses its default split size; random_state fixes the split).
X, y = retrieve_sanitised_data_frame(file_path)
print(X.columns)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

mean_imputer = SimpleImputer(strategy='mean') # Temporary
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Column names must match the DataFrame exactly: the class column is
# 'Pclass' (see the printed Index), not 'PClass'.
# handle_unknown='ignore' encodes a category that was not seen during fit
# (e.g. a missing 'Embarked' that only occurs in the validation/test rows)
# as all zeros instead of raising at predict time.
# NOTE: columns not listed here ('Parch', 'Fare') are dropped and therefore
# not seen by the model; remainder='drop' just makes that default explicit.
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', mean_imputer, ['Age']),
        ('zero_imputer', zero_imputer, ['SibSp']),
        ('one_hot_incoder', OneHotEncoder(handle_unknown='ignore'), ['Sex', 'Embarked', 'Pclass']),
    ],
    remainder='drop',
)

# Alternative estimator; defined but not used in the pipeline below.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

xgbModel = XGBClassifier(
  n_estimators=300, # Number of boosted trees (boosting rounds)
  learning_rate=0.01, # Step size shrinkage used in updates to prevent overfitting
  n_jobs=4, # Parallelisation - number of threads to use (4 in this case)
)

# Preprocessing and model are bundled so the same transformations are applied
# at fit and predict time.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgbModel)
                             ])

my_pipeline.fit(X_train, y_train)

# Evaluate on the held-out validation rows.
preds = my_pipeline.predict(X_valid)

accuracy = accuracy_score(y_valid, preds)
print('Accuracy:', accuracy)

print('\nClassification Report:')
print(classification_report(y_valid, preds))

print('\nConfusion Matrix:')
print(confusion_matrix(y_valid, preds))

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
Accuracy: 0.7847533632286996

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       128
           1       0.81      0.65      0.72        95

    accuracy                           0.78       223
   macro avg       0.79      0.77      0.77       223
weighted avg       0.79      0.78      0.78       223


Confusion Matrix:
[[113  15]
 [ 33  62]]


In [94]:
# Score the competition test file with the fitted pipeline and write the
# predictions out in Kaggle's submission format.
test_data_file_path = './data/titanic/test.csv'
test_data = pd.read_csv(test_data_file_path)
# Same blank-cell handling as for the training data.
test_data = test_data.replace(['', ' ', '  '], np.nan)

# The full frame is passed in; the pipeline's preprocessor selects the
# columns it needs by name.
predictions_on_test_data = my_pipeline.predict(test_data)

# One row per passenger: id plus predicted survival flag.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions_on_test_data,
    }
)
output.to_csv('./data/titanic/submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!


# Steps

- [x] Load data
- [x] Select columns we want
- [x] Temporarily exclude cabin and ticket
- [ ] Impute age
    - [x] Mean
    - [ ] Smarter - can we work this out from the name? Ticket price? Location?
- [x] One-hot encode sex
- [ ] Get smart with ticket and cabin
- [x] Create decision tree
    - [ ] Tune decision tree
- [x] XGBoost
- [x] Try k-fold cross-validation <-- Not doing this, as we already have a test set