# The Titanic Challenge

You should submit a csv file with exactly 418 entries plus a header row.

Current Score: 0.77751
Rank: 3909/14441 (Top 27%)

In [None]:
# TODO: List of improvements
# 1. Mutual Importance of Features
# 2. Data Scaling
# 3. Using another model
# 4. Using PCA ?
# 5. Use DNN 
# 6. Use Double-Stage Model

In [1]:
# import libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold

In [2]:
# load data 
train = pd.read_csv('data/titanic_train.csv') # training data
test = pd.read_csv('data/titanic_test.csv') # test data

In [3]:
# Feature Engineering

def split_ticket(ticket: str) -> pd.Series:
    """ Split Ticket with Destination and Ticket number """
    result = ['U', ticket]
    if ' ' in ticket:
        # sometimes we have 2 spaces, we need split only by second space
        if ticket.count(' ') > 1:
            result = ticket.rsplit(' ', 1)
        else:
            result = ticket.split(' ')
    result[0] = result[0][0] # get first letter of Destination

    return pd.Series(result)

def feature_engineering(df):
    # Convert Cabin to boolean
    df['Cabin']  = df['Cabin'].notna()
    
    # Split Ticket with Destination and Ticket number
    df[['Destination', 'TicketNumber']] = df['Ticket'].apply(split_ticket)

    # Transform Cabin into Boolean
    df['Cabin'] = df['Cabin'].notna()

    # Cut Family Size into groups
    family_group = ['Alone', 'Small', 'Middle', 'Big']
    df['FamilySize'] = pd.cut(df['SibSp'] + df['Parch'] + 1, # Calculate Family Size
                                     [0, 1, 4, 7, 11], # Define Family Size Groups
                                     labels=family_group) # Assign Family Size Groups

feature_engineering(train)
feature_engineering(test)

In [4]:
# split data
X_train, X_test, y_train, y_test = train_test_split(train.drop(['Survived', 'Name'], axis=1),
                                                    train['Survived'],
                                                    test_size=0.2,
                                                    random_state=42)

In [5]:

# preprocessing data
imp_cols = ['Age']                                               # imputer columns
ode_cols = ['FamilySize', 'Destination']                                        # ordinal columns
ohe_cols = ['Sex', 'Embarked', 'Pclass'] # one hot encoded columns
useless_col = ['Ticket', 'SibSp', 'Parch', 'Cabin']              # useless columns
pass_through = ['Pclass', 'Cabin', 'Age', 'Fare']              # pass through columns

imp_pipeline = Pipeline([                                        # This pipeline will fill missing numerical values with most frequent value
    ('imputer', SimpleImputer(strategy='mean'))
])

ordinal_pipeline = Pipeline([                                    # This pipeline will fill missing values with most frequent value and encode ordinal columns
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

ohe_pipeline = Pipeline([                                        # This pipeline will fill missing values with most frequent value and encode one hot encoded columns
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

col_transformer = ColumnTransformer(transformers=
    [
        ('imp_pipeline',imp_pipeline, imp_cols),
        ('ord_pipeline', ordinal_pipeline, ode_cols),
        ('ohe_pipeline', ohe_pipeline, ohe_cols),
        ('passthrough', 'passthrough', pass_through)
    ],
    remainder='drop', n_jobs=-1
)

In [6]:
import traceback


# Tuning the model with best params and features by RandomizedSearchCV
params_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
cv_rfc = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=params_grid,
    cv=StratifiedKFold(n_splits=5),
    n_iter=10,
    n_jobs=-1,
    verbose=1
    
)

pipeline = make_pipeline(col_transformer, cv_rfc)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# # Get the best params and features
best_params = cv_rfc.best_params_
best_model = cv_rfc.best_estimator_
model_score = cv_rfc.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [7]:
# Show Report
print('Model Score: ', model_score)
print('Accuracy Score:', accuracy_score(y_test, y_pred)) # best score 0.81 % 

Model Score:  0.8272333300502315
Accuracy Score: 0.8324022346368715


In [8]:
# Save Submission
y = test.drop(['Name'], axis=1)
predictions = pipeline.predict(y)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('data/titanic_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
