# The Titanic Challenge

Current Score: 0.77990
Rank: 3218/13637 - top 25% (23.6%)

In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Kaggle locations of the two input CSVs and of the submission file.
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'
output_path = '/kaggle/working/submission.csv'

# Read the training split first, then the test split.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Keep the test PassengerId column aside: it is needed for the submission
# file and gets dropped from the frame during feature engineering.
test_pass_id = test_data['PassengerId']

In [None]:
# Rare honorifics collapsed into a single 'noble' category, plus spelling
# variants mapped onto a canonical title.
_TITLE_MAP = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'noble',
    ),
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    'Mme': 'Mrs',
}
# A title is the first run of letters preceded by a space and followed by a dot.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+)\.')
# The cabin label is the first letter found in a cabin token.
_CABIN_LABEL_PATTERN = re.compile(r'([A-Za-z])')
# Number of equal-width fare bins.
_FARE_LEVELS = 10
# Right-closed age bins: kids, young, mature, old, very old (codes 0..4).
_AGE_BIN_EDGES = [-np.inf, 13, 30, 45, 60, 100]


def _extract_title(name):
    """Return the title found in a passenger name, or '' when there is none."""
    match = _TITLE_PATTERN.search(name)
    return match.group(1) if match else ""


def _last_cabin_label(cabin):
    """Return the letter of the last cabin listed, or 'N' when it is unknown."""
    if pd.isnull(cabin):
        return 'N'
    tokens = str(cabin).split()
    if not tokens:
        # Whitespace-only value: treat it like a missing cabin.
        return 'N'
    match = _CABIN_LABEL_PATTERN.search(tokens[-1])
    # A token without any letter is treated as unknown instead of crashing.
    return match.group(1) if match else 'N'


def _count_cabins(cabin):
    """Return how many whitespace-separated cabins are listed (0 if missing)."""
    return len(str(cabin).split()) if pd.notnull(cabin) else 0


def _add_features(dataset):
    """Add the engineered columns to one frame and drop the raw ones, in place."""
    # Family size = siblings/spouses + parents/children + the passenger.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Missing 'Embarked' is filled with 'S' (Encyclopedia Titanica), then the
    # port is mapped to Embarked_code {C: 1, Q: 2, S: 3}.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Embarked_code'] = dataset['Embarked'].map({'C': 1, 'Q': 2, 'S': 3}).astype(int)

    # Companions = number of *other* rows in this frame sharing the same
    # ticket (friends, maids, nannies, ...); 0 for a unique ticket.
    dataset['Companions'] = dataset.groupby('Ticket')['Ticket'].transform('count') - 1

    # Title extracted from the name, with rare/variant titles normalised.
    dataset['Title'] = dataset['Name'].apply(_extract_title).replace(_TITLE_MAP)

    # Fare split into equal-width levels labelled 1.._FARE_LEVELS.
    dataset['Fare_level'] = pd.cut(
        dataset['Fare'], _FARE_LEVELS, labels=np.arange(1, _FARE_LEVELS + 1))

    # Letter of the last cabin listed and number of cabins reserved.
    dataset['Cabin_label'] = dataset['Cabin'].apply(_last_cabin_label)
    dataset['Cabin_count'] = dataset['Cabin'].apply(_count_cabins)

    # Missing ages are replaced by this frame's mean age. Assigning the result
    # back (rather than a chained in-place fillna) keeps this working under
    # pandas Copy-on-Write.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
    # Age_group codes 0..4 as floats; ages above 100 (or still missing) are NaN.
    dataset['Age_group'] = pd.cut(
        dataset['Age'], bins=_AGE_BIN_EDGES, labels=False).astype(float)

    # Drop the raw columns that are replaced by the engineered features.
    dataset.drop(
        columns=['PassengerId', 'Age', 'Embarked', 'Ticket', 'Name', 'Cabin'],
        inplace=True)


def feature_engineering(_train_data, _test_data):
    """Engineer model features on the train and test frames, in place.

    Each frame gains the columns FamilySize, Embarked_code, Companions, Title,
    Fare_level, Cabin_label, Cabin_count and Age_group, and loses PassengerId,
    Age, Embarked, Ticket, Name and Cabin.

    Parameters:
        _train_data: training DataFrame; modified in place.
        _test_data: test DataFrame; modified in place.

    Returns:
        The same two DataFrame objects, as a ``(train, test)`` tuple.

    Raises:
        KeyError: if a required raw column is missing, e.g. when the function
            is called a second time on already-processed frames.

    NOTE(review): the age mean, the fare bin edges and the ticket counts are
    computed separately for each frame, so the same Fare_level may cover a
    different fare range in train and test when their fare ranges differ.
    """
    for dataset in (_train_data, _test_data):
        _add_features(dataset)

    return _train_data, _test_data

In [None]:
# Replace both raw frames with their feature-engineered versions.
train_data, test_data = feature_engineering(
    _train_data=train_data,
    _test_data=test_data,
)

In [None]:
# Target column and feature matrix taken from the engineered training frame.
y = train_data['Survived']
X = train_data.drop(columns='Survived')

# Hold out 20% of the rows (shuffled, fixed seed) for a quick accuracy check.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Partition the feature columns by dtype: numeric columns vs. all the others.
numerical_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(exclude=[np.number]).columns)

# Numeric columns: KNN imputation (5 neighbours) followed by standardisation.
numerical_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

# Other columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Random-forest hyper-parameters used for the final model.
best_params = dict(
    n_estimators=315,
    max_depth=14,
    min_samples_split=12,
    min_samples_leaf=2,
    max_features='sqrt',
    criterion='gini',
    bootstrap=True,
    class_weight=None,
)

# Seeded forest using all available cores, chained after the preprocessing.
estimator = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
pipeline = make_pipeline(preprocessor, estimator)

# Fit preprocessing and model on the training part of the hold-out split.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out 20% of the training data.
y_pred = pipeline.predict(X_test)
model_score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Hold-out accuracy, as a percentage.
print('Model Accuracy: {:.2f}%'.format(model_score * 100))

# 5-fold stratified cross-validation (shuffled, fixed seed) over the full
# training set, using the same pipeline definition.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
)

# Mean accuracy and its standard deviation across folds, as percentages.
print(
    'Cross-Validation Accuracy: {:.2f}% (+/- {:.2f}%)'.format(
        cv_scores.mean() * 100,
        cv_scores.std() * 100,
    )
)

In [None]:
# Predict survival for the Kaggle test passengers.
# NOTE(review): `pipeline` appears to still be the model fitted on X_train
# only (80% of the training rows) - consider refitting on X, y before this
# step; confirm whether that is intended.
y_rest_predict = pipeline.predict(test_data)

# PassengerId values saved before feature engineering dropped the column.
pass_id = test_pass_id

In [None]:
# Assemble the two-column submission table and write it out as CSV
# (without the index column).
output = pd.DataFrame(
    data={
        'PassengerId': pass_id,
        'Survived': y_rest_predict,
    }
)
output.to_csv(output_path, index=False)
print("Your submission was successfully saved!")