In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib

# NEW IMPORTS
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer, AddMissingIndicator
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder

## Prepare the data set

In [None]:
# load the data - it is available open source and online
# The path is relative to the notebook's working directory and uses forward
# slashes, which Windows, Linux and macOS all accept.
data = pd.read_csv('data/raw/train.csv')

# display data
data.head()

In [None]:
# swap every '?' placeholder for a proper NaN value
data = data.replace(to_replace='?', value=np.nan)

In [None]:
# retain only the first cabin if more than 1 are available per passenger
def get_first_cabin(row: str) -> str:
    """
    Return the first cabin listed in a whitespace-separated cabin string.

    Parameters
    ----------
    row : str
        Raw cabin value, e.g. ``'C23 C25 C27'``. Missing values (NaN / None)
        are accepted as well.

    Returns
    -------
    str or float
        The first cabin token, or ``np.nan`` when the value is missing,
        is not text, or contains no token at all.
    """
    # Missing cabins are not strings (NaN is a float), so there is nothing to split
    if not isinstance(row, str):
        return np.nan

    cabins = row.split()
    # An empty or blank string yields no token: treat it as missing too
    return cabins[0] if cabins else np.nan
    
# overwrite 'cabin' with the single cabin kept by get_first_cabin
data['cabin'] = data['cabin'].apply(func=get_first_cabin)

In [None]:
# extracts the title (Mr, Ms, etc) from the name variable
def get_title(passenger: str) -> str:
    """
    Extract the courtesy title from a passenger name.

    Parameters
    ----------
    passenger : str
        Full passenger name, e.g. ``'Braund, Mr. Owen Harris'``.

    Returns
    -------
    str
        One of ``'Mrs'``, ``'Mr'``, ``'Miss'``, ``'Master'``; ``'Other'`` when
        none of these titles appears as a whole word or when the name is
        missing (not a string).
    """
    # A missing name (NaN / None) carries no title
    if not isinstance(passenger, str):
        return 'Other'

    # Titles are matched as whole words so that surnames which merely contain
    # the letters (e.g. 'Masterson') are not mistaken for a title.
    # 'Mrs' is still tested first, so it wins when a name holds several titles.
    for title in ('Mrs', 'Mr', 'Miss', 'Master'):
        if re.search(r'\b' + title + r'\b', passenger):
            return title
    return 'Other'
    
# derive the new 'title' column from each passenger's name
data['title'] = data['name'].apply(func=get_title)

In [None]:
# cast numerical variables as floats
data['fare'] = data['fare'].astype('float')
data['age'] = data['age'].astype('float')

# 'pclass' is used as a categorical variable by the modelling pipeline.
# feature_engine's categorical transformers only accept object/category
# columns by default, so it is stored as object (a no-op if it already is).
data['pclass'] = data['pclass'].astype('object')

In [None]:
# drop unnecessary variables
# (reassignment instead of inplace=True: the frame is rebuilt rather than mutated)
data = data.drop(columns=['name', 'ticket', 'boat', 'body', 'home.dest'])

# display data
data.head()

## Separate data into train and test

In [None]:
# Hold out part of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='survived'),  # predictors: every column except the target
    data['survived'],               # target
    random_state=0,                 # reproducibility
    test_size=0.2,                  # 20% test set
)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

## Pipeline

In [None]:
class ExtractLetterTransformer:
    """
    Transformer that keeps only the first character of a text column
    (e.g. 'cabin').

    It exposes ``fit`` and ``transform`` so it can be used as an intermediate
    step of a scikit-learn ``Pipeline``.

    NOTE(review): the class does not derive from ``sklearn.base.BaseEstimator``
    and has no ``get_params``; utilities that clone estimators (grid search,
    cross-validation) will presumably reject it - consider inheriting from
    ``BaseEstimator``/``TransformerMixin`` if those are needed.

    Parameters
    ----------
    variable : str
        Name of the DataFrame column to reduce to its first character.
    """
    def __init__(self, variable: str):
        self.variable = variable

    def fit(self, X, y=None):
        """Learn nothing from the data; present for the transformer protocol.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``variable`` cut to its first character.

        Missing values, empty strings and non-text values are left untouched.
        When ``variable`` is not a column of ``X`` the copy is returned as is.
        """
        X = X.copy()  # never modify the caller's frame
        if self.variable in X.columns:
            X[self.variable] = X[self.variable].apply(self._first_letter)
        return X

    @staticmethod
    def _first_letter(value):
        """Return the first character of a non-empty string, else the value itself."""
        # Indexing is only safe on non-empty text; NaN, '' and numbers pass through
        if isinstance(value, str) and value:
            return value[0]
        return value

In [None]:
# Columns treated as categories / as numbers by the pipeline steps below
CATEGORICAL_VARIABLES = ['pclass', 'sex', 'embarked', 'title', 'cabin']
NUMERICAL_VARIABLES = ['age', 'fare', 'sibsp', 'parch']

# NOTE(review): feature_engine's categorical steps expect object/category
# columns unless ignore_format=True - confirm 'pclass' is not numeric when it
# reaches the pipeline.
titanic_pipe = Pipeline([
    # Fill missing categories with the label 'missing'. 'cabin' is included
    # because the rare-label and one-hot encoders further down raise an error
    # on NaN by default.
    ('categorical_imputation', CategoricalImputer(variables=CATEGORICAL_VARIABLES, fill_value='missing')),
    # Flag numerical values that were missing before they get imputed
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARIABLES)),
    ('median_imputation', MeanMedianImputer(imputation_method='median', variables=NUMERICAL_VARIABLES)),
    # Keep only the first character of the cabin (imputed cabins become 'm')
    ('extract_letter', ExtractLetterTransformer(variable='cabin')),
    # Group categories seen in fewer than 5% of the rows
    ('rare_label_encoder', RareLabelEncoder(tol=0.05, n_categories=1, variables=CATEGORICAL_VARIABLES)),
    ('categorical_encoder', OneHotEncoder(drop_last=True, variables=CATEGORICAL_VARIABLES)),
    ('scaler', StandardScaler()),
    ('Logit', LogisticRegression(C=0.0005, random_state=0))
])

In [None]:
# fit every pipeline step on the training split only
titanic_pipe.fit(X=X_train, y=y_train)

In [None]:
# Scores come from the second column of predict_proba; hard labels from predict.
# Training split
train_pred = titanic_pipe.predict_proba(X_train)[:, 1]
train_class = titanic_pipe.predict(X_train)
print('Train ROC-AUC:', roc_auc_score(y_true=y_train, y_score=train_pred))
print('Train Accuracy:', accuracy_score(y_true=y_train, y_pred=train_class))

# Held-out split
test_pred = titanic_pipe.predict_proba(X_test)[:, 1]
test_class = titanic_pipe.predict(X_test)
print('Test ROC-AUC:', roc_auc_score(y_true=y_test, y_score=test_pred))
print('Test Accuracy:', accuracy_score(y_true=y_test, y_pred=test_class))