# Test Titanic Survival Pipeline

In [404]:
## Load test data
# pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl').to_csv("test_data.csv", header=True, index=False)

## Libraries

In [405]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# ========== NEW IMPORTS ========
# Relative to notebook 02-Predicting-Survival-Titanic-Solution

# pipeline
from sklearn.pipeline import Pipeline

# for the preprocessors
from sklearn.base import BaseEstimator, TransformerMixin

# for imputation
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer)

# for encoding categorical variables
from feature_engine.encoding import (
    RareLabelEncoder,
    OneHotEncoder
)

from joblib import load

class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
    """Replace each configured string column by its first character.

    Parameters
    ----------
    variables : list
        Names of the columns to shorten. Anything that is not a list is
        rejected with a ValueError.
    """

    def __init__(self, variables):
        if not isinstance(variables, list):
            raise ValueError("Parameter 'variables' has to be a list")

        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X in which every listed column holds only `.str[0]`."""
        result = X.copy()

        for column in self.variables:
            first_chars = result[column].str[0]
            result[column] = first_chars

        return result

## Load model

In [406]:
# NOTE: joblib.load unpickles arbitrary Python objects, so only load
# artifacts that come from a trusted source.
# The pipeline reprs shown further down include an ExtractLetterTransformer
# step, so that class presumably has to be defined in this session before
# these calls can succeed.
titanic_pipe = load("titanic_survival_pipeline.joblib")
model = load("titanic_survival_model.joblib")

## Load test data from csv

In [407]:
test_data = pd.read_csv("test_data.csv")

In [408]:
test_data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",x,29,0,0,24160,211.3375,B5,S,2.0,?,"St Louis, MO"
1,1,1,"Allen, Miss. Elisabeth Walton",,-1,0,0,24160,240.0,-120,,,,


## Prepare the dataset

In [409]:
# replace question marks ('?') with NaN values

data = test_data.replace('?', np.nan)

In [410]:
# retain only the first cabin if more than
# 1 are available per passenger

def get_first_cabin(row):
    """Return the first whitespace-separated cabin token of `row`.

    Parameters
    ----------
    row : str or missing value
        Raw cabin entry, possibly listing several cabins (e.g. "C23 C25").

    Returns
    -------
    str or float
        The first token, or np.nan when `row` has no `.split` method
        (NaN, None, numbers) or contains no tokens at all.
    """
    try:
        return row.split()[0]
    except (AttributeError, IndexError):
        # AttributeError: row is not string-like (NaN, None, numeric value).
        # IndexError: row is an empty or whitespace-only string.
        return np.nan
    
data['cabin'] = data['cabin'].apply(get_first_cabin)

In [411]:
# extracts the title (Mr, Ms, etc) from the name variable

def get_title(passenger):
    """Map a passenger name to one of 'Mrs', 'Mr', 'Miss', 'Master' or 'Other'.

    The name is searched for each title as a plain substring (anywhere in
    the string), and the first title found wins.
    """
    # Order matters: 'Mrs' has to be tried before 'Mr', which it contains.
    for title in ('Mrs', 'Mr', 'Miss', 'Master'):
        if re.search(title, passenger):
            return title
    return 'Other'
    
data['title'] = data['name'].apply(get_title)

In [412]:
# cast numerical variables as floats

data = data.astype({'fare': 'float', 'age': 'float'})

In [413]:
# remove the columns that are not needed downstream

data = data.drop(columns=['name', 'ticket', 'boat', 'body', 'home.dest'])

# show the first rows of the prepared frame
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title
0,1,1,x,29.0,0,0,211.3375,B5,S,Miss
1,1,1,,-1.0,0,0,240.0,-120,,Miss


## Score test data

In [414]:
model

Pipeline(steps=[('categorical_imputation',
                 CategoricalImputer(fill_value='missing',
                                    variables=['sex', 'cabin', 'embarked',
                                               'title'])),
                ('missing_indicator',
                 AddMissingIndicator(missing_only=False,
                                     variables=['pclass', 'age', 'sibsp',
                                                'parch', 'fare'])),
                ('median_imputation',
                 MeanMedianImputer(variables=['pclass', 'age', 'sibsp', 'parch',
                                              'fare'])),
                ('extract_letter',
                 ExtractLetterTransformer(variables=['cabin'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1,
                                  variables=['sex', 'cabin', 'embarked',
                                             'title'])),
                ('categorical_enc

In [415]:
# separate the target column from the feature columns
y_test = data["survived"]
X_test = data.drop(columns=["survived"])

In [416]:
X_test, y_test

(   pclass  sex   age  sibsp  parch      fare cabin embarked title
 0       1    x  29.0      0      0  211.3375    B5        S  Miss
 1       1  NaN  -1.0      0      0  240.0000  -120      NaN  Miss,
 0    1
 1    1
 Name: survived, dtype: int64)

In [417]:
# make predictions for test set
class_ = model.predict(X_test)
pred = model.predict_proba(X_test)[:,1]

# report roc-auc and accuracy on the test set
# roc-auc needs both classes to be present in y_test, so it is skipped
# for single-class samples such as the two rows loaded here
if y_test.nunique() > 1:
    print('test roc-auc: {}'.format(roc_auc_score(y_test, pred)))
print('test accuracy: {}'.format(accuracy_score(y_test, class_)))
print()

test accuracy: 1.0



# Test model

In [418]:
model.predict(X_test)

array([1, 1], dtype=int64)

In [427]:
titanic_pipe[:-3]

Pipeline(steps=[('categorical_imputation',
                 CategoricalImputer(fill_value='missing',
                                    variables=['sex', 'cabin', 'embarked',
                                               'title'])),
                ('missing_indicator',
                 AddMissingIndicator(missing_only=False,
                                     variables=['pclass', 'age', 'sibsp',
                                                'parch', 'fare'])),
                ('median_imputation',
                 MeanMedianImputer(variables=['pclass', 'age', 'sibsp', 'parch',
                                              'fare'])),
                ('extract_letter',
                 ExtractLetterTransformer(variables=['cabin'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1,
                                  variables=['sex', 'cabin', 'embarked',
                                             'title']))])

In [426]:
titanic_pipe[:-3].transform(X_test)

Unnamed: 0,pclass,age,sibsp,parch,fare,pclass_na,age_na,sibsp_na,parch_na,fare_na,sex_male,cabin_m,cabin_Rare,embarked_S,embarked_C,embarked_Q,title_Mr,title_Miss,title_Mrs
0,1,29.0,0,0,211.3375,0,0,0,0,0,0,0,1,1,0,0,0,1,0
1,1,-1.0,0,0,240.0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
