## Titanic Pipeline

In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [4]:
# --- file locations (relative to the notebook) ---
data_dir = "../data/"
data_raw = "raw/phpMYEkMl.csv"
data_clean = "clean/titanic.csv"

# --- column roles ---
# columns removed right after loading
drop_cols = [
    "name",
    "sibsp",
    "parch",
    "ticket",
    "cabin",
    "boat",
    "body",
    "home.dest",
]
target = "survived"
features = [
    "pclass",
    "sex",
    "age",
    "fare",
    "embarked",
]

numerical = ["pclass", "age", "fare"]
categorical = ["sex", "embarked"]
# columns that get missing-value imputation (numeric / categorical)
numerical_nan = ["age", "fare"]
cetegorical_nan = ["embarked"]
# columns cast to integer at the end of preprocessing
numerical_new = ["age", "sex", "embarked"]

# --- reproducibility / split settings ---
seed_split = 43
test_size = 0.20
seed_model = 44

In [6]:
def data_load():
    """Read the raw CSV and return it ready for the pipeline.

    The file at ``data_dir + data_raw`` is loaded, every ``"?"`` cell is
    turned into ``NaN``, and the columns listed in ``drop_cols`` are removed.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the dropped columns.
    """
    raw = pd.read_csv(data_dir + data_raw)

    # "?" is the missing-value marker in the raw file
    with_nans = raw.replace("?", np.nan)

    return with_nans.drop(columns=drop_cols)

In [7]:
class NumericalSex(BaseEstimator, TransformerMixin):
    """Encode the ``sex`` column as an integer flag.

    ``"female"`` becomes 0; every other value (including missing ones)
    becomes 1.
    """

    def __init__(self):
        self.variable = "sex"

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the column replaced by its 0/1 code."""
        encoded = np.where(X[self.variable] == "female", 0, 1)
        result = X.copy()
        result[self.variable] = encoded
        return result

In [8]:
class NumericalFloat(BaseEstimator, TransformerMixin):
    """Cast the selected columns to ``float``.

    Parameters
    ----------
    variables : str, list of str, or None
        Column name(s) to cast. A single name is treated as a one-element
        list; ``None`` selects no columns, making the transformer a no-op.
    """

    def __init__(self, variables=None):
        # Stored untouched: scikit-learn's get_params / set_params expect
        # __init__ to only record its arguments. Normalisation happens at use
        # time so values set through set_params are handled the same way.
        self.variables = variables

    def _variable_list(self):
        """Return ``self.variables`` normalised to a list of column names."""
        if self.variables is None:
            return []
        if isinstance(self.variables, list):
            return self.variables
        return [self.variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns cast to float."""
        X = X.copy()
        for var in self._variable_list():
            X[var] = X[var].astype("float")
        return X

In [19]:
class NumericalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in numeric columns with the training median.

    Parameters
    ----------
    variables : str, list of str, or None
        Column name(s) to impute. A single name is treated as a one-element
        list; ``None`` selects no columns, making the transformer a no-op.

    After ``fit`` the learned medians are available in ``median_dict``
    (column name -> median).
    """

    def __init__(self, variables=None):
        # Stored untouched: scikit-learn's get_params / set_params expect
        # __init__ to only record its arguments. Normalisation happens at use
        # time so values set through set_params are handled the same way.
        self.variables = variables

    def _variable_list(self):
        """Return ``self.variables`` normalised to a list of column names."""
        if self.variables is None:
            return []
        if isinstance(self.variables, list):
            return self.variables
        return [self.variables]

    def fit(self, X, y=None):
        """Learn the median of every selected column of ``X``."""
        self.median_dict = {
            var: X[var].median() for var in self._variable_list()
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the medians."""
        X = X.copy()
        for var in self._variable_list():
            X[var] = X[var].fillna(self.median_dict[var])
        return X

In [10]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in categorical columns.

    Gaps are filled from the neighbouring rows (forward fill, then backward
    fill). If a column is still missing afterwards, which happens when every
    value of that column in the batch is missing, the most frequent value
    seen during ``fit`` is used instead.

    Parameters
    ----------
    variables : str, list of str, or None
        Column name(s) to impute. A single name is treated as a one-element
        list; ``None`` selects no columns, making the transformer a no-op.
    """

    def __init__(self, variables=None):
        # Stored untouched: scikit-learn's get_params / set_params expect
        # __init__ to only record its arguments. Normalisation happens at use
        # time so values set through set_params are handled the same way.
        self.variables = variables

    def _variable_list(self):
        """Return ``self.variables`` normalised to a list of column names."""
        if self.variables is None:
            return []
        if isinstance(self.variables, list):
            return self.variables
        return [self.variables]

    def fit(self, X, y=None):
        """Remember the most frequent value of every selected column."""
        self.mode_dict = {}
        for var in self._variable_list():
            modes = X[var].mode()
            # mode() is empty when the column holds no non-missing value
            if not modes.empty:
                self.mode_dict[var] = modes.iloc[0]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns imputed."""
        X = X.copy()
        # getattr keeps transform usable on instances that were never fitted
        # (or were pickled before the fallback existed)
        fallback = getattr(self, "mode_dict", {})
        for var in self._variable_list():
            # .ffill()/.bfill() replace the deprecated fillna(method=...)
            X[var] = X[var].ffill().bfill()
            if var in fallback:
                X[var] = X[var].fillna(fallback[var])
        return X

In [11]:
class NumericalEmbarked(BaseEstimator, TransformerMixin):
    """Replace the port letters in ``embarked`` with numeric codes.

    ``"S"`` -> 1, ``"C"`` -> 2, ``"Q"`` -> 3. Any other value is left as is.
    """

    def __init__(self):
        self.variable = "embarked"

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the port letters replaced by codes."""
        X = X.copy()
        for port, code in (("S", 1), ("C", 2), ("Q", 3)):
            X[self.variable] = X[self.variable].replace(port, code)
        return X

In [12]:
class NumericalRound(BaseEstimator, TransformerMixin):
    """Round the selected columns to zero decimal places.

    Parameters
    ----------
    variables : str, list of str, or None
        Column name(s) to round. A single name is treated as a one-element
        list; ``None`` selects no columns, making the transformer a no-op.
    """

    def __init__(self, variables=None):
        # Stored untouched: scikit-learn's get_params / set_params expect
        # __init__ to only record its arguments. Normalisation happens at use
        # time so values set through set_params are handled the same way.
        self.variables = variables

    def _variable_list(self):
        """Return ``self.variables`` normalised to a list of column names."""
        if self.variables is None:
            return []
        if isinstance(self.variables, list):
            return self.variables
        return [self.variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns rounded."""
        X = X.copy()
        for var in self._variable_list():
            X[var] = X[var].round(0)
        return X

In [13]:
class NumericalInt(BaseEstimator, TransformerMixin):
    """Cast the selected columns to ``int64``.

    Parameters
    ----------
    variables : str, list of str, or None
        Column name(s) to cast. A single name is treated as a one-element
        list; ``None`` selects no columns, making the transformer a no-op.
    """

    def __init__(self, variables=None):
        # Stored untouched: scikit-learn's get_params / set_params expect
        # __init__ to only record its arguments. Normalisation happens at use
        # time so values set through set_params are handled the same way.
        self.variables = variables

    def _variable_list(self):
        """Return ``self.variables`` normalised to a list of column names."""
        if self.variables is None:
            return []
        if isinstance(self.variables, list):
            return self.variables
        return [self.variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns cast to int64."""
        X = X.copy()
        for var in self._variable_list():
            X[var] = X[var].astype("int64")
        return X

In [20]:
# Preprocessing steps run in the listed order, then scaling and the classifier.
pipeline = Pipeline(
    steps=[
        # sex -> 0/1
        ("num_sex", NumericalSex()),
        # cast the numeric columns to float before their medians are taken
        ("num_float", NumericalFloat(variables=numerical_nan)),
        ("num_imputer", NumericalImputer(variables=numerical_nan)),
        # fill the categorical gaps before the letters become codes
        ("cat_imputer", CategoricalImputer(variables=cetegorical_nan)),
        ("num_emb", NumericalEmbarked()),
        # round first, then cast to integer
        ("num_round", NumericalRound(variables=numerical_nan)),
        ("num_int", NumericalInt(variables=numerical_new)),
        ("scaling", MinMaxScaler()),
        (
            "log_reg",
            LogisticRegression(class_weight="balanced", random_state=seed_model),
        ),
    ]
)

In [21]:
# load the data and peek at three random rows
df_raw = data_load()
df_raw.sample(n=3)

Unnamed: 0,pclass,survived,sex,age,fare,embarked
481,2,1,female,22,41.5792,C
1226,3,0,male,19,7.8958,S
817,3,0,male,22,8.05,S


In [22]:
# separate the label column from the feature columns
y = df_raw[target]
X = df_raw.drop(columns=target)

In [23]:
# hold out a test set; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed_split,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((1047, 5), (1047,), (262, 5), (262,))

In [24]:
# fit the preprocessing steps and the classifier on the training split
pipeline.fit(X_train, y=y_train)

Pipeline(steps=[('num_sex', NumericalSex()),
                ('num_float', NumericalFloat(variables=['age', 'fare'])),
                ('num_imputer', NumericalImputer(variables=['age', 'fare'])),
                ('cat_imputer', CategoricalImputer(variables=['embarked'])),
                ('num_emb', NumericalEmbarked()),
                ('num_round', NumericalRound(variables=['age', 'fare'])),
                ('num_int', NumericalInt(variables=['age', 'sex', 'embarked'])),
                ('scaling', MinMaxScaler()),
                ('log_reg',
                 LogisticRegression(class_weight='balanced', random_state=44))])

In [25]:
# predicted labels, and column 1 of the predicted probabilities, on the test split
class_pred = pipeline.predict(X_test)
proba_pred = pipeline.predict_proba(X_test)[:, 1]

In [26]:
# report test-set metrics
print(f'Test ROC-AUC : {roc_auc_score(y_test, proba_pred)}')
print(f'Test Accuracy: {accuracy_score(y_test, class_pred)}')

Test ROC-AUC : 0.8518135040160643
Test Accuracy: 0.7748091603053435


In [27]:
# write the fitted pipeline to disk
trained_dir = "../models/"
file_name = "titanic_pipiline.pkl"
save_path = f"{trained_dir}{file_name}"

pipeline_persist = pipeline
joblib.dump(value=pipeline_persist, filename=save_path)

['../models/titanic_pipiline.pkl']

In [28]:
# reload the persisted pipeline and score the held-out rows with it
# NOTE: joblib files are pickles; only load files from a trusted source
data = X_test.copy()
trained_model = joblib.load(save_path)

proba = trained_model.predict_proba(data)
preds = trained_model.predict(data)

In [33]:
# Side-by-side view: input rows, predicted class, and the probability in
# column 1 of predict_proba. `axis` is passed by keyword because passing it
# positionally is deprecated and rejected by newer pandas.
pd.concat(
    [
        data.reset_index(),
        pd.Series(preds, name="pred"),
        pd.Series(proba[:, 1], name="prob"),
    ],
    axis=1,
)

  """Entry point for launching an IPython kernel.


Unnamed: 0,index,pclass,sex,age,fare,embarked,pred,prob
0,1296,3,male,27,8.6625,S,0,0.161408
1,243,1,male,46,79.2,C,0,0.478715
2,145,1,male,48,76.7292,C,0,0.466596
3,36,1,female,22,55,S,1,0.925433
4,1098,3,female,3,21.075,S,1,0.773798
...,...,...,...,...,...,...,...,...
257,606,3,male,20,7.925,S,0,0.184483
258,464,2,male,32,10.5,S,0,0.285728
259,431,2,male,28,33,S,0,0.310362
260,1280,3,male,22,7.8958,S,0,0.177596
