In [None]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# --- Data source and storage -------------------------------------------------
DATASETS_DIR = 'datasets/'
URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# Columns removed from the raw download before it is written to disk.
DROP_COLS = [
    'boat',
    'body',
    'home.dest',
    'ticket',
    'name',
]
RETRIEVED_DATA = 'raw-data.csv'

# --- Train / test split ------------------------------------------------------
SEED_SPLIT = 404
TRAIN_DATA_FILE = DATASETS_DIR + 'train.csv'
TEST_DATA_FILE = DATASETS_DIR + 'test.csv'

# --- Target and feature groups -----------------------------------------------
TARGET = 'survived'
FEATURES = [
    'pclass', 'sex', 'age', 'sibsp', 'parch',
    'fare', 'cabin', 'embarked', 'title',
]
NUMERICAL_VARS = ['pclass', 'age', 'sibsp', 'parch', 'fare']
CATEGORICAL_VARS = ['sex', 'cabin', 'embarked', 'title']

# --- Missing-value policy ----------------------------------------------------
# Variables listed here are imputed by the pipeline; every other variable of
# the same group ends up in the matching *_NA_NOT_ALLOWED list.
NUMERICAL_VARS_WITH_NA = ['age', 'fare']
CATEGORICAL_VARS_WITH_NA = ['cabin', 'embarked']
NUMERICAL_NA_NOT_ALLOWED = [
    name for name in NUMERICAL_VARS
    if name not in NUMERICAL_VARS_WITH_NA
]
CATEGORICAL_NA_NOT_ALLOWED = [
    name for name in CATEGORICAL_VARS
    if name not in CATEGORICAL_VARS_WITH_NA
]

# --- Model ---------------------------------------------------------------------
SEED_MODEL = 404

In [None]:
def data_retrieval(url):
    """Download the raw CSV at ``url``, clean it and write it to disk.

    Steps performed:
      * read the CSV and turn the '?' placeholder into NaN,
      * cast 'age' and 'fare' to float,
      * keep only the first cabin listed in 'cabin',
      * derive a 'title' column from 'name',
      * drop the columns listed in DROP_COLS,
      * write the result to DATASETS_DIR + RETRIEVED_DATA (the directory is
        created when it does not exist yet).

    Parameters
    ----------
    url : str
        Location passed to ``pd.read_csv``.

    Returns
    -------
    None
        A confirmation message is printed; nothing is returned.
    """

    # Loading data from specific url
    data = pd.read_csv(url)

    # Uncovering missing data: '?' is replaced by NaN before the numeric casts
    data = data.replace('?', np.nan)
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')

    # helper function 1
    def get_first_cabin(row):
        """Return the first whitespace-separated token of ``row``, else NaN."""
        try:
            return row.split()[0]
        except (AttributeError, IndexError):
            # AttributeError: value is not a string (e.g. NaN)
            # IndexError: string is empty or whitespace only
            return np.nan

    # helper function 2
    def get_title(passenger):
        """Return the first matching title found in ``passenger``, else 'Other'."""
        if not isinstance(passenger, str):
            return 'Other'
        # 'Mrs' must be tested before 'Mr' because 'Mr' is a prefix of 'Mrs'.
        for title in ('Mrs', 'Mr', 'Miss', 'Master'):
            if title in passenger:
                return title
        return 'Other'

    # Keep only one cabin | Extract the title from 'name'
    data['cabin'] = data['cabin'].apply(get_first_cabin)
    data['title'] = data['name'].apply(get_title)

    # Dropping irrelevant columns (keyword form: positional axis was removed in pandas 2.0)
    data = data.drop(columns=DROP_COLS)

    # Make sure the target directory exists before writing
    output_path = DATASETS_DIR + RETRIEVED_DATA
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path, index=False)

    print('Data stored in {}'.format(output_path))

In [None]:
class MissingIndicator(BaseEstimator, TransformerMixin):
    """Add a binary ``<variable>_nan`` column for each configured variable."""

    def __init__(self, variables=None):
        # A non-list argument is wrapped into a one-element list.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one 0/1 missing-value flag per variable."""
        flagged = X.copy()
        for column in self.variables:
            is_missing = flagged[column].isnull()
            flagged[column + '_nan'] = is_missing.astype(int)
        return flagged


class ExtractLetters(BaseEstimator, TransformerMixin):
    """Reduce every string in the 'cabin' column to its ASCII letters."""

    def __init__(self):
        # Name of the column this transformer rewrites.
        self.variable = 'cabin'

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose cabin strings keep only their letters.

        Non-string values (e.g. NaN) are passed through unchanged.
        """
        X = X.copy()

        def keep_letters(value):
            # isinstance (rather than an exact type comparison) also covers
            # str subclasses such as numpy.str_.
            if isinstance(value, str):
                return ''.join(re.findall("[a-zA-Z]+", value))
            return value

        X[self.variable] = X[self.variable].apply(keep_letters)
        return X


class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Replace missing values of the configured variables with 'Missing'."""

    def __init__(self, variables=None):
        # A non-list argument is wrapped into a one-element list.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in each variable set to 'Missing'."""
        imputed = X.copy()
        for column in self.variables:
            imputed[column] = imputed[column].fillna('Missing')
        return imputed


class NumericalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of the configured variables with their fitted median."""

    def __init__(self, variables=None):
        # A non-list argument is wrapped into a one-element list.
        self.variables = variables if isinstance(variables, list) else [variables]

    def fit(self, X, y=None):
        """Store the median of every configured variable in ``median_dict_``."""
        self.median_dict_ = {
            column: X[column].median() for column in self.variables
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the stored medians."""
        imputed = X.copy()
        for column in self.variables:
            fill_value = self.median_dict_[column]
            imputed[column] = imputed[column].fillna(fill_value)
        return imputed


class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Group infrequent labels of the configured variables under 'rare'.

    Parameters
    ----------
    tol : float, default 0.05
        Labels whose share of rows (label count divided by the number of rows
        seen in ``fit``) is strictly below ``tol`` are treated as rare.
    variables : list or single column name
        Columns to encode. A non-list argument is wrapped into a list.
    """

    def __init__(self, tol=0.05, variables=None):
        self.tol = tol
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Record, per variable, the labels whose frequency is below ``tol``."""
        self.rare_labels_dict = {}
        # Built-in float replaces the np.float alias, which NumPy 1.24 removed.
        n_rows = float(X.shape[0])
        for var in self.variables:
            frequencies = X[var].value_counts() / n_rows
            self.rare_labels_dict[var] = list(frequencies[frequencies < self.tol].index)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the recorded rare labels set to 'rare'."""
        X = X.copy()
        for var in self.variables:
            X[var] = np.where(X[var].isin(self.rare_labels_dict[var]), 'rare', X[var])
        return X


class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the configured variables with a fixed set of dummy columns.

    ``fit`` records the dummy columns produced with ``drop_first=True`` on the
    training data; ``transform`` always emits exactly those columns.
    """

    def __init__(self, variables=None):
        if not isinstance(variables, list):
            self.variables = [variables]
        else:
            self.variables = variables

    def fit(self, X, y=None):
        """Remember the dummy column names derived from the training data."""
        self.dummies = pd.get_dummies(X[self.variables], drop_first=True).columns
        return self

    def transform(self, X):
        """Return ``X`` with the variables replaced by the fitted dummy columns.

        The reference level to drop is decided once, in ``fit``. Encoding here
        is done WITHOUT ``drop_first`` and then aligned to the fitted columns:
        dropping the first level of the current batch would remove a different
        level whenever the batch lacks the training reference level (for a
        single-row input it would remove every dummy). Dummy columns not
        present in the batch are added as 0; dummy columns for labels not seen
        in ``fit`` are discarded.
        """
        encoded = pd.get_dummies(X[self.variables]).reindex(
            columns=self.dummies, fill_value=0
        )
        # Keyword arguments: positional `axis` was removed in pandas 2.0.
        return pd.concat([X.drop(columns=self.variables), encoded], axis=1)


class OrderingFeatures(BaseEstimator, TransformerMixin):
    """Select the columns seen during ``fit``, in that same order."""

    def __init__(self):
        # No configuration is needed.
        pass

    def fit(self, X, y=None):
        """Store the column index of ``X`` as ``ordered_features``."""
        self.ordered_features = X.columns
        return self

    def transform(self, X):
        """Return ``X`` restricted to, and ordered by, the stored columns."""
        columns_in_fit_order = self.ordered_features
        return X[columns_in_fit_order]


# scaler = MinMaxScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test  = scaler.transform(X_test)

# model = LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)
# model.fit(X_train, y_train)

In [None]:
# End-to-end pipeline: custom preprocessing steps, scaling, then the classifier.
titanic_pipeline = Pipeline(steps=[
    # Flag missing values before any imputation overwrites them.
    ('missing_indicator', MissingIndicator(variables=NUMERICAL_VARS)),
    ('cabin_only_letter', ExtractLetters()),
    ('categorical_imputer', CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)),
    ('median_imputation', NumericalImputer(variables=NUMERICAL_VARS_WITH_NA)),
    ('rare_labels', RareLabelCategoricalEncoder(tol=0.05, variables=CATEGORICAL_VARS)),
    ('dummy_vars', OneHotEncoder(variables=CATEGORICAL_VARS)),
    # Fix the column set and order before the array-based steps.
    ('aligning_feats', OrderingFeatures()),
    ('scaling', MinMaxScaler()),
    ('log_reg', LogisticRegression(C=0.0005, class_weight='balanced', random_state=SEED_MODEL)),
])

In [None]:
# Load the cleaned dataset. On a fresh checkout the file does not exist yet,
# so it is downloaded and prepared once with data_retrieval before reading.
raw_data_path = DATASETS_DIR + RETRIEVED_DATA
try:
    df = pd.read_csv(raw_data_path)
except FileNotFoundError:
    data_retrieval(URL)
    df = pd.read_csv(raw_data_path)

# Hold out 20% of the rows; the seed comes from the config cell.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(TARGET, axis=1),
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT,
)

FileNotFoundError: ignored

In [None]:
# Fit all pipeline steps (preprocessing, scaling and the logistic regression)
# on the training split; the trailing ';' hides the fitted pipeline's repr.
titanic_pipeline.fit(X_train, y_train);

In [None]:
# Predicted labels and positive-class probabilities for the held-out split.
class_pred = titanic_pipeline.predict(X_test)
proba_pred = titanic_pipeline.predict_proba(X_test)[:,1]

# Compute both metrics first, then report them.
test_roc_auc = roc_auc_score(y_test, proba_pred)
test_accuracy = accuracy_score(y_test, class_pred)
print('test roc-auc : {}'.format(test_roc_auc))
print('test accuracy: {}'.format(test_accuracy))
print()

test roc-auc : 0.8163583073823043
test accuracy: 0.7748091603053435



## Persisting the trained model

In [None]:
import joblib

TRAINED_MODEL_DIR = 'trained_models/'
PIPELINE_NAME = 'logistic_regression'
PIPELINE_SAVE_FILE = f'{PIPELINE_NAME}_output.pkl'

save_file_name = PIPELINE_SAVE_FILE
save_path = TRAINED_MODEL_DIR + save_file_name

pipeline_to_persist = titanic_pipeline

# Write the fitted pipeline so the prediction cells below load the model that
# was trained in this run. The directory is created when missing.
os.makedirs(TRAINED_MODEL_DIR, exist_ok=True)
joblib.dump(pipeline_to_persist, save_path);

## Predictions

**Basic input validation**

In [None]:
# Take a copy of the held-out features to use as the prediction input,
# leaving X_test itself untouched.
input_data = X_test.copy()

In [None]:
# Rows with a missing value in a variable the pipeline does not impute are
# discarded; the check is done per group (numerical first, then categorical).
validated_data = input_data

for required_columns in (NUMERICAL_NA_NOT_ALLOWED, CATEGORICAL_NA_NOT_ALLOWED):
    has_missing = input_data[required_columns].isnull().any().any()
    if has_missing:
        validated_data = validated_data.dropna(subset=required_columns)

**Making predictions**

In [None]:
# Location of the persisted pipeline (same directory and file name used when saving).
file_path = TRAINED_MODEL_DIR + PIPELINE_SAVE_FILE

# NOTE(review): joblib.load unpickles the file, so only load files you trust.
trained_model = joblib.load(file_path)

# Class labels and per-class probabilities for the validated rows.
preds = trained_model.predict(validated_data)
proba = trained_model.predict_proba(validated_data)

In [None]:
# Show the validated inputs next to the predicted label and the probability of
# class 1. `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
pd.concat(
    [
        validated_data.reset_index(),
        pd.Series(preds, name='preds'),
        pd.Series(proba[:, 1], name='probas'),
    ],
    axis=1,
).head()

Unnamed: 0,index,pclass,sex,age,sibsp,parch,fare,cabin,embarked,title,preds,probas
0,215,1,male,58.0,0,2,113.275,D48,C,Mr,1,0.502177
1,378,2,male,31.0,1,1,26.25,,S,Mr,0,0.481497
2,695,3,female,18.0,0,0,7.8792,,Q,Miss,1,0.513358
3,414,2,male,34.0,1,0,21.0,,S,Mr,0,0.481422
4,509,2,male,39.0,0,0,26.0,,S,Mr,0,0.481452


In [None]:
# preds == class_pred

## Predictions with the model served as REST API

In [None]:
import json
import requests

url = 'http://127.0.0.1:5000/v1/predict/classification'
headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
# Upper bound, in seconds, for each request so an unreachable server cannot
# block the notebook forever.
REQUEST_TIMEOUT_SECONDS = 10

# Hand-written sample payloads (not sent by the loop below).
data1 = {"pclass":1,"sex":"male","age":58.0,"sibsp":0,"parch":2,"fare":113.275,"cabin":"D48","embarked":"C","title":"Mr"}
data2 = {"pclass":2,"sex":"male","age":31.0,"sibsp":1,"parch":1,"fare":26.25,"cabin":None,"embarked":"S","title":"Mr"}
data3 = {"pclass":3,"sex":"female","age":18.0,"sibsp":0,"parch":0,"fare":7.8792,"cabin":None,"embarked":"Q","title":"Miss"}
data4 = {"pclass":2,"sex":"male","age":34.0,"sibsp":1,"parch":0,"fare":21.0,"cabin":None,"embarked":"S","title":"Mr"}
data5 = {"pclass":2,"sex":"male","age":39.0,"sibsp":0,"parch":0,"fare":26.0,"cabin":None,"embarked":"S","title":"Mr"}

# Send the first 25 test rows (fewer if X_test is shorter), one request per row.
# The rows go through DataFrame.to_json so that NaN is serialized as JSON null.
records = json.loads(X_test[:25].to_json(orient='records'))
for info in records:
    x = requests.post(url, data=json.dumps(info), headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    print(x.json())

{'errors': None, 'predictions': 1, 'probas': 0.5310253087423779, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5263773680278067, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5219452589901723, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5263026785829125, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5263332008249159, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5219136288521562, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5264846114230751, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5307701281686432, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5309518672521598, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5315805668982763, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5278059761556281, 'version': '1.0.0'}
{'errors': None, 'predictions': 1, 'probas': 0.5188765127026285, 

## pd.DataFrame.from_dict([x.json()], orient='columns')

___

In [None]:
# X_test.head(5)

In [None]:
# Sample JSON payload for one passenger; parse it and show the resulting Python type.
prueba = '{"pclass":1,"sex":"male","age":58.0,"sibsp":0,"parch":2,"fare":113.275,"cabin":"D48","embarked":"C","title":"Mr"}'
parsed_prueba = json.loads(prueba)
type(parsed_prueba)

dict

In [None]:
# Report ROC-AUC and accuracy of the fitted pipeline on both splits.
# The pipeline is used directly: it is the fitted estimator defined in this
# notebook and it accepts the raw feature frames.
for split_name, (split_X, split_y) in zip(['train', 'test'], [(X_train, y_train), (X_test, y_test)]):
    split_class_pred = titanic_pipeline.predict(split_X)
    split_proba_pred = titanic_pipeline.predict_proba(split_X)[:, 1]
    print('{} roc-auc : {}'.format(split_name, roc_auc_score(split_y, split_proba_pred)))
    print('{} accuracy: {}'.format(split_name, accuracy_score(split_y, split_class_pred)))
    print()

train roc-auc : 0.8470412710714978
train accuracy: 0.7831900668576887

test roc-auc : 0.8163583073823043
test accuracy: 0.7748091603053435



In [None]:
# Inspect the model inputs next to the outcomes for the test split.
# titanic_pipeline[:-1] is the pipeline without its final classifier step, so
# its transform() yields the scaled feature matrix the classifier receives.
feature_steps = titanic_pipeline[:-1]
feature_names = list(titanic_pipeline.named_steps['aligning_feats'].ordered_features)

tmp = pd.DataFrame(feature_steps.transform(X_test), columns=feature_names)
tmp['y_true'] = np.array(y_test)
tmp['y_pred'] = titanic_pipeline.predict(X_test)
tmp['proba_pred'] = titanic_pipeline.predict_proba(X_test)[:,1]

tmp.head(10)

Unnamed: 0,pclass,age,sibsp,parch,fare,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan,...,cabin_rare,embarked_Q,embarked_S,embarked_rare,title_Mr,title_Mrs,title_rare,y_true,y_pred,proba_pred
0,0.0,0.724426,0.0,0.222222,0.221098,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0.502177
1,0.5,0.386221,0.125,0.111111,0.051237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481497
2,1.0,0.223382,0.0,0.0,0.015379,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0.513358
3,0.5,0.423799,0.125,0.0,0.040989,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481422
4,0.5,0.48643,0.0,0.0,0.050749,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481452
5,1.0,0.298538,0.0,0.0,0.01394,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.47703
6,0.5,0.160751,0.0,0.111111,0.038061,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1,0.514231
7,0.0,0.611691,0.125,0.0,0.111118,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0.501921
8,0.0,0.398747,0.0,0.0,0.148911,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.534687
9,0.0,0.26096,0.25,0.222222,0.512122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.531581
