# Table of contents

- [Titanic](#Titanic)
  - [Setup](#Setup)
  - [Data](#Data)
    - [Download](#Download)
    - [Split Data](#Split-Data)
  - [Advanced Models' Common Functions](#Advanced-Models'-Common-Functions)
  - [Custom Transformers](#Custom-Transformers)
  - [Pipelines](#Pipelines)
  - [Logistic Regression : 0.76555](#Logistic-Regression-:-0.76555)

# Titanic

This notebook was inspired by the book [*Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow*](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/). 

Thanks to the author, [Aurélien Géron](https://github.com/ageron).

## Setup

What does the environment require?

In [1]:
# Python ≥3.5 is required
from pathlib import Path
import re
import sys

import numpy as np
import pandas as pd
import sklearn

# Compare the scikit-learn version numerically. A plain string comparison is
# lexicographic, so e.g. '0.9' >= '0.20' would wrongly pass.
# Only the leading (major, minor) numbers are used; suffixes such as 'rc1' or
# '.dev0' are ignored.
sklearn_version = tuple(
    int(number) for number in re.findall(r'\d+', sklearn.__version__)[:2])
assert sklearn_version >= (0, 20), 'scikit-learn ≥0.20 is required'
assert sys.version_info >= (3, 5), 'Python ≥3.5 is required'

# Seed NumPy's global random generator so runs are repeatable.
np.random.seed(42)

## Data

### Download

In [2]:
def load_titanic_dataset(filename, path='titanic_dataset'):
    """Read one CSV file of the Titanic dataset into a DataFrame.

    Args:
        filename: Name of the CSV file, e.g. ``'train.csv'``.
        path: Directory that contains the file.

    Returns:
        The content of ``<path>/<filename>`` as a ``pandas.DataFrame``.
    """
    dataset_dir = Path(path)
    return pd.read_csv(dataset_dir / filename)


# `data` is the frame that gets split into train/test sets further down.
data = load_titanic_dataset('train.csv')
# `submit` provides the features to predict and the PassengerId column of the
# submission files.
submit = load_titanic_dataset('test.csv')
# Loaded for reference; not used by the cells visible in this notebook.
gender_submission = load_titanic_dataset('gender_submission.csv')

### Split Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; stratifying on 'Sex' keeps the same
# male/female proportions in both subsets.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Sex'],
)

## Advanced Models' Common Functions

Once again, here are some helper functions that keep the code [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). Feel free to skip them for now and come back when you need to.

In [4]:
import joblib
from IPython.display import Audio

# Sound played by `ring()`; playback is disabled by default.
SOUND_FILE_NAME = './no_sound.wav'
USE_SOUND_FILE = False


def ring(use_sound_file=USE_SOUND_FILE, sound_file=SOUND_FILE_NAME):
    """Return an auto-playing audio widget, or ``None`` when sound is disabled.

    Meant to be the last expression of a cell so that the returned
    ``Audio`` object is displayed (and therefore played) when the cell ends.

    Args:
        use_sound_file: When falsy, nothing is created and ``None`` is returned.
        sound_file: Sound file handed to ``IPython.display.Audio``.
    """
    if not use_sound_file:
        return None
    return Audio(sound_file, rate=1, autoplay=True)


def print_model_stats(cv_clf):
    """Print the best parameters and best score of a fitted search object.

    The parameters are printed as a ``params = {...}`` dict in which every
    value is wrapped in a one-element list, so the output can be pasted back
    as a parameter grid.

    Args:
        cv_clf: Object exposing ``best_params_`` (a mapping) and
            ``best_score_`` (a number), e.g. a fitted ``GridSearchCV``.
    """
    entries = []
    for name, value in cv_clf.best_params_.items():
        # Quote string values so the printed dict stays valid Python.
        shown = f"'{value}'" if isinstance(value, str) else f'{value}'
        entries.append(f"\n    '{name}': [{shown}]")

    print('params = {' + ','.join(entries) + '\n}')
    print(f"CV's best accuracy : {cv_clf.best_score_:.5f}")


def save_and_load_model(cv_clf, model_name, x_test_tfm,
                        passenger_ids=None, output_dir='submissions'):
    """Write a submission CSV, persist the model and return it re-loaded.

    Two files are written to ``output_dir`` (created if missing):
    ``<model_name>.csv`` with the columns ``PassengerId`` and ``Survived``,
    and ``<model_name>.pkl`` with the model dumped by joblib.

    Args:
        cv_clf: Fitted estimator exposing ``predict``.
        model_name: Base name (without extension) of the two output files.
        x_test_tfm: Features to predict; one row per submitted passenger.
        passenger_ids: Ids written to the ``PassengerId`` column, in the same
            row order as ``x_test_tfm``. Defaults to
            ``submit['PassengerId']`` from the notebook-level ``submit`` frame.
        output_dir: Directory receiving the CSV and pickle files.

    Returns:
        The model as re-loaded from the pickle file that was just written.
    """
    predictions = cv_clf.predict(x_test_tfm)

    if passenger_ids is None:
        # Backward-compatible default: rely on the notebook-level frame.
        passenger_ids = submit['PassengerId']

    submission = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': predictions
    })

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    submission.to_csv(output_dir / f'{model_name}.csv', index=False)

    # Round-trip through disk so the returned object is exactly what a later
    # session would load.
    model_path = output_dir / f'{model_name}.pkl'
    joblib.dump(cv_clf, model_path)
    return joblib.load(model_path)

## Custom Transformers

In [5]:
from sklearn.base import TransformerMixin, BaseEstimator


class tfm_example(TransformerMixin, BaseEstimator):
    """Minimal custom transformer used as a template.

    When ``do_tfm`` is true, ``transform`` returns ``X + X``; otherwise the
    input is returned unchanged.
    """

    def __init__(self, do_tfm=False):
        # Stored under the same name as the argument, as `BaseEstimator`
        # parameter introspection expects.
        self.do_tfm = do_tfm

    def fit(self, X, y=None):
        """Nothing to learn: return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X + X`` when enabled, else ``X`` itself."""
        return X + X if self.do_tfm else X

## Pipelines

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Feature and target column names. Both are lists, so every selection below
# is a (single-column) DataFrame rather than a Series.
x_att, y_att = ['Sex'], ['Survived']

x_train, y_train = train[x_att], train[y_att]
x_test, y_test = test[x_att], test[y_att]

# The submission frame only provides features.
x_submit = submit[x_att]

# Categorical columns are one-hot encoded.
categorical_tfm = Pipeline(steps=[('one_hot', OneHotEncoder())])

# Position of 'Sex' among the selected feature columns (`x_att`).
sex_idx = 0
pipeline = ColumnTransformer(
    transformers=[('cat', categorical_tfm, [sex_idx])],
    remainder='drop',  # any column not listed above is discarded
)

## Logistic Regression : 0.76555


In [7]:
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by the classifier. The step names ('pipe', 'log')
# are the prefixes of the hyper-parameter names listed below.
log_pipeline = Pipeline(steps=[
    ('pipe', pipeline),
    ('log', LogisticRegression(random_state=42)),
])

# Show every parameter name that can be used in a search grid.
print(log_pipeline.get_params().keys())

dict_keys(['memory', 'steps', 'verbose', 'pipe', 'log', 'pipe__n_jobs', 'pipe__remainder', 'pipe__sparse_threshold', 'pipe__transformer_weights', 'pipe__transformers', 'pipe__verbose', 'pipe__cat', 'pipe__cat__memory', 'pipe__cat__steps', 'pipe__cat__verbose', 'pipe__cat__one_hot', 'pipe__cat__one_hot__categories', 'pipe__cat__one_hot__drop', 'pipe__cat__one_hot__dtype', 'pipe__cat__one_hot__handle_unknown', 'pipe__cat__one_hot__sparse', 'log__C', 'log__class_weight', 'log__dual', 'log__fit_intercept', 'log__intercept_scaling', 'log__l1_ratio', 'log__max_iter', 'log__multi_class', 'log__n_jobs', 'log__penalty', 'log__random_state', 'log__solver', 'log__tol', 'log__verbose', 'log__warm_start'])


In [8]:
from sklearn.model_selection import GridSearchCV

# Grid with a single candidate: one value per hyper-parameter.
params = {
    'log__C': [1],
    'log__dual': [True],
    'log__fit_intercept': [True],
    'log__max_iter': [10000],
    'log__penalty': ['l2'],
    'log__solver': ['liblinear'],
}
# Wider grid, kept for reference:
# params = {
#     'log__C': [1, 10],
#     'log__dual': [True, False],
#     'log__fit_intercept': [True, False],
#     'log__max_iter': [10**4],
#     'log__penalty': ['l1', 'l2', 'elasticnet', 'none'],
#     'log__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
# }
cv_log = GridSearchCV(estimator=log_pipeline,
                      param_grid=params,
                      scoring='accuracy',
                      verbose=2)
# `y_train` is a one-column DataFrame; `ravel()` flattens it to a 1-D array.
cv_log.fit(x_train, y_train.values.ravel())
# Last expression of the cell: plays a sound when enabled.
ring()

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear 
[CV]  log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear, total=   0.0s
[CV] log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear 
[CV]  log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear, total=   0.0s
[CV] log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear 
[CV]  log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear, total=   0.0s
[CV] log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000, log__penalty=l2, log__solver=liblinear 
[CV]  log__C=1, log__dual=True, log__fit_intercept=True, log__max_iter=10000,

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [9]:
# Report the cross-validation result.
print_model_stats(cv_log)

# Write the submission file, persist the model and work with the re-loaded
# copy from here on.
model_name = 'log_000'
joblib_log = save_and_load_model(cv_log, model_name, x_submit)

# Accuracy on the held-out test split.
score = round(joblib_log.score(x_test, y_test), 5)
print(f"Test set's score : {score:.5f}")

params = {
    'log__C': [1],
    'log__dual': [True],
    'log__fit_intercept': [True],
    'log__max_iter': [10000],
    'log__penalty': ['l2'],
    'log__solver': ['liblinear']
}
CV's best accuracy : 0.78928
Test set's score : 0.77654
