In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from category_encoders import HashingEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
from prettytable import PrettyTable
import time
import warnings
warnings.filterwarnings(
    "ignore",
    message="This Pipeline instance is not fitted yet.*",
    category=FutureWarning
)

In [2]:
dataset_path = 'E:/Datasets/titanic/wrangled dataset'

In [3]:
train_w = pd.read_csv(f'{dataset_path}/train.csv')
test_w = pd.read_csv(f'{dataset_path}/test.csv')

In [4]:
print(train_w.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family', 'Title',
       'Deck', 'TicketPrefix'],
      dtype='object')


In [5]:
X = train_w.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)
#X_test = test_w.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_w['Survived']

In [6]:
models = {}

In [7]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family,Title,Deck,TicketPrefix
0,3,0,22.0,1,0,7.25,S,1,Mr,u,A/
1,1,1,38.0,1,0,71.2833,C,1,Mrs,C,PC
2,3,1,26.0,0,0,7.925,S,0,Miss,u,STON/O
3,1,1,35.0,1,0,53.1,S,1,Mrs,C,NONE
4,3,0,35.0,0,0,8.05,S,0,Mr,u,NONE


# Pipelines

## Linear Models

### LogisticRegression

In [8]:
from sklearn.linear_model import LogisticRegression

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'C' : 1,
    'penalty' : 'l2',
    'solver' : 'lbfgs',
    'l1_ratio' : None,
    'class_weight' : None
}

def logistic_regression_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = LogisticRegression(
        C=params['C'],
        penalty=params['penalty'],
        solver=params['solver'],
        l1_ratio=params['l1_ratio'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])    

In [9]:
#Divide the search space to avoid inconsistencies
param_search_space = [
    #Liblinear only supports 'l1' and 'l2' penalties
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l1'],
        'model__solver': ['liblinear'],
        'model__l1_ratio': uniform(0, 1),
        'model__class_weight': [None, 'balanced']
    }, 
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l2'],
        'model__solver': ['liblinear'],
        'model__class_weight': [None, 'balanced']
    },    
    #The rest of solvers doesn't support 'l1'
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
        'model__class_weight': [None, 'balanced']
    }
]

In [10]:
models['LogisticRegression'] = {}
models['LogisticRegression']['pipeline'] = logistic_regression_pipeline
models['LogisticRegression']['hyperparams'] = hyperparams
models['LogisticRegression']['param_search'] = param_search_space

### RidgeClassifier 

In [11]:
from sklearn.linear_model import RidgeClassifier

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'alpha' : 1,
    'solver' : 'auto',
    'fit_intercept' : True,
    'class_weight' : None,
}

def ridge_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = RidgeClassifier(
        alpha=params['alpha'],
        solver=params['solver'],
        fit_intercept=params['fit_intercept'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])    

In [12]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__alpha': loguniform(1e-4, 1e4), 
    'model__solver': ['auto', 'lsqr', 'sparse_cg', 'sag'],
    'model__fit_intercept': [True, False],
    'model__class_weight': [None, 'balanced']
}

In [13]:
models['RidgeClassifier'] = {}
models['RidgeClassifier']['pipeline'] = ridge_classifier_pipeline
models['RidgeClassifier']['hyperparams'] = hyperparams
models['RidgeClassifier']['param_search'] = param_search_space

### PassiveAggressiveClassifier

In [14]:
from sklearn.linear_model import PassiveAggressiveClassifier

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'C' : 1,
    'fit_intercept' : True,
    'loss' : 'hinge',
    'average' : False,
    'class_weight' : None
}

def passive_aggressive_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = PassiveAggressiveClassifier(
        C=params['C'],
        fit_intercept=params['fit_intercept'],
        loss=params['loss'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [15]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__C': loguniform(1e-4, 1e4),
    'model__fit_intercept': [True, False],
    'model__loss': ['hinge', 'squared_hinge'],
    'model__class_weight': [None, 'balanced'],
    'model__average': [True, False]
}

In [16]:
models['PassiveAggressiveClassifier'] = {}
models['PassiveAggressiveClassifier']['pipeline'] = passive_aggressive_classifier_pipeline
models['PassiveAggressiveClassifier']['hyperparams'] = hyperparams
models['PassiveAggressiveClassifier']['param_search'] = param_search_space

### SGDClassifier 

In [18]:
from sklearn.linear_model import SGDClassifier

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'loss' : 'hinge',
    'penalty' : 'l2',
    'alpha' : 0.0001,
    'l1_ratio' : 0.15,
    'learning_rate' : 'optimal',
    'eta0' : 0,
    'power_t' : 0.5,
    'average' : False,
    'class_weight' : None,
}

def sgd_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = SGDClassifier(
        loss=params['loss'],
        penalty=params['penalty'],
        alpha=params['alpha'],
        l1_ratio=params['l1_ratio'],
        learning_rate=params['learning_rate'],
        eta0=params['eta0'],
        power_t=params['power_t'],
        average=params['average'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [19]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__loss':            ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'], 
    'model__penalty':         ['l2', 'l1', 'elasticnet'], 
    'model__alpha':           loguniform(1e-6, 1e-1), 
    'model__l1_ratio':        uniform(0.0, 1.0), 
    'model__learning_rate':   ['optimal', 'invscaling', 'adaptive'], 
    'model__eta0':            loguniform(1e-4, 1e-1), 
    'model__power_t':         uniform(0.1, 0.9), 
    'model__class_weight':    [None, 'balanced'], 
    'model__average':         [True, False],
}

In [20]:
models['SGDClassifier'] = {}
models['SGDClassifier']['pipeline'] = sgd_classifier_pipeline
models['SGDClassifier']['hyperparams'] = hyperparams
models['SGDClassifier']['param_search'] = param_search_space

### Perceptron

In [82]:
from sklearn.linear_model import Perceptron

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'penalty' : None,
    'alpha' : 0.0001,
    'l1_ratio' : 0.15,
    'fit_intercept' : True,
    'eta0' : 1.0,
    'shuffle' : True,
    'class_weight' : None,
}

def perceptron_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = Perceptron(
        penalty=params['penalty'],
        alpha=params['alpha'],
        l1_ratio=params['l1_ratio'],
        fit_intercept=params['fit_intercept'],
        eta0=params['eta0'],
        shuffle=params['shuffle'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [83]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__penalty': [None, 'l2', 'l1', 'elasticnet'], 
    'model__alpha': loguniform(1e-6, 1e-1), 
    'model__l1_ratio': uniform(0.0, 1.0), 
    'model__fit_intercept': [True, False],
    'model__eta0': loguniform(1e-4, 1e-1), 
    'model__shuffle': [True, False],
    'model__class_weight': [None, 'balanced']
}

In [84]:
models['Perceptron'] = {}
models['Perceptron']['pipeline'] = perceptron_pipeline
models['Perceptron']['hyperparams'] = hyperparams
models['Perceptron']['param_search'] = param_search_space

## Nearest Neighbors

### KNeighborsClassifier

In [47]:
from sklearn.neighbors import KNeighborsClassifier

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'n_neighbors' : 5,
    'weights' : 'uniform',
    'p' : 2,
    'leaf_size' : 30,
}

def knn_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = KNeighborsClassifier(
        n_neighbors=params['n_neighbors'],
        weights=params['weights'],
        p=params['p'],
        leaf_size=params['leaf_size']
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [48]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__n_neighbors': randint(3, 51),
    'model__weights': ['uniform', 'distance'], 
    'model__p': [1, 2], 
    'model__leaf_size': randint(10, 61), 
}

In [49]:
models['KNeighborsClassifier'] = {}
models['KNeighborsClassifier']['pipeline'] = knn_classifier_pipeline
models['KNeighborsClassifier']['hyperparams'] = hyperparams
models['KNeighborsClassifier']['param_search'] = param_search_space

## Tree-based models

### DecisionTreeClassifier

In [16]:
from sklearn.tree import DecisionTreeClassifier
hyperparams = {
    'n_features': 8,
    'criterion' : 'gini',
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 1,
    'max_features' : None,
    'class_weight' : None,
    'ccp_alpha' : 0
}

def decision_tree_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') #Replaces Hashing encoder
    #hashing_tf = HashingEncoder(n_components=params['n_components'], cols=hash_cols)
    
    model = DecisionTreeClassifier(
        criterion=params['criterion'], 
        max_depth=params['max_depth'], 
        min_samples_split=params['min_samples_split'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'], 
        class_weight=params['class_weight'],
        ccp_alpha=params['ccp_alpha'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [13]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__criterion': ['gini', 'entropy', 'log_loss'],
    'model__max_depth': [None] + list(range(1, 21)),
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 21),
    'model__max_features': [None, 'sqrt', 'log2'],
    'model__class_weight': [None, 'balanced'],
    'model__ccp_alpha': loguniform(1e-4, 1e-1)
}

In [17]:
models['DecisionTreeClassifier'] = {}
models['DecisionTreeClassifier']['pipeline'] = decision_tree_pipeline
models['DecisionTreeClassifier']['hyperparams'] = hyperparams
models['DecisionTreeClassifier']['param_search'] = param_search_space

### RandomForestClassifier

This model is not affected too much by scaling, so I am going to leave untouch the numerical values.

In [22]:
from sklearn.ensemble import RandomForestClassifier
hyperparams = {
    'n_features': 8,
    'n_estimators' : 100,
    'criterion' : 'gini',
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 5,
    'max_features' : 'sqrt',
    'bootstrap' : True,
    'class_weight' : None,
    'ccp_alpha' : 0
}

def random_forest_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') #Replaces Hashing encoder
    #hashing_tf = HashingEncoder(n_components=params['n_components'], cols=hash_cols)
    
    model = RandomForestClassifier(
        n_estimators=params['n_estimators'], 
        criterion=params['criterion'], 
        max_depth=params['max_depth'], 
        min_samples_split=params['min_samples_split'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'], 
        bootstrap=params['bootstrap'], 
        class_weight=params['class_weight'],
        ccp_alpha=params['ccp_alpha'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [23]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__n_estimators': randint(100, 501),
    'model__criterion': ['gini', 'entropy', 'log_loss'],
    'model__max_depth': [None] + list(range(5, 51, 5)),
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 11),
    'model__max_features': ['auto', 'sqrt', 'log2', 0.2, 0.5],
    'model__bootstrap': [True, False],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    'model__ccp_alpha': uniform(0.0, 0.01)
}


In [None]:
models['RandomForestClassifier'] = {}
models['RandomForestClassifier']['pipeline'] = random_forest_pipeline
models['RandomForestClassifier']['hyperparams'] = hyperparams
models['RandomForestClassifier']['param_search'] = param_search_space

### ExtraTreesClassifier

In [23]:
from sklearn.ensemble import ExtraTreesClassifier
hyperparams = {
    'n_features': 8,
    'n_estimators' : 100,
    'criterion' : 'gini',
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 5,
    'max_features' : 'sqrt',
    'bootstrap' : False,
    'class_weight' : None,
    'ccp_alpha' : 0
}

def extra_trees_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') #Replaces Hashing encoder
    #hashing_tf = HashingEncoder(n_components=params['n_components'], cols=hash_cols)
    
    model = ExtraTreesClassifier(
        n_estimators=params['n_estimators'], 
        criterion=params['criterion'], 
        max_depth=params['max_depth'], 
        min_samples_split=params['min_samples_split'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'], 
        bootstrap=params['bootstrap'], 
        class_weight=params['class_weight'],
        ccp_alpha=params['ccp_alpha'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [27]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__n_estimators': randint(100, 501),
    'model__criterion': ['gini', 'entropy', 'log_loss'],
    'model__max_depth': [None] + list(range(1, 31)),
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 21),
    'model__max_features': ['sqrt', 'log2', 0.2, 0.5],
    'model__bootstrap': [True, False],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    'model__ccp_alpha': uniform(0.0, 0.01)
}

In [25]:
models['ExtraTreesClassifier'] = {}
models['ExtraTreesClassifier']['pipeline'] = extra_trees_pipeline
models['ExtraTreesClassifier']['hyperparams'] = hyperparams
models['ExtraTreesClassifier']['param_search'] = param_search_space

## Boosting

### GradientBoostingClassifier

In [37]:
from sklearn.ensemble import GradientBoostingClassifier
hyperparams = {
    'n_features': 8,
    'loss': 'log_loss',
    'criterion' : 'gini',
    'learning_rate': 0.1,
    'n_estimators' : 100,
    'subsample' : 1,
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 5,
    'max_features' : 'sqrt'
}

def gradient_boosting_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') #Replaces Hashing encoder
    #hashing_tf = HashingEncoder(n_components=params['n_components'], cols=hash_cols)
    
    model = GradientBoostingClassifier(
        n_estimators=params['n_estimators'], 
        loss=params['loss'], 
        criterion=params['criterion'], 
        learning_rate=params['learning_rate'], 
        subsample=params['subsample'], 
        max_depth=params['max_depth'], 
        min_samples_split=params['min_samples_split'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [38]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__loss': ['log_loss', 'exponential'],
    'model__criterion': ['friedman_mse', 'squared_error'],
    'model__learning_rate': loguniform(1e-3, 1e-1),
    'model__n_estimators': randint(100, 1000),
    'model__subsample': uniform(0.5, 0.5),
    'model__max_depth': [None] + list(range(3, 11)),
    'model__min_samples_split': randint(2, 20),
    'model__min_samples_leaf': randint(1, 20),
    'model__max_features': ['sqrt', 'log2', None]
}

In [39]:
models['GradientBoostingClassifier'] = {}
models['GradientBoostingClassifier']['pipeline'] = gradient_boosting_pipeline
models['GradientBoostingClassifier']['hyperparams'] = hyperparams
models['GradientBoostingClassifier']['param_search'] = param_search_space

### HistGradientBoostingClassifier

In [42]:
for key in param_search_space.keys():
    if 'model__' in key:
        print(key.replace('model__', ''))

learning_rate
l2_regularization
max_iter
max_leaf_nodes
max_depth
min_samples_split
min_samples_leaf
max_features


In [61]:
from sklearn.ensemble import HistGradientBoostingClassifier
hyperparams = {
    'n_features': 8,
    'learning_rate': 1,
    'l2_regularization' : 0,
    'max_iter' : 100,
    'max_depth' : None,
    'min_samples_leaf' : 20,
    'max_features' : 1
}

def hist_gradient_boosting_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')

    def to_dense(X): # FeatureHasher gets sparse matrices by default
        return X.toarray() if hasattr(X, "toarray") else X
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False) #This avoids sparce matrices when encoding
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') 
    
    model = HistGradientBoostingClassifier(
        learning_rate=params['learning_rate'], 
        l2_regularization=params['l2_regularization'], 
        max_iter=params['max_iter'], 
        max_depth=params['max_depth'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([
            ('to_dict', dict_tf), 
            ('hasher', hasher), 
            ('dense', FunctionTransformer(to_dense, validate=False))  
        ]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [62]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__learning_rate': loguniform(1e-3, 1e-1),
    'model__l2_regularization' : loguniform(1e-4, 1e1),
    'model__max_iter' : randint(100, 1000),
    'model__max_leaf_nodes' : randint(10, 100),
    'model__max_depth': [None] + list(range(3, 11)),
    'model__min_samples_leaf': randint(1, 20),
    'model__max_features' : uniform(0, 1),
}

In [63]:
models['HistGradientBoostingClassifier'] = {}
models['HistGradientBoostingClassifier']['pipeline'] = hist_gradient_boosting_pipeline
models['HistGradientBoostingClassifier']['hyperparams'] = hyperparams
models['HistGradientBoostingClassifier']['param_search'] = param_search_space

### XGBClassifier

In [76]:
from xgboost import XGBClassifier
hyperparams = {
    'n_features': 8,
    'n_estimators': 100,
    'learning_rate': 0.3,
    'subsample': 1.0, 
    'colsample_bytree': 1.0, 
    'max_depth': 6, 
    'gamma': 0.0, 
    'reg_alpha': 0.0, 
    'reg_lambda': 1.0 
}

def xgb_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')

    def to_dense(X): # FeatureHasher gets sparse matrices by default
        return X.toarray() if hasattr(X, "toarray") else X
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False) #This avoids sparce matrices when encoding
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') 
    
    model = XGBClassifier(
        n_estimators=params['n_estimators'], 
        learning_rate=params['learning_rate'], 
        subsample=params['subsample'], 
        colsample_bytree=params['colsample_bytree'], 
        max_depth=params['max_depth'], 
        gamma=params['gamma'],
        reg_alpha=params['reg_alpha'],
        reg_lambda=params['reg_lambda'],
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([
            ('to_dict', dict_tf), 
            ('hasher', hasher), 
            ('dense', FunctionTransformer(to_dense, validate=False))  
        ]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [77]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__learning_rate': loguniform(1e-3, 0.3),
    'model__n_estimators' : randint(100, 1000),
    'model__max_depth': [None] + list(range(3, 10)),
    'model__subsample' : uniform(0.5, 0.5),  # rango [0.5, 1.0]
    'model__colsample_bytree' : uniform(0.5, 0.5),  # rango [0.5, 1.0]
    'model__gamma' : loguniform(1e-8, 1.0),
    'model__reg_alpha' : loguniform(1e-8, 10.0),
    'model__reg_lambda' : loguniform(1e-8, 10.0),
}

In [78]:
models['XGBClassifier'] = {}
models['XGBClassifier']['pipeline'] = xgb_pipeline
models['XGBClassifier']['hyperparams'] = hyperparams
models['XGBClassifier']['param_search'] = param_search_space

## Bayes

In [90]:
#Priors for Titanic
#Empiric count
n_total = len(y)
n_survived = (y == 1).sum()
n_died     = (y == 0).sum()
#Smoothening parameters
alpha = 1.0
n_classes = 2
#P(Clase) = (n_clase + α) / (n_total + α * n_clases), we avoid that any rare class falls to cero by adding 1. This is not the case, but it is a good practice
priors_smoothed = [
    (n_died     + alpha) / (n_total + alpha * n_classes),  #P(Y=0)
    (n_survived + alpha) / (n_total + alpha * n_classes)   #P(Y=1)
]

### GaussianNB 

In [None]:
for key in param_search_space.keys():
    if 'model__' in key:
        print(key.replace('model__', ''))

In [99]:
from sklearn.naive_bayes import GaussianNB

hyperparams = {
    'n_features': 8,
    'quantile_range': (25.0, 75.0),
    'priors' : None,
    'var_smoothing' : 1e-09
}

def gnb_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')

    def to_dense(X): # FeatureHasher gets sparse matrices by default
        return X.toarray() if hasattr(X, "toarray") else X
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler(quantile_range=params['quantile_range'])
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = GaussianNB(
        priors=params['priors'],
        var_smoothing=params['var_smoothing']
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([
            ('to_dict', dict_tf), 
            ('hasher', hasher), 
            ('dense', FunctionTransformer(to_dense, validate=False))  
        ]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [100]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__priors': [None, priors_smoothed],
    'model__var_smoothing': loguniform(1e-12, 1e-6)
}

In [101]:
models['GaussianNB'] = {}
models['GaussianNB']['pipeline'] = gnb_classifier_pipeline
models['GaussianNB']['hyperparams'] = hyperparams
models['GaussianNB']['param_search'] = param_search_space

### CategoricalNB

In [None]:
HyperparameterSearch('GaussianNB', gnb_classifier_pipeline(hyperparams), param_search_space)

### BernoulliNB

# Hyperparameter search and benchmarking

In [8]:
def HyperparameterSearch(name, pipeline, param_distributions, verbose=True):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=107)
    
    rand_search = RandomizedSearchCV(
        estimator=pipeline,
        error_score='raise',
        param_distributions=param_distributions,
        n_iter=100,               
        scoring='accuracy',
        cv=cv,
        verbose=2,
        random_state=107,
        n_jobs=-1
    )
    
    rand_search.fit(X, y)

    if verbose:
        print(f'Best hyperparameters for {name}:\n{rand_search.best_params_}')
        print(f'Best accuracy for {name}: {rand_search.best_score_:.3f}')
    
    return rand_search

In [32]:
table = PrettyTable()
table.field_names = ['Model', 'Training time', 'Accuracy']

for model in models:
    start = time.time()
    pipeline = models[model]['pipeline']
    hyperparams = models[model]['hyperparams']
    param_search_space = models[model]['param_search']
    search_results = HyperparameterSearch(model, 
                                          pipeline(hyperparams), 
                                          param_search_space, 
                                          verbose=False)
    stop = time.time()
    best_model = search_results.best_estimator_
    y_pred = best_model.predict(X)
    table.add_row([model, f'{stop - start:.4f} [s]', f'{accuracy_score(y, y_pred):.4f}'])
    models[model]['best_params'] = search_results.best_params_

print(table)

Fitting 5 folds for each of 100 candidates, totalling 500 fits




Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
+-----------------------------+---------------+----------+
|            Model            | Training time | Accuracy |
+-----------------------------+---------------+----------+
|      LogisticRegression     |   6.9080 [s]  |  0.8373  |
|       RidgeClassifier       |  10.3780 [s]  |  0.8361  |
| PassiveAggressiveClassifier |   5.2185 [s]  |  0.8025  |
|        SGDClassifier        |   5.9230 [s]  |  0.8227  |
+-----------------------------+---------------+----------+


# Cross Validation

In [None]:
model = 'LogisticRegression'
pipeline = models[model]['pipeline']
hyperparams = models[model]['hyperparams']

best_hyperparams = hyperparams.copy()
for hyperparam, value in models[model]['best_params'].items():
    if 'model__' in hyperparam:
        best_hyperparams[hyperparam.replace('model__', '')] = value.item() if type(value) == np.float64 else value
    elif 'preproc__' in hyperparam:
        if 'hash' in hyperparam:
            best_hyperparams['n_features'] = value
        elif 'robust' in hyperparam:
            best_hyperparams['quantile_range'] = value
print(hyperparams)
print(best_hyperparams)

In [56]:
start = time.time()
scores = cross_val_score(
    estimator=pipeline(best_hyperparams),
    X=X, 
    y=y,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
stop = time.time()
seconds_elapsed = stop - start
time_format = str(dt.timedelta(seconds=int(seconds_elapsed)))
    
print(f'Training time: {time_format}')
print(f'Accuracy CV by fold: {scores}')
print(f'Accuracy mean CV: {scores.mean():.4f} ± {scores.std():.4f}')

Training time: 0:00:00
Accuracy CV by fold: [0.82122905 0.83146067 0.79213483 0.80898876 0.87078652]
Accuracy mean CV: 0.8249 ± 0.0264


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=107,
                                                    stratify=y)