In [107]:
import pandas as pd
import datetime as dp
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from category_encoders import HashingEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
import warnings
warnings.filterwarnings(
    "ignore",
    message="This Pipeline instance is not fitted yet.*",
    category=FutureWarning
)

In [2]:
dataset_path = 'E:/Datasets/titanic/wrangled dataset'

In [3]:
train_w = pd.read_csv(f'{dataset_path}/train.csv')
test_w = pd.read_csv(f'{dataset_path}/test.csv')

In [4]:
train_w.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family', 'Title',
       'Deck', 'TicketPrefix'],
      dtype='object')

In [91]:
X = train_w.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)
y = train_w['Survived']

In [44]:
models = {}

In [93]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family,Title,Deck,TicketPrefix
0,3,0,22.0,1,0,7.25,S,1,Mr,u,A/
1,1,1,38.0,1,0,71.2833,C,1,Mrs,C,PC
2,3,1,26.0,0,0,7.925,S,0,Miss,u,STON/O
3,1,1,35.0,1,0,53.1,S,1,Mrs,C,NONE
4,3,0,35.0,0,0,8.05,S,0,Mr,u,NONE


# Pipelines

## Linear Models

In [None]:
# TODO PCA

### LogisticRegression

In [94]:
from sklearn.linear_model import LogisticRegression

hyperparams = {
    'n_features': 8,
    'C' : 1,
    'penalty' : 'l2',
    'solver' : 'lbfgs',
    'l1_ratio' : None,
    'class_weight' : None
}

def logistic_regression_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler()
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = LogisticRegression(
        C=params['C'],
        penalty=params['penalty'],
        solver=params['solver'],
        l1_ratio=params['l1_ratio'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])    

In [86]:
#Divide the search space to avoid inconsistencies
param_search_space = [
    #Liblinear only supports 'l1' and 'l2' penalties
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l1'],
        'model__solver': ['liblinear'],
        'model__l1_ratio': uniform(0, 1),
        'model__class_weight': [None, 'balanced']
    }, 
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l2'],
        'model__solver': ['liblinear'],
        'model__class_weight': [None, 'balanced']
    },    
    #The rest of solvers doesn't support 'l1'
    {
        'preproc__hash__hasher__n_features' : randint(4, 65),
        'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
        'model__C': loguniform(1e-4, 1e4),
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs', 'newton-cg', 'newton-cholesky'],
        'model__class_weight': [None, 'balanced']
    }
]

In [None]:
models['LogisticRegression'] = {}
models['LogisticRegression']['pipeline'] = logistic_regression_pipeline
models['LogisticRegression']['hyperparams'] = hyperparams
models['LogisticRegression']['param_search'] = param_search_space

### RidgeClassifier 

In [97]:
from sklearn.linear_model import RidgeClassifier

hyperparams = {
    'n_features': 8,
    'alpha' : 1,
    'solver' : 'auto',
    'fit_intercept' : True,
    'class_weight' : None,
}

def ridge_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler()
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = RidgeClassifier(
        alpha=params['alpha'],
        solver=params['solver'],
        fit_intercept=params['fit_intercept'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])    

In [105]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__alpha': loguniform(1e-4, 1e4), 
    'model__solver': ['auto', 'lsqr', 'sparse_cg', 'sag'],
    'model__fit_intercept': [True, False],
    'model__class_weight': [None, 'balanced']
}

In [None]:
models['RidgeClassifier'] = {}
models['RidgeClassifier']['pipeline'] = ridge_classifier_pipeline
models['RidgeClassifier']['hyperparams'] = hyperparams
models['RidgeClassifier']['param_search'] = param_search_space

### PassiveAggressiveClassifier

In [112]:
from sklearn.linear_model import PassiveAggressiveClassifier

hyperparams = {
    'n_features': 8,
    'C' : 1,
    'fit_intercept' : True,
    'loss' : 'hinge',
    'average' : False,
    'class_weight' : None
}

def passive_aggressive_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler()
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = PassiveAggressiveClassifier(
        C=params['C'],
        fit_intercept=params['fit_intercept'],
        loss=params['loss'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [113]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__C': loguniform(1e-4, 1e4),
    'model__fit_intercept': [True, False],
    'model__loss': ['hinge', 'squared_hinge'],
    'model__class_weight': [None, 'balanced'],
    'model__average': [True, False]
}

In [114]:
models['PassiveAggressiveClassifier'] = {}
models['PassiveAggressiveClassifier']['pipeline'] = passive_aggressive_classifier_pipeline
models['PassiveAggressiveClassifier']['hyperparams'] = hyperparams
models['PassiveAggressiveClassifier']['param_search'] = param_search_space

### SGDClassifier 

In [117]:
for key in param_search_space.keys():
    if 'model__' in key:
        print(key.replace('model__', ''))

loss
penalty
alpha
l1_ratio
learning_rate
eta0
power_t
tol
class_weight
average


In [118]:
from sklearn.linear_model import SGDClassifier

hyperparams = {
    'n_features': 8,
    'loss' : 'hinge',
    'penalty' : 'l2',
    'alpha' : 0.0001,
    'l1_ratio' : 0.15,
    'learning_rate' : 'optimal',
    'eta0' : 0,
    'power_t' : 0.5,
    'average' : False,
    'class_weight' : None,
}

def sgd_classifier_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    std_cols = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    ss_tf = StandardScaler()
    rb_tf = RobustScaler()
    ohe_tf = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict')

    model = SGDClassifier(
        loss=params['loss'],
        penalty=params['penalty'],
        alpha=params['alpha'],
        l1_ratio=params['l1_ratio'],
        learning_rate=params['learning_rate'],
        eta0=params['eta0'],
        power_t=params['power_t'],
        average=params['average'],
        class_weight=params['class_weight'],
        random_state=107 
    )
    
    preprocessor = ColumnTransformer([
        ('standard', ss_tf, std_cols),
        ('robust', rb_tf, rob_cols),
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])  

In [119]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'preproc__robust__quantile_range': [(1.0,99.0), (5.0,95.0), (10.0,90.0), (25.0,75.0)],
    'model__loss':            ['hinge', 'log_loss', 'modified_huber', 'squared_hinge'], 
    'model__penalty':         ['l2', 'l1', 'elasticnet'], 
    'model__alpha':           loguniform(1e-6, 1e-1), 
    'model__l1_ratio':        uniform(0.0, 1.0), 
    'model__learning_rate':   ['optimal', 'invscaling', 'adaptive'], 
    'model__eta0':            loguniform(1e-4, 1e-1), 
    'model__power_t':         uniform(0.1, 0.9), 
    'model__class_weight':    [None, 'balanced'], 
    'model__average':         [True, False],
}

In [120]:
models['SGDClassifier'] = {}
models['SGDClassifier']['pipeline'] = sgd_classifier_pipeline
models['SGDClassifier']['hyperparams'] = hyperparams
models['SGDClassifier']['param_search'] = param_search_space

In [121]:
HyperparameterSearch('SGDClassifier', sgd_classifier_pipeline(hyperparams), param_search_space)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters for SGDClassifier:
{'model__alpha': np.float64(0.01316670892247693), 'model__average': False, 'model__class_weight': 'balanced', 'model__eta0': np.float64(0.05865221789706591), 'model__l1_ratio': np.float64(0.6280679266629924), 'model__learning_rate': 'adaptive', 'model__loss': 'log_loss', 'model__penalty': 'l2', 'model__power_t': np.float64(0.24572408959550807), 'preproc__hash__hasher__n_features': 43, 'preproc__robust__quantile_range': (1.0, 99.0)}
Best accuracy for SGDClassifier: 0.813


## Nearest Neighbors

### KNeighborsClassifier

## Tree-based models

### DecisionTreeClassifier

### RandomForestClassifier

This model is not affected too much by scaling, so I am going to leave untouch the numerical values.

In [48]:
from sklearn.ensemble import RandomForestClassifier
hyperparams = {
    'n_features': 8,
    'n_estimators' : 100,
    'criterion' : 'gini',
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 5,
    'max_features' : 'sqrt',
    'bootstrap' : True,
    'n_jobs' : None
}

def random_forest_pipeline(params):
    def df_to_dicts(X):
        return X[hash_cols].astype(str).to_dict(orient='records')
        
    hash_cols   = ['Title', 'Deck', 'TicketPrefix']
    cat_cols    = ['Embarked']
    ohe_tf     = OneHotEncoder(handle_unknown='ignore', drop='first')
    dict_tf = FunctionTransformer(df_to_dicts, validate=False)
    hasher = FeatureHasher(n_features=params['n_features'], input_type='dict') #Replaces Hashing encoder
    #hashing_tf = HashingEncoder(n_components=params['n_components'], cols=hash_cols)
    
    model = RandomForestClassifier(
        n_estimators=params['n_estimators'], 
        criterion=params['criterion'], 
        max_depth=params['max_depth'], 
        min_samples_split=params['min_samples_split'], 
        min_samples_leaf=params['min_samples_leaf'], 
        max_features=params['max_features'], 
        bootstrap=params['bootstrap'], 
        random_state=107
    )
    
    preprocessor = ColumnTransformer([
        ('hash',  Pipeline([('to_dict', dict_tf), ('hasher', hasher)]), hash_cols),
        ('ohe',    ohe_tf,    cat_cols),
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')
    
    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [50]:
param_search_space = {
    'preproc__hash__hasher__n_features' : randint(4, 65),
    'model__n_estimators': randint(100, 501),
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [None] + list(range(5, 51, 5)),
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 11),
    'model__max_features': ['auto', 'sqrt', 'log2', 0.2, 0.5],
    'model__bootstrap': [True, False],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    'model__ccp_alpha': uniform(0.0, 0.01)
}


### ExtraTreesClassifier

## Boosting

In [5]:
X = train_w.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)
#X_test = test_w.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
y = train_w['Survived']

### GradientBoostingClassifier

### HistGradientBoostingClassifier

### AdaBoostClassifier

## Bayes

### GaussianNB 

### CategoricalNB

### BernoulliNB

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Mejores parámetros:
 {'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100, 'preproc__hash__hasher__n_features': 16}
Mejor accuracy en CV: 0.789




# Hyperparameter search

In [88]:
def HyperparameterSearch(name, pipeline, param_distributions):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=107)
    
    rand_search = RandomizedSearchCV(
        estimator=pipeline,
        error_score='raise',
        param_distributions=param_distributions,
        n_iter=100,               
        scoring='accuracy',
        cv=cv,
        verbose=2,
        random_state=107,
        n_jobs=-1
    )
    
    rand_search.fit(X, y)
    
    print(f'Best hyperparameters for {name}:\n{rand_search.best_params_}')
    print(f'Best accuracy for {name}: {rand_search.best_score_:.3f}')
    
    best_hyperparams = hyperparams.copy()
    for hyperparam, value in rand_search.best_params_.items():
        if 'model__' in hyperparam:
            best_hyperparams[hyperparam.replace('model__', '')] = value
        elif 'preproc__' in hyperparam:
            best_hyperparams['n_features'] = value
    
    best_model = rand_search.best_estimator_
    y_pred = best_model.predict(X)

In [100]:
HyperparameterSearch('RidgeClassifier', ridge_classifier_pipeline(hyperparams), param_search_space)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


ValueError: solver='cholesky' does not support fitting the intercept on sparse data. Please set the solver to 'auto' or 'lsqr', 'sparse_cg', 'sag', 'lbfgs' or set `fit_intercept=False`

In [None]:
pipe.fit(X_train, y_test)
y_pred = pipe.predict(X_train)

In [None]:
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy en test: {acc:.4f}")

In [18]:
scores = cross_val_score(
    estimator=pipe,         # tu pipeline (sin grid search)
    X=X, 
    y=y,
    cv=5,                   # número de pliegues
    scoring='accuracy',     # métrica
    n_jobs=-1
)

print("Accuracy CV por pliegue:", scores)
print("Media de accuracy CV: {:.4f} ± {:.4f}".format(scores.mean(), scores.std()))

Accuracy CV por pliegue: [0.75977654 0.79775281 0.79775281 0.7247191  0.82022472]
Media de accuracy CV: 0.7800 ± 0.0338


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=107,
                                                    stratify=y)