In [45]:
import datetime as dp
import os

import numpy as np
import pandas as pd
from category_encoders import HashingEncoder, TargetEncoder
from scipy.stats import randint, uniform, loguniform
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler, StandardScaler

In [2]:
# Directory holding the wrangled Titanic CSVs. Set the TITANIC_DATA_DIR
# environment variable to point somewhere else; the original local path is
# kept as the fallback so existing setups keep working.
dataset_path = os.environ.get('TITANIC_DATA_DIR', 'E:/Datasets/titanic/wrangled dataset')

In [3]:
# Read the two wrangled splits from `dataset_path`.
train_csv = f'{dataset_path}/train.csv'
test_csv = f'{dataset_path}/test.csv'
train_w = pd.read_csv(train_csv)
test_w = pd.read_csv(test_csv)

In [4]:
# Show which columns the wrangled training frame provides.
train_w.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family', 'Title',
       'Deck', 'TicketPrefix'],
      dtype='object')

In [44]:
# Registry of candidate models, filled in by the sections below:
# model name -> {'pipeline': pipeline factory, 'param_search': search space}.
models = {}

# Pipelines

## Linear Models

In [None]:
# TODO PCA

### LogisticRegression

In [None]:
# Default construction-time settings for `logistic_regression_pipeline`.
hyperparams = {
    'n_features': 8,
}

def logistic_regression_pipeline(params):
    """Build the preprocessing + LogisticRegression pipeline.

    Parameters
    ----------
    params : dict
        Construction-time settings. Only ``'n_features'`` (width of the hashed
        feature block) is read; it falls back to 8 when the key is absent.

    Returns
    -------
    Pipeline
        Steps ``'preproc'`` (a ColumnTransformer) and ``'model'`` (a
        LogisticRegression). The step and transformer names are referenced by
        search-space keys such as ``preproc__hash__hasher__n_features`` and
        ``preproc__robust__quantile_range``, so they must not be renamed.
    """
    # NOTE(review): StandardScaler needs numeric input, so this assumes 'Sex'
    # is already numerically encoded in the wrangled data -- TODO confirm.
    std_cols = ['Sex', 'Age']
    ord_cols = ['Pclass', 'SibSp', 'Parch', 'Family']
    rob_cols = ['Fare']
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    def df_to_dicts(X):
        # FeatureHasher(input_type='dict') expects one {column: value} mapping per row.
        return X[hash_cols].astype(str).to_dict(orient='records')

    hash_branch = Pipeline([
        ('to_dict', FunctionTransformer(df_to_dicts, validate=False)),
        ('hasher', FeatureHasher(n_features=params.get('n_features', 8), input_type='dict')),
    ])

    # Every feature column is routed through exactly one transformer. There is
    # deliberately no catch-all numeric passthrough: it would append unscaled
    # copies of the columns that are already scaled/encoded here.
    preprocessor = ColumnTransformer([
        ('standard', StandardScaler(), std_cols),
        # Redundant for already-ordered integer columns, but it keeps their order.
        # Values unseen during fit (rare counts landing only in a validation
        # fold) are mapped to -1 instead of raising.
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ord_cols),
        ('robust', RobustScaler(), rob_cols),
        ('hash', hash_branch, hash_cols),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
    ], remainder='drop')

    # Fixed seed so stochastic solvers (e.g. 'saga') give repeatable results.
    model = LogisticRegression(random_state=107)

    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [None]:
# Randomized-search space for the logistic-regression pipeline.
# Keys follow scikit-learn's `<step>__<param>` convention and therefore depend
# on the step names used in `logistic_regression_pipeline`.
logistic_regression_param_search_space = {
    'preproc__hash__hasher__n_features': randint(4, 65),  # integers 4..64
    'preproc__robust__quantile_range': [(1.0, 99.0), (5.0, 95.0), (10.0, 90.0), (25.0, 75.0)],
    'model__C': loguniform(1e-4, 1e4),
    # The 'none' string is rejected by recent scikit-learn releases; a very
    # large C (upper end of the range above) already approximates an
    # unpenalized fit, so that option is dropped.
    'model__penalty': ['l1', 'l2', 'elasticnet'],
    'model__solver': ['saga'],  # the only solver supporting all three penalties
    'model__l1_ratio': uniform(0, 1),  # only used when penalty='elasticnet'
    # 'balanced_subsample' exists only for forest estimators; LogisticRegression
    # rejects it, which would turn those candidates into failed fits.
    'model__class_weight': [None, 'balanced'],
    # `multi_class` is intentionally not searched: the target is binary and the
    # parameter is deprecated in recent scikit-learn releases.
    'model__tol': loguniform(1e-5, 1e-1),
    'model__max_iter': [100, 200, 500],
}

In [None]:
# Register the logistic-regression factory together with its search space.
models['LogisticRegression'] = {
    'pipeline': logistic_regression_pipeline,
    'param_search': logistic_regression_param_search_space,
}

### RidgeClassifier 

### PassiveAggressiveClassifier

### SGDClassifier 

## Nearest Neighbors

### KNeighborsClassifier

## Tree-based models

### DecisionTreeClassifier

### RandomForestClassifier

This model is not much affected by feature scaling, so I am going to leave the numerical values untouched.

In [40]:
# Default construction-time settings for `random_forest_pipeline`.
hyperparams = {
    'n_features': 8,
    'n_estimators' : 100,
    'criterion' : 'gini',
    'max_depth' : None,
    'min_samples_split' : 2,
    'min_samples_leaf' : 5,
    'max_features' : 'sqrt',
    'bootstrap' : True,
    'n_jobs' : None
}

def random_forest_pipeline(params):
    """Build the preprocessing + RandomForestClassifier pipeline.

    Parameters
    ----------
    params : dict
        Must provide ``'n_features'`` (width of the hashed feature block) and
        the forest settings ``'n_estimators'``, ``'criterion'``,
        ``'max_depth'``, ``'min_samples_split'``, ``'min_samples_leaf'``,
        ``'max_features'`` and ``'bootstrap'``. ``'n_jobs'`` is optional and
        defaults to ``None``.

    Returns
    -------
    Pipeline
        Steps ``'preproc'`` (a ColumnTransformer) and ``'model'`` (a
        RandomForestClassifier seeded with ``random_state=107``). The step and
        transformer names are referenced by search-space keys such as
        ``preproc__hash__hasher__n_features``, so they must not be renamed.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``params``.
    """
    hash_cols = ['Title', 'Deck', 'TicketPrefix']
    cat_cols = ['Embarked']

    def df_to_dicts(X):
        # FeatureHasher(input_type='dict') expects one {column: value} mapping per row.
        return X[hash_cols].astype(str).to_dict(orient='records')

    # FeatureHasher is used here in place of category_encoders' HashingEncoder.
    hash_branch = Pipeline([
        ('to_dict', FunctionTransformer(df_to_dicts, validate=False)),
        ('hasher', FeatureHasher(n_features=params['n_features'], input_type='dict')),
    ])

    preprocessor = ColumnTransformer([
        ('hash', hash_branch, hash_cols),
        ('ohe', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
        # Numeric columns are forwarded unscaled.
        ('num_passthrough', 'passthrough', make_column_selector(dtype_include=np.number)),
    ], remainder='drop')

    model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        criterion=params['criterion'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        bootstrap=params['bootstrap'],
        # Previously declared in `hyperparams` but never forwarded to the model.
        n_jobs=params.get('n_jobs'),
        random_state=107,
    )

    return Pipeline(steps=[('preproc', preprocessor), ('model', model)])

In [43]:
# Randomized-search space for the random-forest pipeline.
# Keys follow scikit-learn's `<step>__<param>` convention and therefore depend
# on the step names used in `random_forest_pipeline`.
random_forest_param_search_space = {
    'preproc__hash__hasher__n_features': randint(4, 65),  # integers 4..64
    'model__n_estimators': randint(100, 501),             # integers 100..500
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [None] + list(range(5, 51, 5)),
    'model__min_samples_split': randint(2, 21),
    'model__min_samples_leaf': randint(1, 11),
    # 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted
    # by recent scikit-learn releases, so it is left out.
    'model__max_features': ['sqrt', 'log2', 0.2, 0.5],
    'model__bootstrap': [True, False],
    'model__class_weight': [None, 'balanced', 'balanced_subsample'],
    'model__ccp_alpha': uniform(0.0, 0.01),
}


### ExtraTreesClassifier

## Boosting

In [5]:
# Split the training frame into the target and the predictor columns.
non_feature_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
y = train_w['Survived']
X = train_w.drop(columns=non_feature_cols)
# The test split is not prepared here; it would drop the same columns except 'Survived'.

### GradientBoostingClassifier

### HistGradientBoostingClassifier

### AdaBoostClassifier

## Bayes

### GaussianNB 

### CategoricalNB

### BernoulliNB

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Mejores parámetros:
 {'model__max_depth': 5, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100, 'preproc__hash__hasher__n_features': 16}
Mejor accuracy en CV: 0.789




# Hyperparameter search

In [None]:
def HyperparameterSearch(name, pipeline, param_distributions, n_iter=100):
    """Run a randomized hyperparameter search and return its results.

    The search is fitted on the notebook-level ``X`` and ``y`` and the merged
    hyperparameters start from a copy of the notebook-level ``hyperparams``
    dict, so those three names must be defined before calling this function.

    Parameters
    ----------
    name : str
        Label used in the printed summary.
    pipeline : estimator
        Estimator instance handed to ``RandomizedSearchCV`` (an already built
        pipeline, not a pipeline factory function).
    param_distributions : dict
        Search space keyed by ``<step>__<param>`` names.
    n_iter : int, default=100
        Number of sampled candidates.

    Returns
    -------
    dict
        ``'search'`` (the fitted RandomizedSearchCV), ``'best_model'`` (its
        ``best_estimator_``), ``'best_score'`` (its ``best_score_``) and
        ``'best_hyperparams'`` (defaults merged with the best parameters,
        keyed by bare parameter name).
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=107)

    rand_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='accuracy',
        cv=cv,
        verbose=2,
        random_state=107,
        n_jobs=-1
    )

    rand_search.fit(X, y)

    print(f'Best hyperparameters for {name}:\n{rand_search.best_params_}')
    print(f'Best accuracy for {name}: {rand_search.best_score_:.3f}')

    best_hyperparams = hyperparams.copy()
    for hyperparam, value in rand_search.best_params_.items():
        # Keep only the final component of the `<step>__...__<param>` path, e.g.
        # 'model__max_depth' -> 'max_depth' and
        # 'preproc__hash__hasher__n_features' -> 'n_features'. Each preprocessing
        # parameter keeps its own key instead of overwriting 'n_features'.
        best_hyperparams[hyperparam.rsplit('__', 1)[-1]] = value

    return {
        'search': rand_search,
        'best_model': rand_search.best_estimator_,
        'best_score': rand_search.best_score_,
        'best_hyperparams': best_hyperparams,
    }

In [None]:
# Fit on the training split and predict the held-out split, so that `y_pred`
# lines up with `y_test` when the accuracy is computed.
# NOTE: `pipe` (a built pipeline) and the X_train/X_test/y_train/y_test split
# must already exist; the split cell sits at the end of this notebook and has
# to be run first.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [None]:
# Fraction of held-out labels matched by the predictions.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy en test: {acc:.4f}")

In [18]:
# 5-fold cross-validated accuracy of the plain pipeline (no hyperparameter search).
cv_settings = dict(cv=5, scoring='accuracy', n_jobs=-1)
scores = cross_val_score(pipe, X, y, **cv_settings)

# Per-fold scores first, then their mean and standard deviation.
mean_acc, std_acc = scores.mean(), scores.std()
print("Accuracy CV por pliegue:", scores)
print("Media de accuracy CV: {:.4f} ± {:.4f}".format(mean_acc, std_acc))

Accuracy CV por pliegue: [0.75977654 0.79775281 0.79775281 0.7247191  0.82022472]
Media de accuracy CV: 0.7800 ± 0.0338


In [None]:
# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
split_options = {'test_size': 0.2, 'random_state': 107, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)