<font color='#bb1c2a' size=5>
    <h2>
        <b>
$Why$ you should use sklearn for preprocessing and $how$ you should do it
            </b>
    </h2>
</font>
    

In [None]:
from collections import Counter
import random
import sys
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import mlflow
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config
from sklearn.compose import ColumnTransformer, make_column_transformer
from IPython.display import Image, HTML
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier



# Global experiment settings: half of the rows are held out for testing and
# both random number generators are seeded with the same value.
SEED = 0
TEST_SIZE = 0.5
np.random.seed(SEED)
random.seed(SEED)

# Target column and the raw feature columns used throughout the notebook
label = 'survived'
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'deck']

# Load dataset
ds = sns.load_dataset('titanic')
ds

In [None]:
ds[features + [label]].info()

<font color='#263d4d' size=3>
    <h1>
0.$\;$Preprocessing with pandas

In [None]:
# Work on an explicit copy so the edits below never touch `ds` and do not rely on
# the global warnings filter to hide a SettingWithCopyWarning.
X = ds[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].copy()
y = ds[['survived']]

# Encode sex as 0 and 1 (male -> 1, anything else -> 0)
X['sex'] = (X['sex'] == 'male').astype(np.int8)
# Impute age's nan values with the mean (computed over all rows, before the split)
X['age'] = X['age'].fillna(X['age'].mean())

# Evaluate model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# The AUC is computed from the hard class predictions, not from probabilities
roc_auc_score(y_test, y_pred)

<font color='#1e78fa' size=1>
    <h1>
Now we want to use the $embark\_town$ feature, which is categorical. <br>
The most convenient way to encode it is sklearn's OneHotEncoder.

<font color='#1e78fa' size=1>
    <h1>
Important notes about OneHotEncoder:<br>
    1. You can encode nan values with one extra column. <br>
    2. You can raise an error in case the OneHotEncoder faces a new value which didn't appear in the fit. <br>
    3. You can drop one column to avoid collinearity (for example, it is a must in the OLS solution for LinearRegression)

In [None]:
Counter(ds['embark_town'])

In [None]:
# Tiny demo frame: column 'a' holds one missing value, column 'b' is complete
df = pd.DataFrame({'a': [1, 3, np.nan], 'b': [2, 4, 9]})
df

In [None]:
# 1 - nan is encoded with one extra column
# The `sparse=False` keyword was removed from OneHotEncoder in scikit-learn 1.4.
# Keeping the default output and densifying it with .toarray() works on old and
# new versions alike.
o = OneHotEncoder()
print(o.fit_transform(df[['a']]).toarray())
o.categories_

In [None]:
# 2 - a value that was not seen during fit raises an error at transform time
# (`sparse=False` was removed in scikit-learn 1.4, so the dense view is taken
# with .toarray() instead)
o = OneHotEncoder(handle_unknown='error')
# Fit on the first two rows only (.loc slicing is inclusive) ...
print(o.fit_transform(df.loc[:1, ['a']]).toarray())
try:
    # ... then transform the remaining row, whose value (nan) was never seen
    o.transform(df.loc[[2], ['a']])
except ValueError as e:
    print(f'Error: {e}')

In [None]:
# 3 - drop the first category's column to avoid collinearity
# (`sparse=False` was removed in scikit-learn 1.4, so the dense view is taken
# with .toarray() instead)
o = OneHotEncoder(drop='first')
print(o.fit_transform(df[['a']]).toarray())
o.categories_

<font color='#1e78fa' size=1>
    <h1>
Let's plug the OneHotEncoder into our model

In [None]:
# Work on an explicit copy so the edits below never touch `ds`
X = ds[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embark_town']].copy()
y = ds[['survived']]

# Encode sex as 0 and 1 (male -> 1, anything else -> 0)
X['sex'] = (X['sex'] == 'male').astype(np.int8)
# Impute age's nan values with the mean (computed over all rows, before the split)
X['age'] = X['age'].fillna(X['age'].mean())

# Encode embark_town with OneHotEncoder
# `sparse=False` was removed in scikit-learn 1.4; the default output densified
# with .toarray() works on every version.
ohe = OneHotEncoder()
embark_encoded = ohe.fit_transform(X[['embark_town']]).toarray()
# One new column per learned category. The names are cast to str because a
# missing value shows up as a nan category, and recent scikit-learn versions
# refuse frames whose column names mix strings with other types.
embark_columns = [str(category) for category in ohe.categories_[0]]
X[embark_columns] = embark_encoded
X = X.drop(columns=['embark_town'])

# Evaluate model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

<font color='#263d4d' size=3>
    <h1>
1.$\;$Regular preprocessing using sklearn and pandas

<font color='#1e78fa' size=2>
    <h1>
What's the problem with the following code?

In [None]:
# NOTE: this cell is the deliberately flawed example that the question above
# refers to. The flaw itself is left in place; only version and ordering
# problems are fixed here.
X, y = ds[features].copy(), ds[['survived']]

# Encode sex as 0 and 1 (male -> 1, anything else -> 0)
X['sex'] = (X['sex'] == 'male').astype(np.int8)

# Standardize the features
si_age = SimpleImputer(strategy='mean')
si_deck = SimpleImputer(strategy='most_frequent')
# `sparse=False` was removed in scikit-learn 1.4; .toarray() is used below instead
ohe_deck = OneHotEncoder()
ss = StandardScaler()

# Every column except 'deck', in frame order. A set difference has no guaranteed
# order, so ss.mean_ / ss.scale_ could be listed differently on each kernel start.
scaled_features = [col for col in X.columns if col != 'deck']
# The imputers return (n, 1) arrays; ravel() turns them into plain columns
X['age'] = si_age.fit_transform(X[['age']]).ravel()
X['deck'] = si_deck.fit_transform(X[['deck']]).ravel()
# The right-hand side is evaluated first, so categories_ exists when the
# column names on the left are looked up.
X[ohe_deck.categories_[0]] = ohe_deck.fit_transform(X[['deck']]).toarray()
X = X.drop('deck', axis=1)
X[scaled_features] = ss.fit_transform(X[scaled_features])

# Evaluate model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

In [None]:
si_age.statistics_, si_deck.statistics_, ohe_deck.categories_, (ss.mean_, ss.scale_)

<font color='#1e78fa' size=2>
    <h1>
A possible fix up

In [None]:
X, y = ds[features].copy(), ds[['survived']]

# Encode sex as 0 and 1 (a fixed row-wise mapping, nothing is learned from the data)
X['sex'] = (X['sex'] == 'male').astype(np.int8)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
# Explicit copies: both frames are modified column by column below
X_train, X_test = X_train.copy(), X_test.copy()

# Columns to standardize. Defined in this cell so that it no longer depends on
# a variable left behind by the previous cell.
scaled_features = [col for col in X.columns if col != 'deck']

# Standardize the features
si_age = SimpleImputer(strategy='mean')
si_deck = SimpleImputer(strategy='most_frequent')
# `sparse=False` was removed in scikit-learn 1.4; .toarray() is used below instead
ohe_deck = OneHotEncoder(handle_unknown='ignore')
ss = StandardScaler()

# Fit every transformer on the training rows only
# (the imputers return (n, 1) arrays; ravel() turns them into plain columns)
X_train['age'] = si_age.fit_transform(X_train[['age']]).ravel()
X_train['deck'] = si_deck.fit_transform(X_train[['deck']]).ravel()
# The right-hand side is evaluated first, so categories_ exists when the
# column names on the left are looked up.
X_train[ohe_deck.categories_[0]] = ohe_deck.fit_transform(X_train[['deck']]).toarray()
X_train = X_train.drop('deck', axis=1)
X_train[scaled_features] = ss.fit_transform(X_train[scaled_features])

# Evaluate model
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
# Test rows only go through transform(): nothing is re-fitted on them
X_test['age'] = si_age.transform(X_test[['age']]).ravel()
X_test['deck'] = si_deck.transform(X_test[['deck']]).ravel()
X_test[ohe_deck.categories_[0]] = ohe_deck.transform(X_test[['deck']]).toarray()
X_test = X_test.drop('deck', axis=1)
X_test[scaled_features] = ss.transform(X_test[scaled_features])

y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

In [None]:
si_age.statistics_, si_deck.statistics_, ohe_deck.categories_, (ss.mean_, ss.scale_)

In [None]:
Image('img/burst-pipe.jpeg', width=1000)

<font color='#1e78fa' size=3>
    <h1>
        <b>
As we can see, leaking information in the preprocessing step is easier than we thought!<br>
We should treat the test set as unseen data: we must never use it to impute missing values, scale, or normalize.

<font color='#263d4d' size=3>
    <h1>
2.$\;$Pipeline and ColumnTransformer to the rescue!

<font color='#1e78fa' size=2>
    <h1>
        Pipeline - <i>Sequentially apply a list of transforms and (you don't have to!) a final estimator</i>.<br>
        ColumnTransformer - <i>Applies transformers to columns of an array or pandas DataFrame.</i>
        <p></p>
    </h1>
    <h2>
Few notes about a pipeline:<br>
1. If your pipe's steps contain transforms only, you should use:<br>
        <b><p></p>
            <i>pipe.transform</i> / 
            <i>pipe.fit_transform</i>
        </b><br><p></p>
2. If your pipe's final step is an estimator, you should use:<br>
        <b><p></p>
            <i>pipe.fit</i> / 
            <i>pipe.predict</i><br>
            <i>(pipe.fit_predict is available only when the final estimator itself implements fit_predict)</i>
        </b>

In [None]:
X, y = ds[features + ['adult_male', 'alone']], ds[['survived']]

# Format: List of steps
# (`sparse=False` was removed from OneHotEncoder in scikit-learn 1.4; the dense
# output is requested on the ColumnTransformer below instead)
deck_pipe = Pipeline([('deck_si', SimpleImputer(strategy='most_frequent')),
                      ('deck_ohe', OneHotEncoder(handle_unknown='ignore'))])

# Format: The name of the transformation, the transformation itself, the columns we want to apply to
# Columns that are not listed pass through unchanged; sparse_threshold=0 makes
# the combined output always dense.
ct = ColumnTransformer([('sex_oe', OrdinalEncoder(handle_unknown='error'), ['sex']),
                        ('adult_male_oe', OrdinalEncoder(handle_unknown='error'), ['adult_male']),
                        ('alone_oe', OrdinalEncoder(handle_unknown='error'), ['alone']),
                        ('age_si', KNNImputer(n_neighbors=7), ['age']),
                        ('deck_transform', deck_pipe, ['deck'])],
                        remainder='passthrough',
                        sparse_threshold=0)
pipe = Pipeline([('preprocessing', ct), 
                 ('model', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
# fit() learns the preprocessing from the training rows only;
# predict() re-applies it to the test rows.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
roc_auc_score(y_test, y_pred)

In [None]:
Image('img/pipeline VS old.png', width=1000)

In [None]:
pipe

<font color='#1e78fa' size=5>
    <h1>
OR...

In [None]:
set_config(display='diagram')
pipe

<font color='#1e78fa' size=1>
    <h1>
You can easily argue I have a leakage in the code above, it seems like it has the same structure as the leaky one.<br>
Well, it doesn't, and I'll explain why.<br>
    </h1>
    </font>
    <font color='#03aaf9' size=1>
    <h1>
        First scenario, your last pipe step is an estimator:<br> 
        $\;\;\;\;\;$While calling pipe.fit(): <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* All the steps before the last one are actually calling step.fit_transform(). <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* The last step calls estimator.fit() only. <p></p>
        $\;\;\;\;\;$While calling pipe.predict(): <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* All the steps before the last one are actually calling step.transform(). <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* The last step calls estimator.predict() only. <p></p>
        Second scenario, all your steps are transforms:<br>
        $\;\;\;\;\;$While calling pipe.fit(): <br>
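        $\;\;\;\;\;$ $\;\;\;\;\;$* All the steps before the last one call step.fit_transform(); the last one calls step.fit() (use pipe.fit_transform() to get the transformed output as well). <p></p> 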
        $\;\;\;\;\;$While calling pipe.predict(): <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* <font color='red'>Wrong!</font> (you don't have an estimator, what you wanna predict?!?) <p></p>    
        $\;\;\;\;\;$While calling pipe.transform(): <br>
        $\;\;\;\;\;$ $\;\;\;\;\;$* All the steps call step.transform(). 
    </h1>
    </font>
    <p></p>
    <font color='#1e78fa' size=1>
    <h1>
    This way we guarantee no leakage.<br>
    It's also much cleaner, more powerful, and not to mention how beautiful it is... 

<font color='#fab666' size=1>
    <h1>
    -----------------------------------------------------------------------------------------------------------------

<font color='#1e78fa' size=1>
    <h1>
        How can I access the different steps of the pipe? <b>by name of course</b>

In [None]:
set_config(display='text')

In [None]:
pipe.named_steps

In [None]:
pipe.named_steps['preprocessing']

In [None]:
pipe.named_steps['preprocessing'].named_transformers_

In [None]:
pipe.named_steps['preprocessing'].named_transformers_['age_si']

<font color='#263d4d' size=3>
    <h1>
3.$\;$ Anonymous Pipelines and ColumnTransformer

<font color='#1e78fa' size=1>
    <h1>
        Although everyone has a name, not every pipeline needs one. <br>
        I find it clearer to use anonymous pipes when they are inner, and named pipes for the top-level ones<br>
        The same applies for ColumnTransformer.

In [None]:
X, y = ds[features + ['adult_male', 'alone']], ds[['survived']]

# Format: Anonymous steps
# (`sparse=False` was removed from OneHotEncoder in scikit-learn 1.4; the dense
# output is requested on the column transformer below instead)
deck_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

# Format: Anonymous transformations
# Columns that are not listed pass through unchanged; sparse_threshold=0 makes
# the combined output always dense.
ct = make_column_transformer((OrdinalEncoder(handle_unknown='error'), ['sex']),
                             (OrdinalEncoder(handle_unknown='error'), ['adult_male']),
                             (OrdinalEncoder(handle_unknown='error'), ['alone']),
                             (KNNImputer(n_neighbors=7), ['age']),
                             (deck_pipe, ['deck']),
                             remainder='passthrough',
                             sparse_threshold=0)
pipe = Pipeline([('preprocessing', ct), 
                 ('model', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
roc_auc_score(y_test, y_pred)

<p>

<font color='#1e78fa' size=4>
    <h1>
All right, but I want to make my model even stronger!

<font color='#263d4d' size=3>
    <h1>
4.$\;$Hyperparameter optimization time!

In [None]:
# Search space for the CatBoost model. The 'model__' prefix routes each entry
# to the pipeline step named 'model'.
cb_clf_params = {
    'model__learning_rate':     np.arange(0.05, 0.31, 0.05),
    # CatBoost supports a tree depth of at most 16. Deeper candidates fail to
    # fit and end up with a NaN score in the search, so the range stops at 16.
    'model__max_depth':         np.arange(5, 17, 1, dtype=int),
    'model__n_estimators':      [25, 100, 250],
    'model__l2_leaf_reg':       [0, 3, 5, 8, 12],
    
}
# NOTE(review): not referenced by any other cell visible in this notebook
cb_fit_params = {
    'model__early_stopping_rounds': 10,
    'model__verbose': False
}

In [None]:
K_FOLDS = 5          # number of cross-validation folds
N_COMBINATIONS = 5   # number of hyperparameter combinations sampled by the search
set_config(display='diagram')

X, y = ds[features + ['adult_male', 'alone']], ds[['survived']]

# Format: Anonymous steps
# (`sparse=False` was removed from OneHotEncoder in scikit-learn 1.4; the dense
# output is requested on the column transformer below instead)
deck_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

# Format: Anonymous transforms
# sparse_threshold=0 makes the combined output always dense
ct = make_column_transformer((OrdinalEncoder(handle_unknown='error'), ['sex']),
                             (OrdinalEncoder(handle_unknown='error'), ['adult_male']),
                             (OrdinalEncoder(handle_unknown='error'), ['alone']),
                             (KNNImputer(n_neighbors=7), ['age']),
                             (deck_pipe, ['deck']),
                             remainder='passthrough',
                             sparse_threshold=0)

pipe = Pipeline([('preprocessing', ct), 
                 ('model', CatBoostClassifier(verbose=False))])

# The whole pipeline is the estimator being searched, so the preprocessing is
# re-fitted on the training part of every fold.
grid = RandomizedSearchCV(pipe, cb_clf_params, cv=K_FOLDS, scoring='roc_auc', n_iter=N_COMBINATIONS, random_state=SEED)
grid.fit(X, y)
grid

<font color='#1e78fa' size=1>
    <h1>
        It's really useful to have a look at the test results for each set of hyperparameters.

In [None]:
# Analyze results
results = pd.DataFrame(grid.cv_results_)[['params', 'rank_test_score', 'mean_test_score', *[f'split{i}_test_score' for i in range(K_FOLDS)]]].sort_values(['rank_test_score'])
results

<font color='#1e78fa' size=1>
    <h1>
        We can also access the (trained) best estimator and its associated test result.

In [None]:
grid.best_estimator_, grid.best_score_

<font color='#263d4d' size=3>
    <h1>
        5.$\;$ Becoming greedy, how do we do HPO for <b>multiple models</b> at the same time?

In [None]:
# One search space per model family. The 'model' key replaces the pipeline's
# 'model' step with the listed estimator, and the 'model__*' keys tune it.
cb_clf_params = {
    'model__learning_rate':     np.arange(0.05, 0.31, 0.05),
    # CatBoost supports a tree depth of at most 16. Deeper candidates fail to
    # fit and end up with a NaN score in the search, so the range stops at 16.
    'model__max_depth':         np.arange(5, 17, 1, dtype=int),
    'model__n_estimators':      [25, 100, 250],
    'model__l2_leaf_reg':       [0, 3, 5, 8, 12],
    'model': [CatBoostClassifier(verbose=False)]
}

lgb_clf_params = {
    'model__learning_rate':     np.arange(0.05, 0.31, 0.05),
    'model__max_depth':         np.arange(5, 25, 1, dtype=int),
    'model__n_estimators':      [25, 100, 250],
    'model__reg_lambda': [0, 0.05, 0.1, 0.2],
    'model__num_leaves': np.linspace(20, 500, 50, dtype=int),
    'model': [LGBMClassifier(verbose=None)]
}

# Despite its name this is a list of search spaces, one dict per model
models_dict = [cb_clf_params, lgb_clf_params]

In [None]:
%%capture cap
K_FOLDS = 5          # number of cross-validation folds
N_COMBINATIONS = 5   # number of hyperparameter combinations sampled by the search

X, y = ds[features + ['adult_male', 'alone']], ds[['survived']]

# Format: Anonymous steps
# (`sparse=False` was removed from OneHotEncoder in scikit-learn 1.4; the dense
# output is requested on the column transformer below instead)
deck_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

# Format: Anonymous transforms
# sparse_threshold=0 makes the combined output always dense
ct = make_column_transformer((OrdinalEncoder(handle_unknown='error'), ['sex']),
                             (OrdinalEncoder(handle_unknown='error'), ['adult_male']),
                             (OrdinalEncoder(handle_unknown='error'), ['alone']),
                             (KNNImputer(n_neighbors=7), ['age']),
                             (deck_pipe, ['deck']),
                             remainder='passthrough',
                             sparse_threshold=0)

# The 'model' step is a placeholder here: every search space in models_dict
# supplies its own estimator through the 'model' key.
pipe = Pipeline([('preprocessing', ct), 
                 ('model', CatBoostClassifier(verbose=False))])

grid = RandomizedSearchCV(pipe, models_dict, cv=K_FOLDS, scoring='roc_auc', n_iter=N_COMBINATIONS, random_state=SEED)
grid.fit(X, y)

In [None]:
# Analyze results
results = pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'rank_test_score']].sort_values(['rank_test_score'])
results

<font color='#1e78fa' size=4>
    <h1>
        <b>And the winner is......</b>

In [None]:
results.iloc[0]['params']['model']

<font color='#1e78fa' size=4>
    <h1>
        <b>With a score of......</b>

In [None]:
results.iloc[0]['mean_test_score']

<font color='#263d4d' size=3>
    <h1>
        6.$\;$ Create your own classifier and add it to the pipe

In [None]:
class MatansModel(BaseEstimator, ClassifierMixin):
    """Majority-vote ensemble of CatBoost, LightGBM and logistic regression.

    The three sub-models are created with fixed hyperparameters, trained on
    the same data, and their class predictions are summed per sample. A sample
    is labelled 1 when at least two of the three sub-models predict 1, so the
    voting rule assumes the labels are 0 and 1.
    """

    def __init__(self, **kwargs):
        """Create the three sub-models.

        Parameters
        ----------
        **kwargs
            Stored on ``self.kwargs`` only; they are not forwarded to any of
            the sub-models.
        """
        self.kwargs = kwargs
        self.cb = CatBoostClassifier(n_estimators=25, max_depth=13, learning_rate=0.3,
                                     l2_leaf_reg=3, verbose=False)
        self.lgb = LGBMClassifier(learning_rate=0.2, max_depth=13, n_estimators=25,
                                  num_leaves=49, reg_lambda=0.05, verbose=None)
        self.lr = LogisticRegression()

    def _models(self):
        """Return the sub-models in voting order (CatBoost, LightGBM, logistic regression)."""
        return (self.cb, self.lgb, self.lr)

    def fit(self, X, y=None):
        """Train every sub-model on the same ``X`` and ``y``.

        Returns
        -------
        self
            The fitted ensemble, as the scikit-learn estimator convention
            requires, so that ``MatansModel().fit(X, y).predict(X)`` works.
        """
        for model in self._models():
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the majority vote of the sub-models.

        Returns
        -------
        numpy.ndarray
            Float array with one entry per row of ``X``: 1.0 where at least
            two sub-models predicted 1, otherwise 0.0.
        """
        models = self._models()
        votes = np.zeros(shape=(X.shape[0], len(models)))
        for column, model in enumerate(models):
            # flatten() copes with sub-models that return an (n, 1) array
            votes[:, column] = np.asarray(model.predict(X)).flatten()
        return (votes.sum(axis=1) >= 2).astype(float)

    def score(self, X, y):
        """Return the ROC AUC of the voted class predictions against ``y``."""
        return roc_auc_score(y, self.predict(X))

In [None]:
%%capture c
X, y = ds[features + ['adult_male', 'alone']], ds[['survived']]

# Format: Anonymous steps
# (`sparse=False` was removed from OneHotEncoder in scikit-learn 1.4; the dense
# output is requested on the column transformer below instead)
deck_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

# Format: Anonymous transformations
# sparse_threshold=0 makes the combined output always dense
ct = make_column_transformer((OrdinalEncoder(handle_unknown='error'), ['sex']),
                             (OrdinalEncoder(handle_unknown='error'), ['adult_male']),
                             (OrdinalEncoder(handle_unknown='error'), ['alone']),
                             (KNNImputer(n_neighbors=7), ['age']),
                             (deck_pipe, ['deck']),
                             remainder='passthrough',
                             sparse_threshold=0)
# The custom classifier is plugged in exactly like any built-in estimator
pipe = Pipeline([('preprocessing', ct), 
                 ('model', MatansModel())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [None]:
# Both values are the same: pipe.score() transforms X_test and then calls the
# final step's score(), which for MatansModel is roc_auc_score on its predictions.
pipe.score(X_test, y_test), roc_auc_score(y_test, y_pred)

In [None]:
pipe