# Re-create your own One_Hot_Encoder 

## Load data

In [4]:
import pandas as pd
import seaborn as sns

In [5]:
# Shuffle the rows once, with a fixed seed: the train/test split below is purely
# positional, so an unseeded shuffle would give different splits on every run.
data = sns.load_dataset('titanic').sample(frac=1, random_state=42)

In [7]:
# Positional hold-out split: the first `train_frac` share of the rows is used
# for training, the remaining rows for testing.
train_frac = 0.7
n_train = round(train_frac * len(data))
n_test = len(data) - n_train

data_train, data_test = data.iloc[:n_train], data.iloc[n_train:]

# The same columns are removed from both splits to build the feature matrices:
# the target ('survived') plus 'alive', 'who' and 'adult_male'.
non_feature_cols = ['survived', 'alive', 'who', 'adult_male']
X_train, X_test = (
    split.drop(columns=non_feature_cols) for split in (data_train, data_test)
)
y_train, y_test = data_train['survived'], data_test['survived']

X_train.head(3)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
156,3,female,16.0,0,0,7.7333,Q,Third,,Queenstown,True
687,3,male,19.0,0,0,10.1708,S,Third,,Southampton,True
656,3,male,,0,0,7.8958,S,Third,,Southampton,True


## A first pipe

👉 Create a basic pipeline one-hot-encoding categorical features

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import set_config

# Display estimators as diagrams when they are the output of a cell.
set_config(display='diagram')

In [None]:
# Numeric features: replace missing values with the column mean,
# then apply min-max scaling.
num_encoder = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
    ]
)

In [None]:
# Categorical features: replace missing values with the most frequent category,
# then one-hot encode into a dense array.
#
# * `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
#   so try the new keyword first and fall back to the old one on older releases.
# * handle_unknown="ignore" encodes a category that was not present at fit time
#   as all zeros instead of raising when the pipeline transforms held-out data.
try:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_encoder = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", dense_ohe),
])

## Custom OHEncoder to keep track of column names?

In [0]:
# By default OneHotEncoder returns a plain numpy array, so the column names are lost.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new keyword first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

In [0]:
# Fortunately, the fitted encoder can report the one-hot-encoded column names.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out` (added in 1.0), so use whichever this version provides.
ohe.get_feature_names_out() if hasattr(ohe, "get_feature_names_out") else ohe.get_feature_names()

👉 Try to create your own OneHotEncoder so that it preserves the column names when piping

In [0]:
# Custom OHE

### Test it within a Pipeline and a ColumnTransformer

In [0]:
# Test within a Pipeline

**⚠️ But then again we lose column names when passing that into a ColumnTransformer!**

In [0]:
# Test within a ColumnTransformer

**🤯🤯🤯 We also have to recode the ColumnTransformer ourselves!**  

In [0]:
# Create a custom ColumnTransformer class to keep track of column names

🏁 In conclusion, it's rather difficult to keep column names and dataframes when dealing with pipelines in Sklearn.