# Re-create your own OneHotEncoder

## Load data

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Shuffle the rows before splitting. A fixed random_state makes the shuffle
# (and therefore the train/test split below) identical on every
# Restart & Run All.
data = sns.load_dataset('titanic').sample(frac=1, random_state=42)

In [3]:
# Share of rows used for training; the remaining rows form the test set.
train_frac = 0.7
n_train = round(train_frac * len(data))
n_test = len(data) - n_train

# Positional split on the rows of `data` (shuffled when it was loaded):
# the first n_train rows are for training, everything after is for testing.
data_train = data.iloc[:n_train]
data_test = data.iloc[n_train:]

# Target first, then the features. 'alive', 'who' and 'adult_male' are
# removed together with the target -- presumably because they duplicate or
# leak information; confirm against the dataset description.
y_train = data_train['survived']
X_train = data_train.drop(columns=['survived', 'alive', 'who', 'adult_male'])

y_test = data_test['survived']
X_test = data_test.drop(columns=['survived', 'alive', 'who', 'adult_male'])

X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
191,2,male,19.0,0,0,13.0000,S,Second,,Southampton,True
742,1,female,21.0,2,2,262.3750,C,First,B,Cherbourg,False
737,1,male,35.0,0,0,512.3292,C,First,B,Cherbourg,True
408,3,male,21.0,0,0,7.7750,S,Third,,Southampton,True
543,2,male,32.0,1,0,26.0000,S,Second,,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...
120,2,male,21.0,2,0,73.5000,S,Second,,Southampton,False
32,3,female,,0,0,7.7500,Q,Third,,Queenstown,True
128,3,female,,1,1,22.3583,C,Third,F,Cherbourg,False
226,2,male,19.0,0,0,10.5000,S,Second,,Southampton,True


## A first pipe

👉 Create a basic pipeline one-hot-encoding categorical features

In [8]:
# make_column_selector stays imported for later cells, but is not used here:
# it selects columns by regex pattern or dtype, not from a list of names.
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Columns routed to each branch of the preprocessor.
num_features = ['age', 'fare', 'sibsp', 'parch']
cat_features = ['pclass', 'sex', 'embarked', 'class', 'deck', 'embark_town', 'alone']

# Impute then scale numerical variables.
# SimpleImputer() uses its default strategy (column mean) to fill missing
# values (e.g. in 'age') before MinMaxScaler rescales each column.
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical variables; categories unseen during fit are encoded
# as all-zero rows instead of raising an error.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Explicit column-name lists are passed directly to the ColumnTransformer.
# Any column not listed above is passed through unchanged.
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)],
    remainder='passthrough')

## Custom OHEncoder to keep track of column names?

In [None]:
# By default OneHotEncoder returns a (sparse) matrix and loses track of column names.
# The result is densified with .toarray() instead of the `sparse=False`
# keyword, which was renamed `sparse_output` in scikit-learn 1.2 and removed
# in 1.4; .toarray() works on every version.
ohe = OneHotEncoder()
ohe.fit_transform(X_train[['sex']]).toarray()

In [None]:
# Fortunately, the one-hot-encoded column names can be read back from the fitted encoder.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
ohe.get_feature_names_out()

👉 Try to create your own OneHotEncoder so that it preserves the column names when piping

In [1]:
# Custom OHE

### Test it within a Pipeline and a ColumnTransformer

In [2]:
# Test within a Pipeline

**⚠️ But then again we lose column names when passing that into a ColumnTransformer!**

In [4]:
# Test within a ColumnTransformer

**🤯🤯🤯 We also have to recode the ColumnTransformer ourselves!**

In [6]:
# Create a custom ColumnTransformer class to keep track of column names

🏁 In conclusion, it's rather difficult to keep column names and dataframes when dealing with pipelines in Sklearn.