# Re-create your own One_Hot_Encoder 

## Load data

In [72]:
# Show estimators and pipelines as diagrams in cell output.
from sklearn import set_config

set_config(display='diagram')

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Shuffle the rows once, with a fixed seed, so that the positional train/test
# split below (and every output derived from it) is reproducible on
# Restart & Run All.
data = sns.load_dataset('titanic').sample(frac=1, random_state=42)

In [3]:
# Positional hold-out split: the first `train_frac` of the shuffled rows
# are used for training, the rest for testing.
train_frac = 0.7
n_train = round(train_frac * len(data))
n_test = len(data) - n_train

data_train = data.iloc[:n_train]
data_test = data.iloc[n_train:]

# 'survived' is the target; 'alive', 'who' and 'adult_male' are dropped as
# well (presumably because they duplicate the target or other features --
# confirm).
X_train, y_train = (
    data_train.drop(columns=['survived', 'alive', 'who', 'adult_male']),
    data_train['survived'],
)
X_test, y_test = (
    data_test.drop(columns=['survived', 'alive', 'who', 'adult_male']),
    data_test['survived'],
)

X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
291,1,female,19.0,1,0,91.0792,C,First,B,Cherbourg,False
589,3,male,,0,0,8.0500,S,Third,,Southampton,True
304,3,male,,0,0,8.0500,S,Third,,Southampton,True
718,3,male,,0,0,15.5000,Q,Third,,Queenstown,True
219,2,male,30.0,0,0,10.5000,S,Second,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...
636,3,male,32.0,0,0,7.9250,S,Third,,Southampton,True
564,3,female,,0,0,8.0500,S,Third,,Southampton,True
389,2,female,17.0,0,0,12.0000,C,Second,,Cherbourg,True
339,1,male,45.0,0,0,35.5000,S,First,,Southampton,True


## A first pipe

👉 Create a basic pipeline that one-hot encodes the categorical features

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

# Single-step pipeline: a dense one-hot encoder that ignores categories it
# did not see during fit instead of raising.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# confirm the installed version still accepts `sparse=`.
pipe = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))

## Custom OHEncoder to keep track of column names?

In [8]:
# By default OneHotEncoder returns a plain NumPy array, so the DataFrame's
# column names are lost (see the output below).
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# confirm the installed version still accepts `sparse=`.
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [9]:
# Fortunately, the fitted encoder can still report the names of the
# one-hot-encoded columns.
# NOTE(review): `get_feature_names` was deprecated in scikit-learn 1.0 in
# favour of `get_feature_names_out`; confirm against the installed version.
ohe.get_feature_names()



array(['x0_female', 'x0_male'], dtype=object)

👉 Try to create your own OneHotEncoder so that it preserves the column names when used inside a pipeline

In [79]:
from sklearn.base import TransformerMixin, BaseEstimator

class Ohe(TransformerMixin, BaseEstimator):
    """One-hot encoder for DataFrames that returns a DataFrame with named columns.

    Every column passed to ``fit`` is replaced in ``transform`` by one 0/1
    indicator column per distinct value seen during ``fit``, named
    ``"<column>_<value>"``. Values that were not seen during ``fit`` get no
    column, so such rows are all zeros for that feature.

    Attributes set by ``fit``:
        columns_: the column labels of the frame passed to ``fit``.
        list_of_values: for each fitted column, its distinct values
            (as returned by ``Series.unique()``, so a missing value counts
            as a category of its own).

    Attribute set by ``transform``:
        new_cols: the column labels of the most recently returned frame.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the columns of ``X`` and the distinct values of each one.

        Args:
            X: DataFrame whose columns should all be one-hot encoded.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            self
        """
        self.columns_ = X.columns
        self.list_of_values = [X[col].unique() for col in self.columns_]
        return self

    def transform(self, X, y=None):
        """Replace each fitted column of ``X`` by its indicator columns.

        Args:
            X: DataFrame containing at least the columns seen during ``fit``.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            A new DataFrame with the same index as ``X``. Columns of ``X``
            that were not fitted are kept as they are; the input is not
            modified.
        """
        X_copy = X.copy()
        for col, values in zip(self.columns_, self.list_of_values):
            source = X[col]
            for value in values:
                if pd.isna(value):
                    # NaN never compares equal to itself, so an equality test
                    # would leave the "<col>_nan" indicator at 0 on every row.
                    hits = source.isna()
                else:
                    # Vectorised comparison; a missing row matches no value.
                    hits = (source == value).fillna(False)
                X_copy[f"{col}_{value}"] = hits.astype("int64")
        X_copy = X_copy.drop(columns=self.columns_)
        # Kept for callers that read the output names after transforming.
        self.new_cols = X_copy.columns
        return X_copy

### Test it within a Pipeline and a ColumnTransformer

In [91]:
# Fit the custom encoder on two categorical columns of the training set.
ohe = Ohe()
ohe.fit(X_train[['sex','class']])

In [90]:
# The result is a DataFrame whose columns are named "<column>_<value>".
ohe.transform(X_train[['sex','class']])

Unnamed: 0,sex_female,sex_male,class_First,class_Third,class_Second
291,1,0,1,0,0
589,0,1,0,1,0
304,0,1,0,1,0
718,0,1,0,1,0
219,0,1,0,0,1
...,...,...,...,...,...
636,0,1,0,1,0
564,1,0,0,1,0
389,1,0,0,0,1
339,0,1,1,0,0


**⚠️ But then again, we lose the column names when passing that into a ColumnTransformer!**

In [67]:
from sklearn.compose import ColumnTransformer

In [82]:
# Wrap the custom encoder in scikit-learn's ColumnTransformer and keep the
# remaining columns; the result is shown next to the original frame.
ct = ColumnTransformer(
    transformers=[('ohe', ohe, ['sex', 'class'])],
    remainder='passthrough',
)
X_new = ct.fit_transform(X_train)
display(X_train.head(), pd.DataFrame(X_new))

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
291,1,female,19.0,1,0,91.0792,C,First,B,Cherbourg,False
589,3,male,,0,0,8.05,S,Third,,Southampton,True
304,3,male,,0,0,8.05,S,Third,,Southampton,True
718,3,male,,0,0,15.5,Q,Third,,Queenstown,True
219,2,male,30.0,0,0,10.5,S,Second,,Southampton,True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,0,1,0,0,1,19.0,1,0,91.0792,C,B,Cherbourg,False
1,0,1,0,1,0,3,,0,0,8.05,S,,Southampton,True
2,0,1,0,1,0,3,,0,0,8.05,S,,Southampton,True
3,0,1,0,1,0,3,,0,0,15.5,Q,,Queenstown,True
4,0,1,0,0,1,2,30.0,0,0,10.5,S,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,0,1,0,1,0,3,32.0,0,0,7.925,S,,Southampton,True
620,1,0,0,1,0,3,,0,0,8.05,S,,Southampton,True
621,1,0,0,0,1,2,17.0,0,0,12.0,C,,Cherbourg,True
622,0,1,1,0,0,1,45.0,0,0,35.5,S,,Southampton,True


**ü§Øü§Øü§Ø We also have to recode the ColumnTransformer ourself!**  

In [93]:
class ColumnTransformer2(TransformerMixin, BaseEstimator):
    """Minimal column transformer that keeps DataFrames and column names.

    Args:
        transformers: list of ``(name, transformer, columns)`` tuples. Each
            transformer is fitted on, and applied to, ``X[columns]``.
        remainder: ``'drop'`` (default) returns only the transformed
            columns; ``'passthrough'`` appends the columns of ``X`` that no
            transformer was given, unchanged, after the transformed ones.
    """

    def __init__(self, transformers, remainder='drop'):
        self.transformers = transformers
        self.remainder = remainder

    def _check_remainder(self):
        """Raise ValueError if ``remainder`` is not a supported option."""
        if self.remainder not in ('drop', 'passthrough'):
            raise ValueError(
                "remainder must be 'drop' or 'passthrough', "
                f"got {self.remainder!r}"
            )

    def fit(self, X, y=None):
        """Fit every transformer on its own subset of columns.

        Args:
            X: input DataFrame.
            y: optional target, forwarded to each transformer's ``fit``.

        Returns:
            self
        """
        self._check_remainder()
        for _name, transformer, columns in self.transformers:
            transformer.fit(X[columns], y)
        return self

    def transform(self, X, y=None):
        """Apply every transformer to its columns and join the results.

        Each transformer reads from the original ``X`` (never from another
        transformer's output), and the outputs are concatenated side by
        side in the order the transformers were given.

        Args:
            X: input DataFrame.
            y: ignored; accepted for pipeline compatibility.

        Returns:
            A DataFrame indexed like ``X``.
        """
        self._check_remainder()
        parts = []
        used_columns = []
        for name, transformer, columns in self.transformers:
            out = transformer.transform(X[columns])
            if not isinstance(out, pd.DataFrame):
                # Array-like output (possibly sparse): wrap it so it can be
                # joined, with names that trace back to the transformer.
                if hasattr(out, "toarray"):
                    out = out.toarray()
                out = pd.DataFrame(out, index=X.index)
                out.columns = [f"{name}__{i}" for i in out.columns]
            parts.append(out)
            used_columns.extend(
                [columns] if isinstance(columns, str) else list(columns)
            )
        if self.remainder == 'passthrough':
            parts.append(X.drop(columns=used_columns))
        if not parts:
            # Nothing selected: same rows, no columns.
            return X.iloc[:, :0].copy()
        return pd.concat(parts, axis=1)
            

In [94]:
# Same experiment with the hand-written column transformer: this time the
# output keeps its column names.
ct = ColumnTransformer2(transformers=[('ohe', ohe, ['sex', 'class'])])
X_new = ct.fit_transform(X_train)
display(X_train.head(), pd.DataFrame(X_new))

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
291,1,female,19.0,1,0,91.0792,C,First,B,Cherbourg,False
589,3,male,,0,0,8.05,S,Third,,Southampton,True
304,3,male,,0,0,8.05,S,Third,,Southampton,True
718,3,male,,0,0,15.5,Q,Third,,Queenstown,True
219,2,male,30.0,0,0,10.5,S,Second,,Southampton,True


Unnamed: 0,sex_female,sex_male,class_First,class_Third,class_Second
291,1,0,1,0,0
589,0,1,0,1,0
304,0,1,0,1,0
718,0,1,0,1,0
219,0,1,0,0,1
...,...,...,...,...,...
636,0,1,0,1,0
564,1,0,0,1,0
389,1,0,0,0,1
339,0,1,1,0,0


🏁 In conclusion, it's rather difficult to keep column names and DataFrames when working with pipelines in scikit-learn.