In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder

# for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

### OneHot Encoding
--------------------------------
Just like imputation, all methods of categorical encoding should be performed over the training set, and then propagated to the test set. 

Why? 

Because these methods will "learn" patterns from the train data, and therefore you want to avoid leaking information and overfitting. But more importantly, because we don't know whether future / live data will contain all the categories present in the train data, or whether there will be more or fewer categories. Therefore, we want to anticipate this uncertainty by setting up the right processes from the start. We want to create transformers that learn the categories from the train set, and use those learned categories to create the dummy variables in both the train and test sets.

--------------------------------

- pandas
- sklearn
- feature engine

In [3]:
# Read the Titanic data, keeping only the target (`survived`) and the
# three categorical variables used in this demonstration.
data = pd.read_csv(
    'data/titanic.csv',
    usecols=['sex', 'embarked', 'cabin', 'survived'],
)
data.head(n=5)

Unnamed: 0,survived,sex,cabin,embarked
0,0,male,,S
1,1,female,C85,C
2,1,female,,S
3,1,female,C123,S
4,0,male,,S


In [4]:
# Reduce `cabin` to its first character for this demonstration;
# rows where `cabin` is missing stay missing.
data['cabin'] = data['cabin'].str.get(0)

data.head(n=5)

Unnamed: 0,survived,sex,cabin,embarked
0,0,male,,S
1,1,female,C,C
2,1,female,,S
3,1,female,C,S
4,0,male,,S


In [5]:
# Split into train and test sets:
#   - predictors: sex, embarked, cabin
#   - target:     survived
#   - 30% of the observations (test_size=0.3) go to the test set
#   - random_state fixes the seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    data[['sex', 'embarked', 'cabin']],
    data['survived'],
    random_state=0,
    test_size=0.3,
)

(x_train.shape, x_test.shape)

((623, 3), (268, 3))

In [6]:
# check the dtype of each predictor before encoding
x_train.dtypes

sex         object
embarked    object
cabin       object
dtype: object

In [8]:
# Explore cardinality: list the distinct values (NaN included) of each
# categorical variable in the train set, in the order sex, embarked, cabin.
tuple(x_train[col].unique() for col in ['sex', 'embarked', 'cabin'])

(array(['male', 'female'], dtype=object),
 array(['S', 'C', 'Q', nan], dtype=object),
 array(['E', 'D', nan, 'B', 'C', 'A', 'F', 'G', 'T'], dtype=object))

In [9]:
## Encode `sex` into k dummy variables: one column per category
tmp = pd.get_dummies(data=x_train["sex"])
tmp.head(n=5)

Unnamed: 0,female,male
857,0,1
52,1,0
386,0,1
124,0,1
578,1,0


In [10]:
# Show the original variable side by side with its dummy columns
# so the mapping between category and dummy is easy to read.
pd.concat(
    objs=[x_train['sex'], pd.get_dummies(data=x_train['sex'])],
    axis="columns",
).head(n=5)

Unnamed: 0,sex,female,male
857,male,0,1
52,female,1,0
386,male,0,1
124,male,0,1
578,female,1,0


In [11]:
# Encode every categorical variable of the train set in a single call;
# the dummy columns are named <variable>_<category>.
tmp = pd.get_dummies(data=x_train)
tmp.head(n=5)

Unnamed: 0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T
857,0,1,0,0,1,0,0,0,0,1,0,0,0
52,1,0,1,0,0,0,0,0,1,0,0,0,0
386,0,1,0,0,1,0,0,0,0,0,0,0,0
124,0,1,0,0,1,0,0,0,1,0,0,0,0
578,1,0,1,0,0,0,0,0,0,0,0,0,0


In [12]:
## Same encoding applied independently to the test set.
## Notice the change in the number of columns: the test set yields one
## dummy fewer than the train set, because there is no `cabin_T` column here.
tmp = pd.get_dummies(data=x_test)
tmp.head(n=5)

Unnamed: 0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G
495,0,1,1,0,0,0,0,0,0,0,0,0
648,0,1,0,0,1,0,0,0,0,0,0,0
278,0,1,0,1,0,0,0,0,0,0,0,0
31,1,0,1,0,0,0,1,0,0,0,0,0
255,1,0,1,0,0,0,0,0,0,0,0,0


In [13]:
## Encode into k-1 dummies: drop_first=True removes the first category
## of each variable (e.g. `sex_female`, `embarked_C`, `cabin_A`).
tmp = pd.get_dummies(data=x_train, drop_first=True)
tmp.head(n=5)

Unnamed: 0,sex_male,embarked_Q,embarked_S,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T
857,1,0,1,0,0,0,1,0,0,0
52,0,0,0,0,0,1,0,0,0,0
386,1,0,1,0,0,0,0,0,0,0
124,1,0,1,0,0,1,0,0,0,0
578,0,0,0,0,0,0,0,0,0,0


In [14]:
### get_dummies() can also flag missing values: dummy_na=True adds a
### `<variable>_nan` indicator column for every variable -- including
### `sex`, which has no missing values in the train set.
tmp = pd.get_dummies(data=x_train, dummy_na=True, drop_first=True)
tmp.head(n=5)

Unnamed: 0,sex_male,sex_nan,embarked_Q,embarked_S,embarked_nan,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,cabin_nan
857,1,0,0,1,0,0,0,0,1,0,0,0,0
52,0,0,0,0,0,0,0,1,0,0,0,0,0
386,1,0,0,1,0,0,0,0,0,0,0,0,1
124,1,0,0,1,0,0,0,1,0,0,0,0,0
578,0,0,0,0,0,0,0,0,0,0,0,0,1


#### OneHotEncoding - scikit-learn

<b>Advantages:</b>

- Fast
- Creates the same number of features in train and test set

<b>Limitations:</b>

- it returns a numpy array instead of a pandas dataframe
- the returned array carries no variable names (they have to be retrieved separately with `get_feature_names()`), which makes it inconvenient for variable exploration

In [15]:
# Create the encoder and fit it on the train set only.
# NaN is replaced by the literal label "MISSING" before fitting, so that
# missing values are treated as one more category of each variable.
encoder = OneHotEncoder(
    categories="auto",       # learn the categories of each variable from the data passed to fit()
    drop="first",            # drop the first category of each variable -> k-1 dummies
    sparse=False,            # return a dense numpy array instead of a sparse matrix
    handle_unknown="error",  # raise an error if transform() meets a category not seen during fit()
)
encoder.fit(x_train.fillna(value="MISSING"))

OneHotEncoder(drop='first', sparse=False)

In [16]:
# categories learned from the train set, one array per variable
# ("MISSING" is the placeholder that replaced NaN before fitting)
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'MISSING', 'Q', 'S'], dtype=object),
 array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'MISSING', 'T'], dtype=object)]

In [17]:
# Encode the train set, applying the same "MISSING" placeholder used when fitting.
tmp = encoder.transform(x_train.fillna(value="MISSING"))
# transform() returns a numpy array, so wrap it in a DataFrame for display
pd.DataFrame(data=tmp).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# We can get the names of the dummy columns from the fitted encoder.
# Called without arguments, each name is the position of the source
# variable (x0, x1, x2) followed by the category it encodes.
encoder.get_feature_names()

array(['x0_male', 'x1_MISSING', 'x1_Q', 'x1_S', 'x2_B', 'x2_C', 'x2_D',
       'x2_E', 'x2_F', 'x2_G', 'x2_MISSING', 'x2_T'], dtype=object)

In [19]:
## Build the encoded test set with the encoder that was fitted on the train
## set, so it gets exactly the same dummy columns as the train set.
## NOTE: the resulting frame has a fresh 0..n-1 index, not the original
## index of x_test.
tmp = pd.DataFrame(
    data=encoder.transform(x_test.fillna(value="MISSING")),
    columns=encoder.get_feature_names(),
)
tmp.head(n=5)

Unnamed: 0,x0_male,x1_MISSING,x1_Q,x1_S,x2_B,x2_C,x2_D,x2_E,x2_F,x2_G,x2_MISSING,x2_T
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
