# Performing one-hot encoding of frequent categories

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_engine.encoding import OneHotEncoder

In [2]:
# Load the credit approval data and hold out 30% of the rows as a test set.
# random_state pins the shuffle so the split is the same on every run.
data = pd.read_csv('../data/credit_approval_uci.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

In [3]:
# List the distinct category labels found in A6 within the training set.
X_train['A6'].unique()

array(['c', 'q', 'w', 'ff', 'm', 'i', 'e', 'cc', 'x', 'd', 'k', 'j',
       'Missing', 'aa', 'r'], dtype=object)

In [4]:
# Occurrence count of each A6 category in the training set, largest first,
# limited to the five most frequent categories.
(
    X_train['A6']
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:5]
)

c     93
q     56
w     48
i     41
ff    38
Name: A6, dtype: int64

In [5]:
# Labels of the five most frequent A6 categories in the training set,
# ordered from most to least frequent.
top_5 = list(
    X_train['A6'].value_counts().sort_values(ascending=False).head(5).index
)
top_5

['c', 'q', 'w', 'i', 'ff']

In [6]:
# For every frequent A6 category, add a binary indicator column to both
# splits: 1 where the row's A6 equals that category, 0 otherwise.
# The categories come from the training set only (top_5), so the test set
# is encoded with exactly the same columns.
for label in top_5:
    for split in (X_train, X_test):
        split[f'A6_{label}'] = np.where(split['A6'] == label, 1, 0)

In [7]:
# Show A6 next to its indicator columns for 10 randomly chosen training rows.
# random_state pins the draw so re-running the notebook displays the same rows.
X_train[['A6'] + [f'A6_{label}' for label in top_5]].sample(10, random_state=0)

Unnamed: 0,A6,A6_c,A6_q,A6_w,A6_i,A6_ff
173,k,0,0,0,0,0
429,cc,0,0,0,0,0
67,m,0,0,0,0,0
26,i,0,0,0,1,0
248,c,1,0,0,0,0
280,c,1,0,0,0,0
647,aa,0,0,0,0,0
587,c,1,0,0,0,0
423,cc,0,0,0,0,0
432,k,0,0,0,0,0


We can automate one-hot encoding of frequent categories with Feature-engine. The data is reloaded and re-split first, which discards the `A6_*` columns added manually above.

In [8]:
# Read the CSV again and redo the same 70/30 split (same random_state), so
# X_train and X_test no longer carry the A6_* columns added manually earlier.
data = pd.read_csv('../data/credit_approval_uci.csv')
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.3,
    random_state=0,
)

In [9]:
# Configure the encoder for A6 and A7 with top_categories=5, fit it on the
# training set only, then apply the fitted encoder to both splits.
ohe_enc = OneHotEncoder(
    variables=['A6', 'A7'],
    top_categories=5,
)
ohe_enc.fit(X_train)
X_train_enc, X_test_enc = (
    ohe_enc.transform(part) for part in (X_train, X_test)
)

In [10]:
# Inspect 10 randomly chosen rows of the encoded training set.
# random_state pins the draw so re-running the notebook displays the same rows.
X_train_enc.sample(10, random_state=0)

Unnamed: 0,A1,A2,A3,A4,A5,A8,A9,A10,A11,A12,...,A6_c,A6_q,A6_w,A6_i,A6_ff,A7_v,A7_h,A7_ff,A7_bb,A7_z
163,b,32.0,0.0,y,p,0.0,Missing,Missing,0,t,...,0,0,0,0,0,0,1,0,0,0
429,b,33.58,0.335,y,p,0.085,f,f,0,f,...,0,0,0,0,0,1,0,0,0,0
100,b,37.5,0.0,y,p,0.0,Missing,Missing,0,t,...,1,0,0,0,0,0,0,0,1,0
399,b,31.0,2.085,u,g,0.085,f,f,0,f,...,1,0,0,0,0,1,0,0,0,0
635,b,18.17,2.46,u,g,0.96,f,t,2,t,...,1,0,0,0,0,0,0,0,0,0
617,b,32.25,14.0,y,p,0.0,f,t,2,f,...,0,0,0,0,1,0,0,1,0,0
520,Missing,20.42,7.5,u,g,1.5,t,t,1,f,...,0,0,0,0,0,1,0,0,0,0
664,b,31.08,0.0,y,p,0.0,Missing,Missing,0,f,...,0,0,1,0,0,1,0,0,0,0
237,b,21.33,7.5,u,g,1.415,t,t,1,f,...,0,0,0,0,0,1,0,0,0,0
546,b,23.58,0.46,y,p,2.625,t,t,6,t,...,0,0,1,0,0,1,0,0,0,0


In [11]:
# Inspect what the fitted encoder stored: for each encoded variable, the list
# of categories that were given their own indicator column.
ohe_enc.encoder_dict_

{'A6': ['c', 'q', 'w', 'i', 'ff'], 'A7': ['v', 'h', 'ff', 'bb', 'z']}