# Creating binary variables through one-hot encoding

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the credit-approval data set from a CSV file; the relative path is
# resolved against the kernel's current working directory.
# The 'target' column is split off as the label in the cells that follow.
data = pd.read_csv('../data/credit_approval_uci.csv')

In [57]:
# One-hot encoding with pandas.
#
# Only the last expression of a cell is rendered automatically, so the
# intermediate inspections below go through display(), which IPython/Jupyter
# kernels provide without an import.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis='columns'), data['target'],
    test_size=.3,
    random_state=0
)

# Categories of A4 that are present in the training set.
display(X_train['A4'].unique())

# drop_first=True returns k-1 dummy columns for a variable with k categories.
dummies = pd.get_dummies(X_train['A4'], drop_first=True)
display(dummies.head())

# Applied to a whole frame, get_dummies encodes the object/category columns
# and passes every other column through unchanged, so the result is already
# the complete encoded data set.
X_train_enc = pd.get_dummies(X_train, drop_first=True)
display(X_train_enc.head())
display(X_train_enc.columns)

# Encode the test set with ALL of its dummies and then keep exactly the
# training columns:
#   * the reference level that was dropped is the one chosen on the training
#     set, even if that level is absent from the test set;
#   * dummies for levels seen only in the test set are discarded;
#   * dummies for levels missing from the test set are added as all-zero.
# There is no need to concatenate X_test again: get_dummies already kept the
# numeric columns, so concatenating would duplicate them.
X_test_enc = pd.get_dummies(X_test).reindex(
    columns=X_train_enc.columns, fill_value=0
)

# Train and test must expose the same features in the same order.
assert X_test_enc.columns.equals(X_train_enc.columns)
assert X_test_enc.index.equals(X_test.index)

In [58]:
# One-hot encoding with scikit-learn.
#
# Only the last expression of a cell is rendered automatically, so the
# intermediate inspections below go through display(), which IPython/Jupyter
# kernels provide without an import.
from sklearn.preprocessing import OneHotEncoder

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis='columns'), data['target'],
    test_size=.3,
    random_state=0
)

# Return a dense array and drop the first category of every variable (k-1
# dummies). The `sparse` argument was renamed `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so try the current name first and fall
# back to the old one on older releases.
# Tip: pass handle_unknown='ignore' as well if the test set may contain
# categories that were not seen during fit; by default transform() raises.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(drop='first', sparse=False)

vars_categorical = X_train.select_dtypes(include='O').columns.to_list()

# Learn the categories of each variable from the training set only.
encoder.fit(X_train[vars_categorical])

display(encoder.categories_)
display(encoder.get_feature_names_out())


def _replace_with_dummies(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of X whose categorical columns are replaced by dummies.

    The encoder returns a plain array, so it is wrapped in a DataFrame that
    carries the encoder's feature names and the row index of X; sharing the
    index is what lets the concatenation align row by row.
    """
    encoded = pd.DataFrame(
        encoder.transform(X[vars_categorical]),
        columns=encoder.get_feature_names_out(),
        index=X.index,
    )
    return pd.concat(
        [X.drop(columns=vars_categorical), encoded], axis='columns'
    )


# Apply the same treatment to both sets so they expose identical features.
X_train_enc = _replace_with_dummies(X_train)
X_test_enc = _replace_with_dummies(X_test)

assert X_test_enc.columns.equals(X_train_enc.columns)

In [59]:
# One-hot encoding with Feature-engine.
#
# NOTE: this import rebinds the name OneHotEncoder, which the scikit-learn
# cell above also imports; from here on the name refers to Feature-engine's
# class until that cell is run again.
from feature_engine.encoding import OneHotEncoder

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis='columns'), data['target'],
    test_size=.3,
    random_state=0
)

# drop_last=True asks for k-1 dummies per variable. No `variables` argument
# is given, so the encoder picks the columns to encode itself while fitting
# on the training set.
ohe_enc = OneHotEncoder(drop_last=True)
ohe_enc.fit(X_train)

# Only the last expression of a cell is rendered automatically, so these
# inspections go through display() (available without import in IPython).
display(ohe_enc.variables_)      # columns the encoder will transform
display(ohe_enc.encoder_dict_)   # categories that get a dummy, per column

# The transformer takes and returns whole DataFrames, so no manual
# re-assembly of the encoded columns is needed.
X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

ohe_enc.get_feature_names_out()

['A2',
 'A3',
 'A8',
 'A11',
 'A14',
 'A15',
 'A1_a',
 'A1_b',
 'A4_u',
 'A4_y',
 'A4_Missing',
 'A5_g',
 'A5_p',
 'A5_Missing',
 'A6_c',
 'A6_q',
 'A6_w',
 'A6_ff',
 'A6_m',
 'A6_i',
 'A6_e',
 'A6_cc',
 'A6_x',
 'A6_d',
 'A6_k',
 'A6_j',
 'A6_Missing',
 'A6_aa',
 'A7_v',
 'A7_ff',
 'A7_h',
 'A7_dd',
 'A7_z',
 'A7_bb',
 'A7_j',
 'A7_Missing',
 'A7_n',
 'A9_t',
 'A9_f',
 'A10_t',
 'A10_f',
 'A12_t',
 'A13_g',
 'A13_s']