In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder as OneHotEncoder_sklearn
from sklearn.preprocessing import OrdinalEncoder as OrdinalEncoder_sklearn

from feature_engine.encoding import OneHotEncoder as OneHotEncoder_feature_engine
from feature_engine.encoding import OrdinalEncoder as OrdinalEncoder_feature_engine
from feature_engine.encoding import CountFrequencyEncoder as CountFrequencyEncoder_feature_engine
from feature_engine.encoding import MeanEncoder as MeanEncoder_feature_engine
from feature_engine.encoding import WoEEncoder as WoEEncoder_feature_engine
from feature_engine.encoding import RareLabelEncoder as RareLabelEncoder_feature_engine

from category_encoders import BinaryEncoder
from category_encoders import HashingEncoder

# Location of the credit-approval CSV. NOTE: this is a machine-specific absolute path;
# point it at your own copy of the dataset when running the notebook elsewhere.
CREDIT_APPROVAL_CSV = r"C:\Users\Honey\Desktop\Feature_Engineering_Cookbook\Datasets\creditApproval.csv"
df = pd.read_csv(CREDIT_APPROVAL_CSV)

### 1. Creating binary variables through one-hot encoding

In [2]:
# One-hot encoding with pandas: get_dummies() finds the categorical columns on its own.
# drop_first=True keeps k-1 dummy columns for a variable with k categories.
df_encoded_1a = pd.get_dummies(data=df, drop_first=True)

In [3]:
# One-hot encoding with scikit-learn. The encoder does not pick out categorical columns by
# itself, so the object-dtype columns are selected explicitly before fitting.
object_columns = df.columns[df.dtypes == "O"]
try:
    # scikit-learn >= 1.2 calls the dense-output switch `sparse_output`; the older `sparse`
    # keyword was removed in 1.4 and raises TypeError there.
    encoder = OneHotEncoder_sklearn(categories="auto", drop="first", sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    encoder = OneHotEncoder_sklearn(categories="auto", drop="first", sparse=False)
encoder.fit(df[object_columns])
# transform() returns a bare array; re-attach the generated dummy names and the original row
# index so the result can be read next to the pandas / feature-engine outputs.
df_encoded_1b = pd.DataFrame(
    encoder.transform(df[object_columns]),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [4]:
# One-hot encoding with feature-engine. No `variables` list is given, so the encoder is left
# to find the categorical columns itself; drop_last=True is requested with no category cap.
encoder = OneHotEncoder_feature_engine(drop_last=True, top_categories=None).fit(df)
encoder.encoder_dict_  # learned categories per variable (bare expression, not the cell's last line)
df_encoded_1c = encoder.transform(df)

### 2. Performing one-hot encoding of frequent categories

In [5]:
# One-hot encoding of the most frequent A5 categories, built by hand with numpy.where().
# head() with no argument keeps the first five entries of the descending count ranking.
a5_counts = df["A5"].value_counts().sort_values(ascending=False)
top_5_categories = list(a5_counts.head().index)

# One 0/1 indicator column per retained category, named "A5_<category>".
df_encoded_2a = pd.DataFrame()
for category in top_5_categories:
    is_category = df["A5"] == category
    df_encoded_2a["A5_" + category] = np.where(is_category, 1, 0)

In [6]:
# One-hot encoding of frequent categories with feature-engine: only A5 and A6 are encoded,
# with top_categories=5 and drop_last=False.
encoder = OneHotEncoder_feature_engine(
    variables=["A5", "A6"],
    top_categories=5,
    drop_last=False,
).fit(df)
encoder.encoder_dict_  # categories retained per variable (bare expression, not the cell's last line)
df_encoded_2c = encoder.transform(df)

### 3. Replacing categories with ordinal numbers

In [7]:
# Ordinal encoding by hand with enumerate() and map().
# enumerate() numbers the unique A6 categories from 0 in order of appearance; the dictionary
# maps each category (key) to the integer (value) that replaces it.
ordinal_mapping = {label: code for code, label in enumerate(df["A6"].unique())}
df_encoded_3a = df["A6"].map(ordinal_mapping).to_frame()

In [8]:
# Ordinal encoding with scikit-learn, applied to every object-dtype column.
# `categorical_columns` is kept as a plain list because a later cell reuses it.
categorical_columns = list(df.columns[df.dtypes == "O"])
encoder = OrdinalEncoder_sklearn()
encoder.fit(df[categorical_columns])
# transform() returns a bare array with one output column per input column, in the same
# order; restore the column labels and row index so the encoded frame stays readable.
df_encoded_3b = pd.DataFrame(
    encoder.transform(df[categorical_columns]),
    columns=categorical_columns,
    index=df.index,
)

In [9]:
# Ordinal encoding with feature-engine using the 'arbitrary' method, restricted to the
# columns listed in `categorical_columns` (defined in the previous cell).
encoder = OrdinalEncoder_feature_engine(
    variables=categorical_columns,
    encoding_method="arbitrary",
).fit(df)
encoder.encoder_dict_  # category -> integer mapping per variable (bare expression, not the cell's last line)
df_encoded_3c = encoder.transform(df)

### 4. Replacing categories with counts or frequency of observations

In [10]:
# Count and frequency encoding of A6 by hand.
a6_counts = df["A6"].value_counts()
count_map = a6_counts.to_dict()                   # category -> number of rows
frequency_map = (a6_counts / len(df)).to_dict()   # category -> share of all rows

df_encoded_4a = pd.DataFrame(
    {
        "A6_count": df["A6"].map(count_map),
        "A6_frequency": df["A6"].map(frequency_map),
    }
)

In [11]:
# Count encoding with feature-engine; variables=None leaves column selection to the encoder.
# Pass encoding_method='frequency' instead of 'count' to encode with frequencies.
encoder1 = CountFrequencyEncoder_feature_engine(variables=None, encoding_method="count").fit(df)
encoder1.encoder_dict_  # learned mapping per variable (bare expression, not the cell's last line)
df_encoded_4c = encoder1.transform(df)

### 5. Encoding with integers in an ordered manner

In [12]:
# Ordered ordinal encoding of A6 by hand: categories are ranked by their mean A15 value
# (ascending) and replaced by their rank, starting at 0.
mean_a15_by_a6 = df.groupby("A6")["A15"].mean()
ordered_labels = mean_a15_by_a6.sort_values().index
ordinal_mapping = dict(zip(ordered_labels, range(len(ordered_labels))))
df_encoded_5a = df["A6"].map(ordinal_mapping).to_frame()

In [13]:
# Ordered ordinal encoding with feature-engine; A15 is supplied as the target for fitting.
# variables=None leaves column selection to the encoder. Note that `df` still contains A15.
encoder = OrdinalEncoder_feature_engine(variables=None, encoding_method="ordered").fit(df, df["A15"])
encoder.encoder_dict_  # learned mapping per variable (bare expression, not the cell's last line)
df_encoded_5c = encoder.transform(df)

### 6. Encoding with the mean of the target

In [14]:
# Mean (target) encoding of A6 by hand: each category is replaced by the mean of A15
# observed for that category.
mean_a15_by_a6 = df.groupby("A6")["A15"].mean()
ordered_labels = mean_a15_by_a6.to_dict()  # category -> mean A15 (name kept from the previous recipe)
df_encoded_6a = df["A6"].map(ordered_labels).to_frame()

In [15]:
# Mean encoding with feature-engine; A15 is supplied as the target for fitting and
# variables=None leaves column selection to the encoder. Note that `df` still contains A15.
encoder = MeanEncoder_feature_engine(variables=None).fit(df, df["A15"])
encoder.encoder_dict_  # learned mapping per variable (bare expression, not the cell's last line)
df_encoded_6c = encoder.transform(df)

### 7. Encoding with the Weight of Evidence

In [16]:
# Weight of Evidence (WoE) encoding of A0 by hand.
# WoE for a category x is log( P(x | target=1) / P(x | target=0) ): the category's share of
# all positive rows over its share of all negative rows. (Using the within-category rates
# mean / (1 - mean) instead yields the log-odds, which is WoE shifted by a constant.)
# Assumes A15 is a binary 0/1 target — TODO confirm against the dataset.
a15_pos = df["A15"]
a15_neg = 1 - a15_pos
p1 = a15_pos.groupby(df["A0"]).sum() / a15_pos.sum()   # share of positives per category
p0 = a15_neg.groupby(df["A0"]).sum() / a15_neg.sum()   # share of negatives per category
# A category containing only one class gives +/-inf here (log of 0 or division by 0).
woe = np.log(p1 / p0).to_dict()

df_encoded_7a = pd.DataFrame()
df_encoded_7a["A0"] = df["A0"].map(woe)

In [17]:
# Weight of Evidence encoding with feature-engine for A0, A9 and A11, fitted against A15.
woe_variables = ["A0", "A9", "A11"]
encoder = WoEEncoder_feature_engine(variables=woe_variables).fit(df, df["A15"])
encoder.encoder_dict_  # learned WoE value per category (bare expression, not the cell's last line)
df_encoded_7c = encoder.transform(df)

### 8. Grouping rare or infrequent categories

In [18]:
# Rare-label grouping of A6 by hand: categories covering more than 5% of the rows are kept,
# everything else is replaced by the single label "Rare".
temp = df["A6"].value_counts() / len(df)  # share of rows per category
frequent_categories = list(temp[temp > 0.05].index.values)

is_frequent = df["A6"].isin(frequent_categories)
df_encoded_8a = pd.DataFrame({"A6": np.where(is_frequent, df["A6"], "Rare")})

In [19]:
# Rare-label grouping with feature-engine, configured with tol=0.05 and n_categories=4.
# No `variables` list is given, so column selection is left to the encoder.
encoder = RareLabelEncoder_feature_engine(n_categories=4, tol=0.05).fit(df)
encoder.encoder_dict_  # categories kept per variable (bare expression, not the cell's last line)
df_encoded_8c = encoder.transform(df)



### 9. Performing binary encoding


In [20]:
# Binary encoding of A6 with category_encoders; drop_invariant=True is requested.
encoder = BinaryEncoder(drop_invariant=True, cols=["A6"]).fit(df)
df_encoded_9c = encoder.transform(df)

### 10. Performing feature hashing

In [21]:
# Feature hashing of A6 with category_encoders, using n_components=4.
encoder = HashingEncoder(n_components=4, cols=["A6"]).fit(df)
df_encoded_10c = encoder.transform(df)
