# Feature Engineering of the categorical columns

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
# NOTE: this star import rebinds OrdinalEncoder / OneHotEncoder to the
# category_encoders classes, so it must stay after the sklearn imports for
# any code that still uses the bare names. Prefer the explicit `ce.` prefix.
from category_encoders import *
from feature_engine.encoding import CountFrequencyEncoder
from matplotlib.lines import Line2D
from pylab import rcParams
# Default size (width, height in inches) for every figure drawn in this notebook.
rcParams['figure.figsize'] = (15, 15)

In [2]:
# List the contents of the data directory to confirm the expected sub-folders exist.
!ls ./csv_files/

test  train


In [5]:
# Training table, read with pandas' default integer index.
train = pd.read_csv("./csv_files/train/train.csv")

# Categorical feature table. The CSV carries its row index in the
# "Unnamed: 0" column, so that column is restored as the DataFrame index.
categorical_columns = pd.read_csv(
    "./csv_files/train/categorical_columns.csv",
    index_col="Unnamed: 0",
)

In [6]:
# Display the loaded categorical feature table (an `id` column plus cat0..cat9).
categorical_columns

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,1,A,B,A,A,B,D,A,E,C,I
1,2,B,A,A,A,B,B,A,E,A,F
2,3,A,A,A,C,B,D,A,B,C,N
3,4,A,A,A,C,B,D,A,E,G,K
4,6,A,B,A,A,B,B,A,E,C,F
...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,A,B,A,C,B,B,A,E,E,L
299996,499996,A,B,A,C,B,B,A,E,E,L
299997,499997,A,B,A,C,B,B,A,E,C,M
299998,499998,A,B,B,C,B,B,A,D,E,F


#### Ordinal Encoding:

In [7]:
enc = OrdinalEncoder()
X = categorical_columns
enc.fit(X)
ordinal_categorical_columns = enc.transform(categorical_columns)
ordinal_categorical_columns = pd.DataFrame(ordinal_categorical_columns)
ordinal_categorical_columns

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,1,1,1,1,1,1,1,1,1,1,1
1,2,2,2,1,1,1,2,1,1,2,2
2,3,1,2,1,2,1,1,1,2,1,3
3,4,1,2,1,2,1,1,1,1,3,4
4,6,1,1,1,1,1,2,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,1,1,1,2,1,2,1,1,4,6
299996,499996,1,1,1,2,1,2,1,1,4,6
299997,499997,1,1,1,2,1,2,1,1,1,12
299998,499998,1,1,2,2,1,2,1,3,4,2


#### One-Hot Encoding:

In [9]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(categorical_columns)
onehot_categorical_columns = enc.transform(categorical_columns)
onehot_categorical_columns

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,id,cat0_1,cat0_2,cat1_1,cat1_2,cat2_1,cat2_2,cat3_1,cat3_2,cat3_3,...,cat9_6,cat9_7,cat9_8,cat9_9,cat9_10,cat9_11,cat9_12,cat9_13,cat9_14,cat9_15
0,1,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,6,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
299996,499996,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
299997,499997,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
299998,499998,1,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


#### Binary Encoding:

In [11]:
enc = BinaryEncoder().fit(categorical_columns)
binary_categorical_columns = enc.transform(categorical_columns)
binary_categorical_columns

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,id,cat0_0,cat0_1,cat1_0,cat1_1,cat2_0,cat2_1,cat3_0,cat3_1,cat3_2,...,cat7_3,cat8_0,cat8_1,cat8_2,cat8_3,cat9_0,cat9_1,cat9_2,cat9_3,cat9_4
0,1,0,1,0,1,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
1,2,1,0,1,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
2,3,0,1,1,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,1,1
3,4,0,1,1,0,0,1,0,1,0,...,1,0,0,1,1,0,0,1,0,0
4,6,0,1,0,1,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,0,1,0,1,0,1,0,1,0,...,1,0,1,0,0,0,0,1,1,0
299996,499996,0,1,0,1,0,1,0,1,0,...,1,0,1,0,0,0,0,1,1,0
299997,499997,0,1,0,1,0,1,0,1,0,...,1,0,0,0,1,0,1,1,0,0
299998,499998,0,1,0,1,1,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0


#### Frequency Encoding:

In [13]:
encoder = CountFrequencyEncoder(encoding_method='frequency')
encoder.fit(categorical_columns)
freq_categorical_columns = encoder.transform(categorical_columns)
freq_categorical_columns

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,1,0.938237,0.45774,0.921837,0.348213,0.991243,0.450503,0.975477,0.892103,0.403513,0.166880
1,2,0.061763,0.54226,0.921837,0.348213,0.991243,0.497360,0.975477,0.892103,0.126260,0.357603
2,3,0.938237,0.54226,0.921837,0.612507,0.991243,0.450503,0.975477,0.019167,0.403513,0.013707
3,4,0.938237,0.54226,0.921837,0.612507,0.991243,0.450503,0.975477,0.892103,0.140650,0.069850
4,6,0.938237,0.45774,0.921837,0.348213,0.991243,0.497360,0.975477,0.892103,0.403513,0.357603
...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,0.938237,0.45774,0.921837,0.612507,0.991243,0.497360,0.975477,0.892103,0.315387,0.140667
299996,499996,0.938237,0.45774,0.921837,0.612507,0.991243,0.497360,0.975477,0.892103,0.315387,0.140667
299997,499997,0.938237,0.45774,0.921837,0.612507,0.991243,0.497360,0.975477,0.892103,0.403513,0.032793
299998,499998,0.938237,0.45774,0.078163,0.612507,0.991243,0.497360,0.975477,0.081187,0.315387,0.357603


In [14]:
# Persist the frequency-encoded table in the current working directory.
# The DataFrame index is written too (pandas' default), as an unnamed first column.
freq_output_path = "freq_categorical_columns.csv"
freq_categorical_columns.to_csv(freq_output_path)

### Points for the next step (modelling):
- Try each of the differently encoded versions of `categorical_columns`.
- See which encoding gives the best modelling results.