In [1]:
# NIRMALYA_THAKURTA_2052_DM_LAB_14
#Categorical Encoading Assignment 8-Comparison-categorical-encoding-techniques

In [233]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import roc_auc_score
from feature_engine.encoding import *

In [234]:
# let's load the titanic dataset

# we will only use these columns in the demo
cols = ['pclass', 'age', 'sibsp', 'parch', 'fare',
        'sex', 'cabin', 'embarked', 'survived']

data = pd.read_csv('titanic.csv', usecols=cols)

data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B5,S
1,1,1,male,0.9167,1,2,151.55,C22,S
2,1,0,female,2.0,1,2,151.55,C22,S
3,1,0,male,30.0,1,2,151.55,C22,S
4,1,0,female,25.0,1,2,151.55,C22,S


In [235]:
# let's check for missing data

data.isnull().sum()

pclass         0
survived       0
sex            0
age          263
sibsp          0
parch          0
fare           1
cabin       1014
embarked       2
dtype: int64

In [236]:
# Drop observations with NA in Fare and embarked
data.dropna(axis=0,how='any',inplace=True,subset=['fare','embarked','age','cabin'])

In [237]:
data.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [238]:
# Now we extract the first letter of the cabin
data['cabin'] = data['cabin'].str[0]
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,female,29.0,0,0,211.3375,B,S
1,1,1,male,0.9167,1,2,151.55,C,S
2,1,0,female,2.0,1,2,151.55,C,S
3,1,0,male,30.0,1,2,151.55,C,S
4,1,0,female,25.0,1,2,151.55,C,S


In [239]:
# drop observations with cabin = T, they are too few
data=data[data.cabin!='T']

In [240]:
# Let's divide into train and test set
X_train,X_test,Y_train,Y_test=train_test_split(data.loc[:,data.columns!='survived'],data.loc[:,data.columns=='survived'],test_size=0.3)
X_train.shape, X_test.shape

((188, 8), (81, 8))

In [241]:
# Let's replace null values in numerical variables by the mean
for i in X_train:
    if is_numeric_dtype(X_train[i])==True:
        
        X_train[i].fillna(X_train[i].mean(),inplace=True)
        X_test[i].fillna(X_test[i].mean(),inplace=True)

In [242]:
data['cabin'].unique()

array(['B', 'C', 'E', 'D', 'A', 'F', 'G'], dtype=object)

In [243]:
# let's check that we have no missing data after NA imputation
data.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [244]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
47,1,male,42.0,0,0,26.2875,E,S
188,1,female,51.0,0,1,39.4000,D,S
268,1,male,24.0,1,0,60.0000,C,S
485,2,male,36.0,0,0,12.8750,D,C
294,1,male,49.0,1,1,110.8833,C,C
...,...,...,...,...,...,...,...,...
1189,3,female,4.0,1,1,16.7000,G,S
136,1,male,53.0,0,0,28.5000,C,C
301,1,male,47.0,0,0,34.0208,D,S
148,1,male,45.0,1,0,83.4750,C,S


# One Hot Encoding

In [245]:
X_train_OHE=OneHotEncoder().fit(X_train,Y_train).transform(X_train)

X_train_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_E,cabin_D,cabin_C,cabin_B,cabin_A,cabin_G,cabin_F,embarked_S,embarked_C,embarked_Q
47,1,42.0,0,0,26.2875,1,0,1,0,0,0,0,0,0,1,0,0
188,1,51.0,0,1,39.4,0,1,0,1,0,0,0,0,0,1,0,0
268,1,24.0,1,0,60.0,1,0,0,0,1,0,0,0,0,1,0,0
485,2,36.0,0,0,12.875,1,0,0,1,0,0,0,0,0,0,1,0
294,1,49.0,1,1,110.8833,1,0,0,0,1,0,0,0,0,0,1,0


In [246]:
X_test_OHE=OneHotEncoder().fit(X_train,Y_train).transform(X_test)

X_test_OHE.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_E,cabin_D,cabin_C,cabin_B,cabin_A,cabin_G,cabin_F,embarked_S,embarked_C,embarked_Q
112,1,28.0,3,2,263.0,0,1,0,0,1,0,0,0,0,1,0,0
22,1,26.0,0,0,30.0,1,0,0,0,1,0,0,0,0,0,1,0
16,1,24.0,0,1,247.5208,1,0,0,0,0,1,0,0,0,0,1,0
3,1,30.0,1,2,151.55,1,0,0,0,1,0,0,0,0,1,0,0
285,1,67.0,1,0,221.7792,1,0,0,0,1,0,0,0,0,1,0,0


# Count Encoding

In [247]:
X_train_count=CountFrequencyEncoder().fit(X_train,Y_train).transform(X_train)

X_train_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
47,1,90,42.0,0,0,26.2875,27,108
188,1,98,51.0,0,1,39.4,31,108
268,1,90,24.0,1,0,60.0,57,108
485,2,90,36.0,0,0,12.875,31,77
294,1,90,49.0,1,1,110.8833,57,77


In [248]:
X_test_count=CountFrequencyEncoder().fit(X_train,Y_train).transform(X_test)

X_test_count.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
112,1,98,28.0,3,2,263.0,57,108
22,1,90,26.0,0,0,30.0,57,77
16,1,90,24.0,0,1,247.5208,44,77
3,1,90,30.0,1,2,151.55,57,108
285,1,90,67.0,1,0,221.7792,57,108


# Ordered Encoding

In [249]:
X_train_ordered=OrdinalEncoder().fit(X_train,Y_train).transform(X_train)

X_train_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
47,1,0,42.0,0,0,26.2875,5,0
188,1,1,51.0,0,1,39.4,6,0
268,1,0,24.0,1,0,60.0,3,0
485,2,0,36.0,0,0,12.875,6,2
294,1,0,49.0,1,1,110.8833,3,2


In [250]:
X_test_ordered=OrdinalEncoder().fit(X_train,Y_train).transform(X_test)

X_test_ordered.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
112,1,1,28.0,3,2,263.0,3,0
22,1,0,26.0,0,0,30.0,3,2
16,1,0,24.0,0,1,247.5208,4,2
3,1,0,30.0,1,2,151.55,3,0
285,1,0,67.0,1,0,221.7792,3,0


# Mean Encoding

In [276]:
X_train_mean=MeanEncoder().fit(X_train,Y_train).transform(X_train)

X_train_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
47,1,0.433333,42.0,0,0,26.2875,0.740741,0.648148
188,1,0.938776,51.0,0,1,39.4,0.774194,0.648148
268,1,0.433333,24.0,1,0,60.0,0.684211,0.648148
485,2,0.433333,36.0,0,0,12.875,0.774194,0.766234
294,1,0.433333,49.0,1,1,110.8833,0.684211,0.766234


In [261]:
X_test_mean=MeanEncoder().fit(X_train,Y_train).transform(X_test)

X_test_mean.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
112,1,0.938776,28.0,3,2,263.0,0.684211,0.648148
22,1,0.433333,26.0,0,0,30.0,0.684211,0.766234
16,1,0.433333,24.0,0,1,247.5208,0.704545,0.766234
3,1,0.433333,30.0,1,2,151.55,0.684211,0.648148
285,1,0.433333,67.0,1,0,221.7792,0.684211,0.648148


# Probability Ratio

In [262]:
X_train_ratio=PRatioEncoder(encoding_method='ratio').fit(X_train,Y_train['survived']).transform(X_train)
X_test_ratio=PRatioEncoder(encoding_method='ratio').fit(X_train,Y_train['survived']).transform(X_test)
X_train_ratio.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
47,1,0.764706,42.0,0,0,26.2875,2.857143,1.842105
188,1,15.333333,51.0,0,1,39.4,3.428571,1.842105
268,1,0.764706,24.0,1,0,60.0,2.166667,1.842105
485,2,0.764706,36.0,0,0,12.875,3.428571,3.277778
294,1,0.764706,49.0,1,1,110.8833,2.166667,3.277778


# Random Forest Performance

In [263]:
# create a function to build random forests (n_estimators=50, random_state=39, max_depth=3) and compare performance in train and test set
def run_randomForests(X_train,X_test,Y_train,Y_test):
    rfc = RandomForestClassifier(n_estimators=50, random_state=39, max_depth=3)
    rfc.fit(X_train,Y_train['survived'])
    print("Train set")
    print("Random Forests roc-auc:",roc_auc_score(Y_train,rfc.predict(X_train)))
    print("Test set")
    print("Random Forests roc-auc:",roc_auc_score(Y_test,rfc.predict(X_test)))


In [264]:
X_train_OHE

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,sex_female,cabin_E,cabin_D,cabin_C,cabin_B,cabin_A,cabin_G,cabin_F,embarked_S,embarked_C,embarked_Q
47,1,42.0,0,0,26.2875,1,0,1,0,0,0,0,0,0,1,0,0
188,1,51.0,0,1,39.4000,0,1,0,1,0,0,0,0,0,1,0,0
268,1,24.0,1,0,60.0000,1,0,0,0,1,0,0,0,0,1,0,0
485,2,36.0,0,0,12.8750,1,0,0,1,0,0,0,0,0,0,1,0
294,1,49.0,1,1,110.8833,1,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189,3,4.0,1,1,16.7000,0,1,0,0,0,0,0,1,0,1,0,0
136,1,53.0,0,0,28.5000,1,0,0,0,1,0,0,0,0,0,1,0
301,1,47.0,0,0,34.0208,1,0,0,1,0,0,0,0,0,1,0,0
148,1,45.0,1,0,83.4750,1,0,0,0,1,0,0,0,0,1,0,0


In [265]:
# OHE
run_randomForests(X_train_OHE, X_test_OHE, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.8298513459220571
Test set
Random Forests roc-auc: 0.7158801020408163


In [266]:
# counts
run_randomForests(X_train_count, X_test_count, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7299450917369761
Test set
Random Forests roc-auc: 0.6214923469387755


In [267]:
# ordered labels
run_randomForests(X_train_ordered, X_test_ordered, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7501674032409267
Test set
Random Forests roc-auc: 0.6004464285714286


In [268]:
# mean encoding
run_randomForests(X_train_mean, X_test_mean, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7501674032409267
Test set
Random Forests roc-auc: 0.6004464285714286


In [269]:
# ratio
run_randomForests(X_train_ratio, X_test_ratio, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7501674032409267
Test set
Random Forests roc-auc: 0.6004464285714286


# Logistic Regression Performance

In [270]:
# create a function for Logistic Regression
def run_logistic(X_train,X_test,Y_train,Y_test):
    rfc = LogisticRegression()
    rfc.fit(X_train,Y_train['survived'])
    print("Train set")
    print("Random Forests roc-auc:",roc_auc_score(Y_train,rfc.predict(X_train)))
    print("Test set")
    print("Random Forests roc-auc:",roc_auc_score(Y_test,rfc.predict(X_test)))


In [271]:
# OHE
run_logistic(X_train_OHE, X_test_OHE, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7772197669746886
Test set
Random Forests roc-auc: 0.7729591836734694


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [272]:
# counts
run_logistic(X_train_count, X_test_count, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7070443283781974
Test set
Random Forests roc-auc: 0.7362882653061225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [273]:
# ordered labels
run_logistic(X_train_ordered, X_test_ordered, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7997187625552432
Test set
Random Forests roc-auc: 0.7104591836734693


In [274]:
# mean encoding
run_logistic(X_train_mean, X_test_mean, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7234498459890183
Test set
Random Forests roc-auc: 0.6683673469387755


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [275]:
# ratio
run_logistic(X_train_ratio, X_test_ratio, Y_train, Y_test)

Train set
Random Forests roc-auc: 0.7756796571581626
Test set
Random Forests roc-auc: 0.7471301020408163


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
