In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw breast-cancer table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/breast-cancer.csv')

In [3]:
# Give the label column a generic name and turn every missing value into an
# explicit 'unknown' category, in a single chained expression.
df = df.rename(columns={'Class': 'target'}).fillna('unknown')

In [4]:
# Column groups used to select features for each encoding scheme.
# Two-valued columns; they are mapped to 0/1 further down.
bin_cols = ['breast','irradiat']
# Columns treated as nominal (unordered) categories.
nom_cols = ['breast-quad','menopause','node-caps']
# Columns treated as ordinal (ordered) categories.
ord_cols = ['age','tumor-size','inv-nodes','deg-malig']

In [5]:
# Map the two-valued columns and the label to 0/1.
# The result is assigned back rather than calling `replace(..., inplace=True)` on
# `df[col]`: that form mutates a temporary Series, which raises a FutureWarning on
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is the default.
binary_maps = {
    'breast': {'right': 1, 'left': 0},
    'irradiat': {'yes': 1, 'no': 0},
    'target': {'recurrence-events': 1, 'no-recurrence-events': 0},
}
for col, mapping in binary_maps.items():
    # infer_objects() gives an integer column when every value was mapped,
    # independent of whether replace() downcasts on this pandas version.
    df[col] = df[col].replace(mapping).infer_objects()

In [6]:
# Label vector (the 0/1 `target` column) passed to the encoders and the model below.
y = df['target']

# Frequency Encoding

It is a way to utilize the frequency of the categories as labels.

When a category's frequency is related to the target variable, this encoding lets the model assign weight in direct or inverse proportion to that frequency, depending on the nature of the data.

![title](images/freq.jpeg)

In [7]:
X_freq = df[nom_cols].copy()

X_freq.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,left_up,premeno,yes
1,central,ge40,no
2,left_low,ge40,no
3,left_low,premeno,yes
4,right_up,premeno,yes


In [8]:
for col in X_freq.columns:
    # Relative frequency of every category in this column (count / number of rows)
    freqEnc = X_freq[col].value_counts() / len(X_freq)
    # Swap each category for its relative frequency
    X_freq[col] = X_freq[col].map(freqEnc)

In [9]:
X_freq.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,0.339161,0.524476,0.195804
1,0.073427,0.451049,0.776224
2,0.384615,0.451049,0.776224
3,0.384615,0.524476,0.195804
4,0.115385,0.524476,0.195804


# Target (Mean) Encoding

Target Encoding is similar to Label Encoding, except here labels are correlated directly with the target.

For example, each category of a feature is replaced by the mean value of the target variable for that category, computed on the training data.

This encoding method brings out the relation between similar categories.

![title](images/target.png)

In [10]:
X_target_nom = df[nom_cols+['target']].copy()

X_target_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps,target
0,left_up,premeno,yes,1
1,central,ge40,no,0
2,left_low,ge40,no,1
3,left_low,premeno,yes,0
4,right_up,premeno,yes,1


In [11]:
# Replace every nominal category with the mean of the 0/1 target inside that category.
# NOTE: the means are computed on all rows, including each row's own label, so scores
# obtained from these features are optimistic; the K-fold variant below avoids this.
for col in nom_cols:
    # One groupby pass: mean() is the same quantity as sum / count
    targetEnc = X_target_nom.groupby(col)['target'].mean()
    # map() performs the category -> mean lookup directly and yields a float column,
    # instead of a dict-based replace() that relies on implicit dtype downcasting
    X_target_nom[col] = X_target_nom[col].map(targetEnc)

In [12]:
# Keep only the encoded features; the label column was needed solely to compute the means.
X_target_nom = X_target_nom.drop(columns=['target'])

In [13]:
X_target_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,0.268041,0.32,0.553571
1,0.190476,0.271318,0.22973
2,0.318182,0.271318,0.22973
3,0.318182,0.32,0.553571
4,0.393939,0.32,0.553571


In [14]:
X_target_ord = df[ord_cols+['target']].copy()

X_target_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig,target
0,40-49,15-19,0-2,3,1
1,50-59,15-19,0-2,1,0
2,50-59,35-39,0-2,2,1
3,40-49,35-39,0-2,3,0
4,40-49,30-34,3-5,2,1


In [15]:
# Store the numeric grade column as object dtype, like the other (string-valued) columns.
X_target_ord['deg-malig'] = X_target_ord['deg-malig'].astype('object')

In [16]:
# Replace every ordinal category with the mean of the 0/1 target inside that category.
# NOTE: the means are computed on all rows, including each row's own label, so scores
# obtained from these features are optimistic; the K-fold variant below avoids this.
for col in ord_cols:
    # One groupby pass: mean() is the same quantity as sum / count
    targetEnc = X_target_ord.groupby(col)['target'].mean()
    # map() performs the category -> mean lookup directly and yields a float column,
    # instead of a dict-based replace() that relies on implicit dtype downcasting
    X_target_ord[col] = X_target_ord[col].map(targetEnc)

In [17]:
# Keep only the encoded features; the label column was needed solely to compute the means.
X_target_ord = X_target_ord.drop(columns=['target'])

In [18]:
X_target_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,0.3,0.233333,0.215962,0.529412
1,0.260417,0.233333,0.215962,0.169014
2,0.260417,0.368421,0.215962,0.215385
3,0.3,0.368421,0.215962,0.529412
4,0.3,0.416667,0.472222,0.215385


# K-Fold Target Encoding

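K-Fold Target Encoding can be applied to reduce overfitting: each row is encoded with category means computed on the other folds only, so a row's own target value never contributes to its encoding.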

![title](images/kfold.png)

In [19]:
from sklearn.model_selection import KFold

In [20]:
X_kfold_nom = df[nom_cols+['target']].copy()
X_kfold_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps,target
0,left_up,premeno,yes,1
1,central,ge40,no,0
2,left_low,ge40,no,1
3,left_low,premeno,yes,0
4,right_up,premeno,yes,1


In [21]:
# Out-of-fold target encoding of the nominal columns: every row receives the category
# mean computed on the four folds it does not belong to.
kf = KFold(n_splits = 5, shuffle = False)

# Materialise the splits once so each column can reuse the same folds.
folds = list(kf.split(X_kfold_nom))

for col in nom_cols:
    # Filled by position, so the result does not depend on the frame's index labels
    # (KFold.split yields positions, not labels).
    encoded = np.full(len(X_kfold_nom), np.nan)
    for train_ind, val_ind in folds:
        train_fold = X_kfold_nom.iloc[train_ind]
        fold_means = train_fold.groupby(col)['target'].mean()
        # Fallback for a category that never occurs in the training folds:
        # use the overall target mean of those folds instead of leaving a raw label.
        prior = train_fold['target'].mean()
        encoded[val_ind] = (
            X_kfold_nom[col].iloc[val_ind].map(fold_means).fillna(prior).to_numpy()
        )
    X_kfold_nom[f'tenc_{col}'] = encoded

In [22]:
# Discard the raw categories and the label, leaving only the `tenc_*` columns.
X_kfold_nom = X_kfold_nom.drop(columns=nom_cols + ['target'])

In [23]:
X_kfold_nom.head()

Unnamed: 0,tenc_breast-quad,tenc_menopause,tenc_node-caps
0,0.337838,0.327869,0.619048
1,0.2,0.306931,0.247253
2,0.3,0.306931,0.247253
3,0.3,0.327869,0.619048
4,0.37037,0.327869,0.619048


In [24]:
X_kfold_ord = df[ord_cols+['target']].copy()

X_kfold_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig,target
0,40-49,15-19,0-2,3,1
1,50-59,15-19,0-2,1,0
2,50-59,35-39,0-2,2,1
3,40-49,35-39,0-2,3,0
4,40-49,30-34,3-5,2,1


In [25]:
# Store the numeric grade column as object dtype, like the other (string-valued) columns.
X_kfold_ord['deg-malig'] = X_kfold_ord['deg-malig'].astype('object')

In [26]:
# Out-of-fold target encoding of the ordinal columns: every row receives the category
# mean computed on the four folds it does not belong to.
kf = KFold(n_splits = 5, shuffle = False)

# Materialise the splits once so each column can reuse the same folds.
folds = list(kf.split(X_kfold_ord))

for col in ord_cols:
    # Filled by position, so the result does not depend on the frame's index labels
    # (KFold.split yields positions, not labels).
    encoded = np.full(len(X_kfold_ord), np.nan)
    for train_ind, val_ind in folds:
        train_fold = X_kfold_ord.iloc[train_ind]
        fold_means = train_fold.groupby(col)['target'].mean()
        # Fallback for a category that never occurs in the training folds:
        # use the overall target mean of those folds instead of leaving a raw label.
        prior = train_fold['target'].mean()
        encoded[val_ind] = (
            X_kfold_ord[col].iloc[val_ind].map(fold_means).fillna(prior).to_numpy()
        )
    X_kfold_ord[f'tenc_{col}'] = encoded

In [27]:
X_kfold_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig,target,tenc_age,tenc_tumor-size,tenc_inv-nodes,tenc_deg-malig
0,40-49,15-19,0-2,3,1,0.305556,0.15,0.222222,0.603175
1,50-59,15-19,0-2,1,0,0.291667,0.15,0.222222,0.157895
2,50-59,35-39,0-2,2,1,0.291667,0.384615,0.222222,0.231481
3,40-49,35-39,0-2,3,0,0.305556,0.384615,0.222222,0.603175
4,40-49,30-34,3-5,2,1,0.305556,0.396226,0.535714,0.231481


In [28]:
# Discard the raw categories and the label, leaving only the `tenc_*` columns.
X_kfold_ord = X_kfold_ord.drop(columns=ord_cols + ['target'])

In [29]:
X_kfold_ord.head()

Unnamed: 0,tenc_age,tenc_tumor-size,tenc_inv-nodes,tenc_deg-malig
0,0.305556,0.15,0.222222,0.603175
1,0.291667,0.15,0.222222,0.157895
2,0.291667,0.384615,0.222222,0.231481
3,0.305556,0.384615,0.222222,0.603175
4,0.305556,0.396226,0.535714,0.231481


# Weight of Evidence Encoding

Weight of Evidence (WoE) is a measure of the “strength” of a grouping technique to separate good and bad.

This method was developed primarily to build a predictive model to evaluate the risk of loan default in the credit and financial industry. 

Weight of evidence (WOE) is a measure of how much the evidence supports or undermines a hypothesis.

![title](images/woe3.jpg)

![title](images/woe.jpg)

WoE is well suited for Logistic Regression because the Logit transformation is simply the log of the odds, i.e., ln(P(Goods)/P(Bads)).

In [30]:
from mlencoders.weight_of_evidence_encoder import WeightOfEvidenceEncoder

In [31]:
X_woe_nom = df[nom_cols].copy()

X_woe_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,left_up,premeno,yes
1,central,ge40,no
2,left_low,ge40,no
3,left_low,premeno,yes
4,right_up,premeno,yes


In [32]:
# Fit a fresh encoder on the nominal columns against y and keep the transformed frame.
# NOTE(review): the encoder is fit on every row's label, i.e. on the same data that is
# cross-validated later — confirm this is intended.
weightEnc = WeightOfEvidenceEncoder()
X_woe_nom = weightEnc.fit_transform(X_woe_nom,y)

In [33]:
X_woe_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,-0.14393,0.106882,1.075765
1,-0.586265,-0.127293,-0.349184
2,0.098514,-0.127293,-0.349184
3,0.098514,0.106882,1.075765
4,0.429871,0.106882,1.075765


In [34]:
X_woe_ord = df[ord_cols].copy()

X_woe_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,40-49,15-19,0-2,3
1,50-59,15-19,0-2,1
2,50-59,35-39,0-2,2
3,40-49,35-39,0-2,3
4,40-49,30-34,3-5,2


In [35]:
# Store the numeric grade column as object dtype, like the other (string-valued) columns.
# Presumably the encoder only handles object columns — TODO confirm.
X_woe_ord['deg-malig'] = X_woe_ord['deg-malig'].astype('object')

In [36]:
# Fit a fresh encoder on the ordinal columns against y and keep the transformed frame.
# NOTE(review): the encoder is fit on every row's label, i.e. on the same data that is
# cross-validated later — confirm this is intended.
weightEnc = WeightOfEvidenceEncoder()
X_woe_ord = weightEnc.fit_transform(X_woe_ord,y)

In [37]:
X_woe_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,0.013356,-0.32893,-0.428699,0.978437
1,-0.18315,-0.32893,-0.428699,-0.731977
2,-0.18315,0.321657,-0.428699,-0.432115
3,0.013356,0.321657,-0.428699,0.978437
4,0.013356,0.524181,0.749428,-0.432115


In [38]:
# Load feature tables from CSV files in the working directory; nothing in this
# notebook writes them, so they must already exist (presumably exported by an
# earlier encoding notebook — TODO confirm).
# NOTE(review): if these files were saved with their index, read_csv will load an
# extra `Unnamed: 0` column that would be fed to the model as a feature — verify.
X_label = pd.read_csv('X_label.csv')
X_ord = pd.read_csv('X_ord.csv')
X_ohe_nom = pd.read_csv('X_ohe_nom.csv')
X_ohe_ord = pd.read_csv('X_ohe_ord.csv')
X_binary = pd.read_csv('X_binary.csv')
X_base3 = pd.read_csv('X_base3.csv')
X_thermo = pd.read_csv('X_thermo.csv')

# Model

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

def logistic(X,y):
    """Print the mean 3-fold cross-validated ROC AUC of a logistic regression on (X, y).

    Parameters
    ----------
    X : feature table or array accepted by scikit-learn, one row per sample.
    y : binary labels aligned with the rows of X.

    Returns
    -------
    None. The score is printed as ``AUC Score:  <value>`` with six decimals.
    """
    # No n_jobs: it only parallelises over classes, so it does nothing for a binary target.
    model = LogisticRegression(C = 0.12345678987654321, solver = "lbfgs", max_iter = 5000, tol = 1e-2)
    # No explicit model.fit(): cross_validate fits its own clone of the model on each
    # training split, so a fit on the full data would be discarded.
    score = cross_validate(model, X, y, cv=3, scoring="roc_auc")["test_score"].mean()
    print('AUC Score: ',f"{score:.6f}")

# Nominal Encoding Scores

In [40]:
logistic(X_label,y)

AUC Score:  0.614875


In [41]:
logistic(X_ohe_nom,y)

AUC Score:  0.621419


In [42]:
logistic(X_freq,y)

AUC Score:  0.643942


In [43]:
logistic(X_target_nom,y)

AUC Score:  0.662194


In [44]:
logistic(X_woe_nom,y)

AUC Score:  0.655975


# Ordinal Encoding Scores

In [45]:
logistic(X_ord,y)

AUC Score:  0.699422


In [46]:
logistic(X_ohe_ord,y)

AUC Score:  0.703704


In [47]:
logistic(X_binary,y)

AUC Score:  0.687848


In [48]:
logistic(X_base3,y)

AUC Score:  0.675257


In [49]:
logistic(X_thermo,y)

AUC Score:  0.717343


In [50]:
logistic(X_target_ord,y)

AUC Score:  0.744302


In [51]:
logistic(X_woe_ord,y)

AUC Score:  0.741208
