# Introduction
---

__Overview__

Assessing performance of various representations of categorical variables.


__Data__

Using the _Banknote Dataset_ referenced on this page listing [standard
machine learning
datasets](https://machinelearningmastery.com/standard-machine-learning-datasets/).

__License__

MIT. License does not necessarily cover the dataset; please refer to [ics](https://archive.ics.uci.edu/ml/datasets/banknote+authentication) for more details.


# Dependencies
---

In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

from sklearn.linear_model import LogisticRegression

plt.rcParams["figure.figsize"] = 10, 6

# Data Ingestion
---

First, data is read in and transformed. The purpose of this exercise is to analyze the mapping of categorical features to a binary response variable. In turn, continuous variables will be dichotomized. Regarding potential performance differences between the original continuous representations and the binned versions of these features, there is no relevance; the scope of this analysis prescribes interpreting the output of this section as the singular presentation of the data.

In [91]:
data = pd.read_csv('data_banknote_authentication.txt', header=None)
data.columns = ['x1', 'x2', 'x3', 'x4', 'inauthentic']

print(data.dtypes)
print('\n\n')

data.head()

x1             float64
x2             float64
x3             float64
x4             float64
inauthentic      int64
dtype: object





Unnamed: 0,x1,x2,x3,x4,inauthentic
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [94]:
def bin_variable(df, colname, min_pct=0.05, max_k=10):
    levels = string.ascii_uppercase
    assert max_k <= len(levels), f"max_k must be <= {len(levels)}"
    
    df = df.copy()
    
    for k in range(max_k, 1, -1):
        df[f"{colname}_Bin"] = pd.cut(df[colname], k)
        
        if df[f"{colname}_Bin"].value_counts().min()/df.shape[0] >= min_pct:
            break
    
    unique = df[f"{colname}_Bin"].unique()
    m = {}
    for u in unique:
        m[u] = levels[len(m)]
    
    return df[f"{colname}_Bin"].map(m).values

for x in range(1, 5):
    data[f"c{x}"] = bin_variable(data, f"x{x}", 0.05, 10)

In [172]:
data = data[['c1', 'c2', 'c3', 'c4', 'inauthentic']].sample(frac=1.00)
data.head()

Unnamed: 0,c1,c2,c3,c4,inauthentic
664,A,C,A,A,0
1126,C,D,B,A,1
180,B,C,A,A,0
1066,C,C,A,A,1
711,B,B,B,B,0


In [119]:
PCT_TRAINING = 2/3
tr_ind = range(0, int(PCT_TRAINING*data.shape[0]))
ts_ind = range(len(tr_ind), data.shape[0])

assert(len(tr_ind)+len(ts_ind)==data.shape[0])

tr = data.iloc[tr_ind].copy()
ts = data.iloc[ts_ind].copy()

# Training
---

All methods will be modeled in an identical fashion.

In [336]:
def fit_model(x, y):
    fit = LogisticRegression(penalty='none').fit(x, y)
    return fit

def predict(fit, x):
    return 1 / (1 + np.exp(-(fit.intercept_ + np.dot(fit.coef_, np.transpose(x)))))

def crossentropy(y, yh, e=1e-16):
    yh = np.minimum(1-e, np.maximum(yh, e))
    return -np.mean( y*np.log(yh) + (1-y)*np.log(1-yh) ) 

In [193]:
xcols = ['c1', 'c2', 'c3', 'c4']
ycol = 'inauthentic'
y_tr = tr[[ycol]].values.reshape(-1)
y_ts = ts[[ycol]].values.reshape(-1)

## One-Hot Encoding

In [338]:
def onehot(x):
    return pd.get_dummies(x).values

tr_onehot = onehot(tr[xcols])
ts_onehot = onehot(ts[xcols])
fit_onehot = fit_model(tr_onehot, y_tr)

yh_tr_onehot = predict(fit_onehot, tr_onehot)
yh_ts_onehot = predict(fit_onehot, ts_onehot)

In [339]:
tr_onehot.sum()

3656

In [340]:
fit_onehot.coef_

array([[ 54.29372465,  -3.15060474,  -7.76126349, -38.31721611,
         41.49912652,   3.49386379,   3.41657603, -43.34492604,
          4.6127717 ,   0.45186861,  15.70838172,  -4.30288617,
         -6.34085525]])

In [341]:
yh_tr_onehot.sum()

401.00121467572563

## Mean Encoding

In [342]:
def mean_encode(tr, xc, yc):
    lookups = {}
    for x in xc:
        mucol = f"{x}_Mean"
        tab = tr.groupby(x).aggregate({yc: ['sum', 'count']}).reset_index()
        lookup = pd.DataFrame({x: tab.iloc[:, 0], mucol: tab[yc]['sum']/tab[yc]['count']})
        lookups[x] = lookup
    return lookups

def apply_mean_encode(df, xc, lookups):
    mu_cols = []
    for x in xc:
        mucol = f"{x}_Mean"
        mu_cols.append(mucol)
        lookup = lookups[x]
        df = df.merge(lookup, on=x, how='inner')
    return df[mu_cols].values

mean_lookups = mean_encode(tr, xcols, ycol)
tr_mean = apply_mean_encode(tr, xcols, mean_lookups)
ts_mean = apply_mean_encode(ts, xcols, mean_lookups)
fit_mean = fit_model(tr_mean, y_tr)

yh_tr_mean = predict(fit_mean, tr_mean)
yh_ts_mean = predict(fit_mean, ts_mean)

In [343]:
tr_mean.sum()

1604.0

In [344]:
mean_lookups

{'c1':   c1   c1_Mean
 0  D  1.000000
 1  C  0.795527
 2  B  0.233533
 3  A  0.000000,
 'c2':   c2   c2_Mean
 0  D  1.000000
 1  B  0.504950
 2  C  0.532134
 3  A  0.083333,
 'c3':   c3   c3_Mean
 0  A  0.394670
 1  B  0.714286,
 'c4':   c4   c4_Mean
 0  C  0.440678
 1  B  0.463415
 2  A  0.426056}

## Log of Odds Ratio

In [345]:
def logodds_encode(x_mean, e=1e-16):
    x_mean = np.minimum(1-e, np.maximum(x_mean, e))
    return np.log(x_mean / (1 - x_mean))

tr_logodds = logodds_encode(tr_mean)
ts_logodds = logodds_encode(ts_mean)
fit_logodds = fit_model(tr_logodds, y_tr)

yh_tr_logodds = predict(fit_logodds, tr_logodds)
yh_ts_logodds = predict(fit_logodds, ts_logodds)

In [346]:
tr_logodds.sum()

-2752.439399568661

## Univariate

In [358]:
def univariate_encode(tr, xc, yc):
    models = {}
    for x in xc:
        models[x] = fit_model(onehot(tr[[x]]), tr[yc].values.reshape(-1))
    return models

def apply_univariate_encode(df, xc, models):
    uni_cols = []
    for x in xc:
        uni_col = f"{x}_Univariate"
        uni_cols.append(uni_col)
        df[uni_col] = predict(models[x], onehot(df[[x]])).reshape(-1)
    return df[uni_cols].values

univariate_lookups = univariate_encode(tr, xcols, ycol)
tr_uni = apply_univariate_encode(tr, xcols, univariate_lookups)
ts_uni = apply_univariate_encode(ts, xcols, univariate_lookups)
fit_uni = fit_model(tr_uni, y_tr)

yh_tr_uni = predict(fit_uni, tr_uni)
yh_ts_uni = predict(fit_uni, ts_uni)

In [359]:
tr_uni.sum()

1603.9966138167613

In [360]:
for c, m in univariate_lookups.items():
    print(c)
    print(m.coef_)

c1
[[ 18.75957151   1.12637508  -1.42080583 -18.23289686]]
c2
[[14.98375397 -3.16382459 -3.05484277 -5.5815053 ]]
c3
[[-0.59057923  0.75343491]]
c4
[[-0.06766709  0.02413984 -0.12721621]]


In [361]:
fit_uni.coef_

array([[ 12.90815244,  20.66029496, -19.69398126,  49.23250191]])

# Evaluate Results
---

In [362]:
print(f"Onehot train: {crossentropy(y_tr, yh_tr_onehot)}")
print(f"Onehot test: {crossentropy(y_ts, yh_ts_onehot)}")

Onehot train: 0.16473058244222402
Onehot test: 0.1897501815937788


In [363]:
print(f"Mean train: {crossentropy(y_tr, yh_tr_mean)}")
print(f"Mean test: {crossentropy(y_ts, yh_ts_mean)}")

Mean train: 0.6842245080437815
Mean test: 0.6913162371336383


In [364]:
print(f"Odds train: {crossentropy(y_tr, yh_tr_logodds)}")
print(f"Odds test: {crossentropy(y_ts, yh_ts_logodds)}")

Odds train: 0.6840893769764325
Odds test: 0.6927877086293894


In [365]:
print(f"Uni train: {crossentropy(y_tr, yh_tr_uni)}")
print(f"Uni test: {crossentropy(y_ts, yh_ts_uni)}")

Uni train: 0.20134518458733788
Uni test: 0.20013838104776552
