In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data/breast-cancer.csv')

In [3]:
# Standardise the label column name to 'target' and mark every missing cell
# with the literal string 'unknown'.
df = df.rename(columns={'Class': 'target'}).fillna('unknown')

In [4]:
# Feature columns grouped by the kind of variable they hold.

# Two-valued columns (mapped to 0/1 further down).
bin_cols = [
    'breast',
    'irradiat',
]

# Unordered categorical columns.
nom_cols = [
    'breast-quad',
    'menopause',
    'node-caps',
]

# Categorical columns whose levels have a natural order.
ord_cols = [
    'age',
    'tumor-size',
    'inv-nodes',
    'deg-malig',
]

In [5]:
# Map the two-valued columns and the label to 0/1.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form is chained assignment through
# an inplace method, which mutates a temporary Series and leaves `df` unchanged
# under pandas Copy-on-Write (the default from pandas 3.0; pandas 2.2 already
# warns about it).  Values that are not in a mapping are left as they are.
df['breast'] = df['breast'].replace({'right': 1, 'left': 0})
df['irradiat'] = df['irradiat'].replace({'yes': 1, 'no': 0})
df['target'] = df['target'].replace({'recurrence-events': 1, 'no-recurrence-events': 0})

In [6]:
y = df['target']

# One Hot Encoding

In this method, each category is mapped to a vector of 0s and 1s, where a 1 marks the presence of that category and a 0 marks its absence.

This method produces one column per category, which can slow down learning significantly when a feature has a large number of categories.

![title](images/ohe2.png)

In [7]:
X_ohe_nom = df[nom_cols].copy()

X_ohe_nom.head()

Unnamed: 0,breast-quad,menopause,node-caps
0,left_up,premeno,yes
1,central,ge40,no
2,left_low,ge40,no
3,left_low,premeno,yes
4,right_up,premeno,yes


In [8]:
# One indicator column per (feature, category) pair.  The dtype is pinned
# because pandas >= 2.0 makes get_dummies return bool columns by default,
# which would be displayed and written to CSV as True/False rather than 0/1;
# 'uint8' is what older pandas versions produced by default.
X_ohe_nom = pd.get_dummies(X_ohe_nom, dtype='uint8')

In [9]:
X_ohe_nom.shape

(286, 12)

In [10]:
X_ohe_nom.head()

Unnamed: 0,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,breast-quad_unknown,menopause_ge40,menopause_lt40,menopause_premeno,node-caps_no,node-caps_unknown,node-caps_yes
0,0,0,1,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,1,0,0,1,0,0
2,0,1,0,0,0,0,1,0,0,1,0,0
3,0,1,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,1,0,0,0,1,0,0,1


In [11]:
X_ohe_ord = df[ord_cols].copy()

X_ohe_ord.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,40-49,15-19,0-2,3
1,50-59,15-19,0-2,1
2,50-59,35-39,0-2,2
3,40-49,35-39,0-2,3
4,40-49,30-34,3-5,2


In [12]:
df[ord_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   age         286 non-null    object
 1   tumor-size  286 non-null    object
 2   inv-nodes   286 non-null    object
 3   deg-malig   286 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 9.1+ KB


In [13]:
# 'deg-malig' is the only int64 column among the ordinal features; store it as
# object like the others before the frame is passed to get_dummies.
X_ohe_ord['deg-malig'] = X_ohe_ord['deg-malig'].astype('object')

In [14]:
# One indicator column per (feature, category) pair.  The dtype is pinned
# because pandas >= 2.0 makes get_dummies return bool columns by default,
# which would be displayed and written to CSV as True/False rather than 0/1;
# 'uint8' is what older pandas versions produced by default.
X_ohe_ord = pd.get_dummies(X_ohe_ord, dtype='uint8')

In [15]:
X_ohe_ord.shape

(286, 27)

In [16]:
X_ohe_ord.head()

Unnamed: 0,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,tumor-size_0-4,tumor-size_10-14,tumor-size_15-19,tumor-size_20-24,...,inv-nodes_0-2,inv-nodes_12-14,inv-nodes_15-17,inv-nodes_24-26,inv-nodes_3-5,inv-nodes_6-8,inv-nodes_9-11,deg-malig_1,deg-malig_2,deg-malig_3
0,0,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


# Binary Encoding

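Binary encoding first assigns each category an integer code and then writes that code in binary digits. Each binary digit becomes one feature column.

Compared to One Hot Encoding, this requires fewer feature columns: the number of columns grows with the logarithm of the number of categories rather than linearly.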

![title](images/binary.png)

In [17]:
import category_encoders as ce

In [18]:
X_binary = df[ord_cols].copy()

X_binary.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,40-49,15-19,0-2,3
1,50-59,15-19,0-2,1
2,50-59,35-39,0-2,2
3,40-49,35-39,0-2,3
4,40-49,30-34,3-5,2


In [19]:
# 'deg-malig' is the only int64 column among the ordinal features; store it as
# object like the others before the frame is handed to the encoder.
X_binary['deg-malig'] = X_binary['deg-malig'].astype('object')

In [20]:
# Fit a category_encoders BinaryEncoder on X_binary and rebind X_binary to the
# encoded frame.  No `cols` argument is passed, so which columns get encoded is
# left to the encoder's defaults -- presumably the object-dtype columns, which
# would explain the earlier cast of 'deg-malig' to object (TODO: confirm
# against the installed category_encoders version).
binaryEnc = ce.BinaryEncoder()
X_binary = binaryEnc.fit_transform(X_binary)

In [21]:
X_binary.shape

(286, 16)

In [22]:
X_binary.head()

Unnamed: 0,age_0,age_1,age_2,age_3,tumor-size_0,tumor-size_1,tumor-size_2,tumor-size_3,tumor-size_4,inv-nodes_0,inv-nodes_1,inv-nodes_2,inv-nodes_3,deg-malig_0,deg-malig_1,deg-malig_2
0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1
1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0
2,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1
3,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1
4,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,1


In [23]:
df['age'].nunique()

6

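# BaseN Encoding

BaseN encoding generalizes binary encoding: each category's integer code is written in base N, and each digit becomes one feature column. With `base=3`, as used below, every digit takes the value 0, 1, or 2.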

In [24]:
X_base3 = df[ord_cols].copy()

X_base3.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,40-49,15-19,0-2,3
1,50-59,15-19,0-2,1
2,50-59,35-39,0-2,2
3,40-49,35-39,0-2,3
4,40-49,30-34,3-5,2


In [25]:
# 'deg-malig' is the only int64 column among the ordinal features; store it as
# object like the others before the frame is handed to the encoder.
X_base3['deg-malig'] = X_base3['deg-malig'].astype('object')

In [26]:
# Fit a category_encoders BaseNEncoder with base 3 on X_base3 and rebind
# X_base3 to the encoded frame.  No `cols` argument is passed, so which columns
# get encoded is left to the encoder's defaults -- presumably the object-dtype
# columns, which would explain the earlier cast of 'deg-malig' to object
# (TODO: confirm against the installed category_encoders version).
base3Enc = ce.BaseNEncoder(base = 3)
X_base3 = base3Enc.fit_transform(X_base3)

In [27]:
X_base3.shape

(286, 12)

In [28]:
X_base3.head()

Unnamed: 0,age_0,age_1,age_2,tumor-size_0,tumor-size_1,tumor-size_2,tumor-size_3,inv-nodes_0,inv-nodes_1,inv-nodes_2,deg-malig_0,deg-malig_1
0,0,0,1,0,0,0,1,0,0,1,0,1
1,0,0,2,0,0,0,1,0,0,1,0,2
2,0,0,2,0,0,0,2,0,0,1,1,0
3,0,0,1,0,0,0,2,0,0,1,0,1
4,0,0,1,0,0,1,0,0,0,2,1,0


In [29]:
df['tumor-size'].nunique()

11

# Thermometer (Unary) Encoding

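Thermometer (unary) encoding resembles one-hot encoding, but it preserves the order of an ordinal variable: instead of switching on a single indicator, a category switches on one indicator for every level below it, so higher categories produce longer runs of 1s.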

![title](images/thermo3.png)

In [30]:
X_thermo = df[ord_cols].copy()

X_thermo.head()

Unnamed: 0,age,tumor-size,inv-nodes,deg-malig
0,40-49,15-19,0-2,3
1,50-59,15-19,0-2,1
2,50-59,35-39,0-2,2
3,40-49,35-39,0-2,3
4,40-49,30-34,3-5,2


In [31]:
from sklearn.base import TransformerMixin
from itertools import repeat
import scipy


class ThermometerEncoder(TransformerMixin):
    """
    Thermometer (unary) encoder for a single ordinal pandas Series.

    The distinct values seen in ``fit`` are ranked from lowest to highest
    using ``sort_key``.  A value of rank ``r`` (0-based) is encoded as a row
    whose first ``r`` columns are 1 and whose remaining columns are 0.  The
    output has one column per distinct value, so the lowest-ranked value is
    an all-zero row and the last column is always 0.

    Assumes all values are known at fit: ``transform`` raises ``ValueError``
    when it meets a value that was not present in the data passed to ``fit``.

    Parameters
    ----------
    sort_key : callable, optional
        Passed as ``key`` to ``sorted`` to order the distinct values.
        ``None`` uses the values' natural ordering.

    Attributes
    ----------
    value_map_ : dict or None
        Mapping from each distinct value to its 0-based rank; ``None`` until
        ``fit`` has been called.
    """
    def __init__(self, sort_key=None):
        self.sort_key = sort_key
        self.value_map_ = None

    def fit(self, X, y=None):
        """
        Learn the value -> rank mapping from the distinct values of ``X``.

        Parameters
        ----------
        X : pandas.Series
            Column to encode.
        y : ignored

        Returns
        -------
        self
        """
        ordered_values = sorted(X.unique(), key=self.sort_key)
        self.value_map_ = {val: rank for rank, val in enumerate(ordered_values)}
        return self

    def transform(self, X, y=None):
        """
        Encode ``X`` as a sparse thermometer matrix.

        Parameters
        ----------
        X : pandas.Series
            Column to encode; every value must have been seen during ``fit``.
        y : ignored

        Returns
        -------
        scipy.sparse.coo_matrix
            int8 matrix of shape ``(len(X), number of distinct fitted values)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        ValueError
            If ``X`` contains values that were not seen during ``fit``.
        """
        if self.value_map_ is None:
            # Imported locally so the class does not depend on an extra
            # module-level import.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This ThermometerEncoder instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        ranks = X.map(self.value_map_)

        # Values missing from the mapping come back as NaN.  Every comparison
        # against NaN is False, so without this check they would silently be
        # encoded exactly like the lowest level (an all-zero row).
        unseen = ranks.isna()
        if unseen.any():
            raise ValueError(
                "Found values not seen during fit: %r" % (X[unseen].unique().tolist(),)
            )

        n_levels = len(self.value_map_)
        rank_codes = ranks.to_numpy(dtype="int64")

        # Column j is 1 wherever the row's rank is strictly greater than j.
        # No rank exceeds n_levels - 1, hence the all-zero last column.
        thermometer = rank_codes[:, None] > np.arange(n_levels)

        return scipy.sparse.coo_matrix(thermometer.astype("int8"))

In [32]:
# Explicit low-to-high order of each binned feature's labels.  The labels are
# strings, so a plain lexical sort would misplace them (for example '10-14'
# sorts before '5-9').
ordinal_levels = {
    'age': ['20-29', '30-39', '40-49', '50-59', '60-69', '70-79'],
    'tumor-size': ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54'],
    'inv-nodes': ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26'],
}

thermos = []

for col in ord_cols:

    if col in ordinal_levels:
        # Rank a label by its position in the hand-written order; list.index
        # raises ValueError for a label that is not listed.
        sort_key = ordinal_levels[col].index

    elif col == 'deg-malig':
        sort_key = int

    else:
        raise ValueError(col)

    thermoEnc = ThermometerEncoder(sort_key = sort_key)
    # Encode straight from `df` rather than from `X_thermo`: `X_thermo` is
    # rebound to the encoded frame at the end of this cell, so reading the
    # named columns from it would fail with a KeyError on a second run.
    thermos.append(thermoEnc.fit_transform(df[col]))

# Stack the per-feature blocks side by side (in ord_cols order) and densify.
thermo_ohc = scipy.sparse.hstack(thermos).toarray()
X_thermo = pd.DataFrame(thermo_ohc)

In [33]:
X_thermo.shape

(286, 27)

In [34]:
X_thermo.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,1,0
1,1,1,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,1,1,0
4,1,1,0,0,0,0,1,1,1,1,...,1,0,0,0,0,0,0,1,0,0


In [35]:
df['age'].nunique()

6

In [36]:
df['age'].head(10)

0    40-49
1    50-59
2    50-59
3    40-49
4    40-49
5    50-59
6    50-59
7    40-49
8    40-49
9    40-49
Name: age, dtype: object

In [37]:
X_thermo.iloc[:,:6].head(10)

Unnamed: 0,0,1,2,3,4,5
0,1,1,0,0,0,0
1,1,1,1,0,0,0
2,1,1,1,0,0,0
3,1,1,0,0,0,0
4,1,1,0,0,0,0
5,1,1,1,0,0,0
6,1,1,1,0,0,0
7,1,1,0,0,0,0
8,1,1,0,0,0,0
9,1,1,0,0,0,0


In [38]:
# Write every encoded feature matrix to a CSV file in the current working
# directory; the row index is left out of the files.
for csv_name, encoded_frame in (
    ('X_ohe_nom.csv', X_ohe_nom),
    ('X_ohe_ord.csv', X_ohe_ord),
    ('X_binary.csv', X_binary),
    ('X_base3.csv', X_base3),
    ('X_thermo.csv', X_thermo),
):
    encoded_frame.to_csv(csv_name, index=False)