In [1]:
import pandas as pd

# Load the raw table, then give missing quadrant entries an explicit
# 'unknown' label so they are treated as a category of their own.
df = pd.read_csv('data/breast-cancer.csv')
df['breast-quad'] = df['breast-quad'].fillna(value='unknown')

In [2]:
# Preview the categorical column that the rest of the notebook encodes.
df.loc[:, 'breast-quad']

0       left_up
1       central
2      left_low
3      left_low
4      right_up
         ...   
281    left_low
282    left_low
283    right_up
284    left_low
285    right_up
Name: breast-quad, Length: 286, dtype: object

### Another Idea

1. Generate as many columns/attributes/features as there are distinct values in the encoded column/attribute/feature
2. For each row, set only the one relevant column/attribute/feature to 1 and all the others to 0 in the encoded domain

![title](images/ohe2.png)

In [15]:
# One indicator column per distinct label.  dtype=int pins the 0/1 integer
# representation: since pandas 2.0 the default dummy dtype is bool, which
# would display True/False instead of the 1/0 described above.
pd.get_dummies(df[['breast-quad']], dtype=int)

Unnamed: 0,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,breast-quad_unknown
0,0,0,1,0,0,0
1,1,0,0,0,0,0
2,0,1,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,1,0
...,...,...,...,...,...,...
281,0,1,0,0,0,0
282,0,1,0,0,0,0
283,0,0,0,0,1,0
284,0,1,0,0,0,0


### Why ?
#### How similar/dissimilar are the values in `breast-quad` with respect to each other?
* Note that the answer to this question depends mainly on the encoding you use.

### Similarity/dissimilarity of the values in `breast-quad` with respect to each other under `LabelEncoder`

In [4]:
# Collect each distinct quadrant label once; these are the points whose
# pairwise distances are compared under each encoding below.
unq_values = df.loc[:, 'breast-quad'].unique()
unq_values

array(['left_up', 'central', 'left_low', 'right_up', 'right_low',
       'unknown'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Encoder that will map each distinct label to a single integer code.
le = LabelEncoder()

In [7]:
# Learn the label -> integer mapping from the full column, then encode each
# distinct label once as a column vector (one row per label, one feature).
le.fit(df['breast-quad'])
enc_le = le.transform(unq_values).reshape(-1, 1)

In [8]:
# Integer code assigned to each distinct label, in the same order as unq_values.
enc_le

array([[2],
       [0],
       [1],
       [4],
       [3],
       [5]])

In [16]:
from  sklearn.metrics import pairwise_distances

In [17]:
# Euclidean distance between every pair of label codes.  enc_le is already a
# (n_labels, 1) column vector, so it is passed as-is.  With one integer
# feature the distance is just the absolute difference of the codes, so the
# result reflects the arbitrary ordering of the codes.
pairwise_distances(enc_le, metric='euclidean')

array([[0., 2., 1., 2., 1., 3.],
       [2., 0., 1., 4., 3., 5.],
       [1., 1., 0., 3., 2., 4.],
       [2., 4., 3., 0., 1., 1.],
       [1., 3., 2., 1., 0., 2.],
       [3., 5., 4., 1., 2., 0.]])

### Similarity/dissimilarity of the values in `breast-quad` with respect to each other under our new encoding scheme (`OneHotEncoder`)

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
# Encoder that will expand the column into one indicator feature per label.
ohe = OneHotEncoder()

In [20]:
# Fit on the full column, then encode each distinct label once.  The labels
# are wrapped in a DataFrame carrying the same column name used at fit time;
# passing a bare ndarray to an encoder fitted on a DataFrame makes
# scikit-learn warn that X does not have valid feature names.
# .toarray() densifies the encoder's sparse output for pairwise_distances.
enc_ohe = (
    ohe.fit(df[['breast-quad']])
    .transform(pd.DataFrame({'breast-quad': unq_values}))
    .toarray()
)

In [21]:
# Euclidean distance between every pair of one-hot rows.  Any two different
# labels differ in exactly two positions, so every off-diagonal distance is
# sqrt(2): no label is closer to one label than to another.
pairwise_distances(enc_ohe, metric='euclidean')

array([[0.        , 1.41421356, 1.41421356, 1.41421356, 1.41421356,
        1.41421356],
       [1.41421356, 0.        , 1.41421356, 1.41421356, 1.41421356,
        1.41421356],
       [1.41421356, 1.41421356, 0.        , 1.41421356, 1.41421356,
        1.41421356],
       [1.41421356, 1.41421356, 1.41421356, 0.        , 1.41421356,
        1.41421356],
       [1.41421356, 1.41421356, 1.41421356, 1.41421356, 0.        ,
        1.41421356],
       [1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
        0.        ]])