## One Hot Encoding!
順序関係が成立しないデータ（名義特徴量）に対して、One-Hot エンコーディングを行う。


In [20]:
import pandas as pd

# Load data2.csv, using the first CSV column (index_col=0) as the row index.
df = pd.read_csv('./data2.csv', index_col=0)
# Bare expression so the notebook displays the loaded table.
df

Unnamed: 0,size,color,price,classlabel
0,XL,red,1000,class1
1,L,blue,6000,class2
2,M,green,6000,class1
3,L,yellow,3000,class1
4,S,red,4000,class2
5,L,green,5000,class3
6,M,blue,7000,class1
7,XL,red,10000,class2
8,M,yellow,7000,class1
9,M,red,6000,class3


### 名義特徴量： color
はじめに、LabelEncoder を使って color の各カテゴリを整数値にエンコーディングする。

In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode each colour name as an integer code.
color_column = df['color']

encoder = LabelEncoder()
color_encoder = encoder.fit(color_column)

# Integer codes for every row, in the same order as the rows of df.
values = color_encoder.transform(color_column)
values

array([2, 0, 1, 3, 2, 1, 0, 2, 3, 2])

In [22]:
## Convert the 1-D array into a 2-D array with a single column.
## The column count is fixed at 1; -1 lets NumPy work out the row count.
values.reshape((-1, 1))

array([[2],
       [0],
       [1],
       [3],
       [2],
       [1],
       [0],
       [2],
       [3],
       [2]])

In [23]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the integer colour codes.
# The codes are first reshaped into a single-column 2-D array.
codes_2d = values.reshape(-1, 1)

encoder = OneHotEncoder()
onehotvalues = encoder.fit_transform(codes_2d)

# Convert the encoder output to a dense array for display.
onehotvalues.toarray()

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.]])

In [24]:
# Wrap the one-hot matrix in a DataFrame so it can be merged back into df.
# - index=df.index: the later merge matches rows by index, so the one-hot rows
#   must carry df's own index rather than a fresh 0..n-1 range.
# - columns=color_encoder.classes_: the i-th one-hot column corresponds to
#   integer code i, and classes_ lists the colour names in code order, so the
#   labels come from the encoder instead of being re-derived by sorting.
color_df = pd.DataFrame(onehotvalues.toarray(),
                        index=df.index,
                        columns=color_encoder.classes_)

color_df

Unnamed: 0,blue,green,red,yellow
0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,0.0,0.0,1.0,0.0
8,0.0,0.0,0.0,1.0
9,0.0,0.0,1.0,0.0


In [25]:
# Append the one-hot colour columns to the original table, matching rows by
# their index on both sides (outer join keeps rows present on either side).
df = df.merge(color_df, how='outer', left_index=True, right_index=True)
df

Unnamed: 0,size,color,price,classlabel,blue,green,red,yellow
0,XL,red,1000,class1,0.0,0.0,1.0,0.0
1,L,blue,6000,class2,1.0,0.0,0.0,0.0
2,M,green,6000,class1,0.0,1.0,0.0,0.0
3,L,yellow,3000,class1,0.0,0.0,0.0,1.0
4,S,red,4000,class2,0.0,0.0,1.0,0.0
5,L,green,5000,class3,0.0,1.0,0.0,0.0
6,M,blue,7000,class1,1.0,0.0,0.0,0.0
7,XL,red,10000,class2,0.0,0.0,1.0,0.0
8,M,yellow,7000,class1,0.0,0.0,0.0,1.0
9,M,red,6000,class3,0.0,0.0,1.0,0.0
