### 이산형 데이터 처리 --> One-Hot Encoding
- 각 값의 유무로 column을 만든다. 
- 예를 들어, A,B,C가 있다고 한다면 A column, B column, C column으로 column추가

In [1]:
# Toy edge list used for the encoding examples: three integer columns
# (source, target, weight) and one string column (color).
edges = pd.DataFrame(
    dict(
        source=[0, 1, 2],
        target=[2, 2, 3],
        weight=[3, 4, 5],
        color=['red', 'blue', 'blue'],
    )
)

##### one-hot encoding : get_dummies

In [4]:
# One-hot encode the whole frame: only the string (object) column `color`
# is expanded into indicator columns; the numeric columns pass through as-is.
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [5]:
# One-hot encode a single column: the result has one indicator column per
# distinct value, named after the value itself (no column-name prefix).
pd.get_dummies(edges['color'])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [6]:
# Selecting with a list of column names (double brackets) shows the column
# as a one-column table.
edges[['color']]

Unnamed: 0,color
0,red
1,blue
2,blue


In [8]:
# Lookup table from the numeric weight to a size label.
weight_dict = dict(zip((3, 4, 5), ('M', 'L', 'XL')))
# Series.map replaces every weight with its entry in the lookup table;
# the result is stored as a new string column.
edges['weight_sign'] = edges.weight.map(weight_dict)
edges

Unnamed: 0,color,source,target,weight,weight_sign
0,red,0,2,3,M
1,blue,1,2,4,L
2,blue,2,3,5,XL


In [10]:
# One-hot encode the string columns (color, weight_sign); the numeric
# columns are carried over unchanged.
edges = pd.get_dummies(edges)
# Expose the frame as a plain NumPy array.  DataFrame.as_matrix() was
# deprecated in pandas 0.23 and removed in 1.0; `.values` is available in
# both old and current releases.
edges.values

array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]], dtype=int64)

## Data Binning : 데이터의 구간을 나눈다.

In [42]:
# Sample records for the binning examples: twelve rows, four per regiment,
# each with a company, a name, and a pre-/post-test score.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': [
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
# `columns` pins the column order of the resulting frame explicitly.
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [43]:
# Bin edges first: 0, 25, 50, 75, 100 -> four consecutive intervals.
# A score equal to an upper edge stays in the lower interval
# (25 is labelled 'Low' in the recorded output).
bins = list(range(0, 101, 25))

# One label per interval, ordered from the lowest interval to the highest.
group_names = ['Low', 'Okay', 'Good', 'Great']

# pd.cut assigns every postTestScore the label of the interval it falls in.
categories = pd.cut(df['postTestScore'], bins=bins, labels=group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

- cutting된 결과(구간 label)를 다시 one-hot encoding한다.

In [44]:
# Attach the bin labels to the frame as a new column.
df['categories'] = categories
# Count the rows per label.  The top-level pd.value_counts() is deprecated
# since pandas 2.1; the Series method is the supported equivalent and is
# available in older releases as well.
df['categories'].value_counts()

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [45]:
# One-hot encode every non-numeric column of df (regiment, company, name,
# categories); the two score columns are kept as they are.
pd.get_dummies(df)

Unnamed: 0,preTestScore,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scouts,company_1st,company_2nd,name_Ali,name_Cooze,name_Jacobson,...,name_Milner,name_Piger,name_Riani,name_Ryaner,name_Sloan,name_Sone,categories_Low,categories_Okay,categories_Good,categories_Great
0,4,25,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,24,94,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,31,57,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2,62,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,3,70,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5,4,25,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,24,94,1,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,31,57,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,2,62,0,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,3,70,0,0,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


## Encoding by sklearn
- scikit-learn의 preprocessing패키지도 label, one-hot encoding 지원
- Label encoder는 fit과 transform의 두 과정으로 나뉘어 있다.
- 이는 새로운 데이터 입력 시 기존 labeling 규칙을 그대로 적용하기 위함이다.
- fit은 규칙을 생성하는 과정
- transform은 규칙을 적용하는 과정
- 이렇게 생성된 labelencoder는 따로 저장하여 새로운 데이터 입력시 사용가능(pickle 화)

In [46]:
# Convert the frame to a NumPy array for the scikit-learn examples.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is available in both old and current releases.
raw_example = df.values
# Peek at the first three rows.
raw_example[:3]

array([['Nighthawks', '1st', 'Miller', 4, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', 31, 57, 'Good']], dtype=object)

In [47]:
# Working copy that will be overwritten with encoded values, so that the
# writes do not touch raw_example.
data = raw_example.copy()

In [48]:
from sklearn import preprocessing

In [49]:
# Create the encoder and fit it on the regiment column.  Fitting only
# establishes the label -> integer rule (e.g. A -> 0, B -> 1, C -> 2);
# fit() hands back the encoder itself, so it can be chained here.
le = preprocessing.LabelEncoder().fit(raw_example[:, 0])
# Apply the rule established by the fit above to the whole column.
le.transform(raw_example[:, 0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2], dtype=int64)

In [50]:
# classes_ lists the labels the encoder knows; a label's position is its
# encoded value (Dragoons -> 0, Nighthawks -> 1, Scouts -> 2).
le.classes_

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [51]:
# Overwrite the regiment column of the working copy with its integer codes.
data[:,0] = le.transform(raw_example[:,0])
# Show the first three rows of the result.
data[:3]

array([[1, '1st', 'Miller', 4, 25, 'Low'],
       [1, '1st', 'Jacobson', 24, 94, 'Great'],
       [1, '2nd', 'Ali', 31, 57, 'Good']], dtype=object)

In [52]:
# Positions of the string-valued columns in raw_example
# (regiment, company, name, categories).
label_column = [0, 1, 2, 5]
# Fitted encoders, stored in the same order as label_column so each column's
# mapping can be re-applied later.  The misspelled name is kept on purpose:
# a later cell looks the list up under this exact name.
label_enconder_list = []
for column_index in label_column:
    # A dedicated loop variable is used so the notebook-level `le` fitted in
    # the earlier cell is neither overwritten nor deleted by this loop.
    encoder = preprocessing.LabelEncoder()
    # fit_transform learns the mapping and applies it in a single call.
    data[:, column_index] = encoder.fit_transform(raw_example[:, column_index])
    label_enconder_list.append(encoder)
data[:3]

array([[1, 0, 4, 4, 25, 2],
       [1, 0, 2, 24, 94, 1],
       [1, 1, 0, 31, 57, 0]], dtype=object)

In [56]:
# Re-apply the stored encoder for column 0 to the first ten rows; the
# encoder is used as-is, without fitting again.
label_enconder_list[0].transform(raw_example[:10,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2], dtype=int64)

### sklearn의 one-hot encoder

In [60]:
# Fit the one-hot encoder on the integer-coded regiment column, reshaped to a
# single-column 2-D array; fit() returns the encoder, so it is chained.
one_hot_enc = preprocessing.OneHotEncoder().fit(data[:, 0].reshape(-1, 1))

# transform() output is converted to a dense array with toarray() for display.
onehotlabels = one_hot_enc.transform(data[:, 0].reshape(-1, 1)).toarray()
onehotlabels

array([[ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.]])

In [64]:
# The integer-coded regiment column as a single-column 2-D array (one row
# per sample) -- the form that was passed to the one-hot encoder.
data[:,0].reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [70]:
# Attempt to fit the one-hot encoder directly on the raw string labels
# instead of the integer codes.
one_hot_enc = preprocessing.OneHotEncoder()
# In the recorded run this call raised
# "ValueError: could not convert string to float: 'Scouts'".
# NOTE(review): presumably the installed scikit-learn only accepted numeric
# input here; newer releases may accept strings -- verify against the
# version in use.
one_hot_enc.fit(raw_example[:,0].reshape(-1,1))

# Left commented out in the original (the fit above raised in the recorded run).
# onehotlabels = one_hot_enc.transform(raw_example[:,0].reshape(-1,1)).toarray()
# onehotlabels

ValueError: could not convert string to float: 'Scouts'

In [69]:
# The raw regiment strings as a single-column 2-D array -- the input that
# the one-hot encoder fit was attempted on.
raw_example[:,0].reshape(-1,1)

array([['Nighthawks'],
       ['Nighthawks'],
       ['Nighthawks'],
       ['Nighthawks'],
       ['Dragoons'],
       ['Dragoons'],
       ['Dragoons'],
       ['Dragoons'],
       ['Scouts'],
       ['Scouts'],
       ['Scouts'],
       ['Scouts']], dtype=object)