# 类别数据转换

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

# Load the raw table from data.csv (relative path, resolved against the working directory).
data = pd.read_csv('data.csv')
# Preview the first rows of the frame.
data.head()

Unnamed: 0,age,income,student,credit_rating,label
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_age,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes


## 手动
- map 函数

In [2]:
# Manual encoding: write the label -> integer table by hand and apply it with Series.map.
# Any label value missing from the table would come out as NaN.
label_map = {
    'no': 0,
    'yes': 1,
}
# assign() works on a copy, so the source frame `data` is left untouched.
data2 = data.assign(label=data['label'].map(label_map))
data2.head()

Unnamed: 0,age,income,student,credit_rating,label
0,youth,high,no,fair,0
1,youth,high,no,excellent,0
2,middle_age,high,no,fair,1
3,senior,medium,no,fair,1
4,senior,low,yes,fair,1


## 自动
- pd.factorize

In [3]:
# Automatic encoding: pd.factorize returns (codes, uniques); only the integer codes are kept.
# NOTE(review): the target column is spelled 'lable', so the codes land in a NEW column
# and the original string-valued 'label' column stays as it was -- confirm this is intended.
data3 = data.assign(lable=pd.factorize(data['label'])[0])
data3.head()

Unnamed: 0,age,income,student,credit_rating,label,lable
0,youth,high,no,fair,no,0
1,youth,high,no,excellent,no,0
2,middle_age,high,no,fair,yes,1
3,senior,medium,no,fair,yes,1
4,senior,low,yes,fair,yes,1


### 全部转换

In [7]:
# Factorize every column: each column's values are replaced by integer codes
# numbered in order of first appearance within that column.
data3_ = data.copy()
for column in data.columns:
    # Read from the untouched source frame, write the codes into the copy.
    column_codes = pd.factorize(data[column])
    data3_[column] = column_codes[0]
data3_.head()

Unnamed: 0,age,income,student,credit_rating,label
0,0,0,0,0,0
1,0,0,0,1,0
2,1,0,0,0,1
3,2,1,0,0,1
4,2,2,1,0,1


## 自动
- np.unique

In [4]:
# Automatic encoding via np.unique: with return_inverse=True the second return value
# gives, for every row, the position of its value in the sorted array of unique values.
data4 = data.assign(
    label=lambda frame: np.unique(frame['label'], return_inverse=True)[1]
)
data4.head()

Unnamed: 0,age,income,student,credit_rating,label
0,youth,high,no,fair,0
1,youth,high,no,excellent,0
2,middle_age,high,no,fair,1
3,senior,medium,no,fair,1
4,senior,low,yes,fair,1


### 全部转换

In [9]:
# Encode every column with np.unique: codes follow the sorted order of each column's
# distinct values (unlike pd.factorize, which numbers them by first appearance).
data4_ = data.copy()
for column in data4_:
    # Iterating a DataFrame yields its column names; the source column is read from `data`.
    data4_[column] = np.unique(data[column], return_inverse=True)[1]
data4_.head()

Unnamed: 0,age,income,student,credit_rating,label
0,2,0,0,1,0
1,2,0,0,0,0
2,0,0,0,1,1
3,1,2,0,1,1
4,1,1,1,1,1


## 把 label 变成 one-hot 编码
- categorical

In [5]:
from statsmodels.tools import categorical

In [6]:
# One-hot encode the label column: one indicator column per distinct label value,
# with columns in sorted order of the values ('no', 'yes' for this data).
#
# statsmodels' `categorical` helper was deprecated in statsmodels 0.12 and removed in
# later releases, so the dummy matrix is built with pandas instead. The explicit
# astype(float) keeps the float array that `categorical(label, drop=True)` produced,
# whatever indicator dtype the installed pandas uses by default.
label = data['label'].values
label_one_hot = pd.get_dummies(label).values.astype(float)
label_one_hot

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])