In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

## Label Encoding

In [2]:
import numpy as np
import pandas as pd
# Toy dataset: one categorical column (Country) and two numeric columns.
# The column order of the frame follows the insertion order of the dict.
dic = {
    'Country': ['Taiwan', 'Australia', 'Ireland', 'Australia', 'Ireland', 'Taiwan'],
    'Age': [25, 30, 45, 35, 22, 36],
    'Salary': [20000, 32000, 59000, 60000, 43000, 52000],
}
# Keep the individual column lists available under their own names as well.
country, age, salary = dic.values()
data = pd.DataFrame(dic)
data

Unnamed: 0,Country,Age,Salary
0,Taiwan,25,20000
1,Australia,30,32000
2,Ireland,45,59000
3,Australia,35,60000
4,Ireland,22,43000
5,Taiwan,36,52000


In [3]:
# Start from a fresh frame built from `dic` so the original `data` keeps its
# string country names.
data_le = pd.DataFrame(dic)
labelencoder = LabelEncoder()
# Replace the 'Country' strings with the integer codes produced by the encoder.
data_le = data_le.assign(Country=labelencoder.fit_transform(data_le['Country']))
data_le

Unnamed: 0,Country,Age,Salary
0,2,25,20000
1,0,30,32000
2,1,45,59000
3,0,35,60000
4,1,22,43000
5,2,36,52000


---

## One Hot Encoding

### `One Hot Encoding`

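#### Applied to text data
- Cannot be applied directly to raw `string` values this way; the column is label-encoded first
- It is better to use it together with `ColumnTransformer`, as the warning below suggests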

In [4]:
# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn 0.20
# and removed in 0.22, so it raises a TypeError on current versions.
# Equivalent result: one-hot encode only the label-encoded 'Country' column,
# then put the remaining columns to the right of the indicator columns.
onehotencoder = OneHotEncoder()
# Cast to float so the learned categories are 0.0, 1.0, 2.0.
country_ohe = onehotencoder.fit_transform(data_le[['Country']].astype(float)).toarray()
# hstack promotes the integer Age/Salary columns to float alongside the indicators.
data_str_ohe = np.hstack([country_ohe, data_le.drop(columns='Country').to_numpy()])
pd.DataFrame(data_str_ohe)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,25.0,20000.0
1,1.0,0.0,0.0,30.0,32000.0
2,0.0,1.0,0.0,45.0,59000.0
3,1.0,0.0,0.0,35.0,60000.0
4,0.0,1.0,0.0,22.0,43000.0
5,0.0,0.0,1.0,36.0,52000.0


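- `categorical_features = [0]`: one-hot encoding is applied to the column of the data at index 0 (note: this parameter was deprecated in scikit-learn 0.20 and removed in 0.22; on newer versions select the column yourself or use `ColumnTransformer`)
- data_le: the data after label encoding (note: `OneHotEncoder` expects a 2-D array as input, whereas label encoding works on a 1-D array)
- `OneHotEncoder` returns a `scipy` sparse matrix (`csr_matrix`); use `.toarray()` to convert it to a dense array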
<br><br>
> From the output above, we know column '0' represents Australia, column '1' represents Ireland, and column '2' represents Taiwan (the same 0/1/2 codes produced by the label encoding).

In [5]:
# Show the categories learned by the encoder fitted in the previous cell.
onehotencoder.categories_

[array([0., 1., 2.])]

#### Applied to numerical data

In [6]:
# No column selection here: the encoder is fitted on the whole `data` frame,
# so every column (including the numeric Age and Salary) is one-hot encoded.
onehotencoder = OneHotEncoder().fit(data)
# The transform result is converted to a dense array before display.
data_whole_ohe = onehotencoder.transform(data).toarray()
pd.DataFrame(data_whole_ohe)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Show the categories learned for each column of `data` by the encoder fitted above.
onehotencoder.categories_

[array(['Australia', 'Ireland', 'Taiwan'], dtype=object),
 array([22, 25, 30, 35, 36, 45]),
 array([20000, 32000, 43000, 52000, 59000, 60000])]

### Use with `ColumnTransformer`
- Can handle `string` columns directly.
- **Recommended**: use `OneHotEncoder` together with `ColumnTransformer`.

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at index 0 of `data`; remainder='passthrough'
# keeps the other columns instead of dropping them.
ct = ColumnTransformer(
    transformers=[
        ('can_i_type_anything', OneHotEncoder(), [0]),  # (name, transformer, columns)
    ],
    remainder='passthrough',
)
X = ct.fit_transform(data)

`ColumnTransformer(transformers=[('can_i_type_anything', OneHotEncoder(), [0])]`

- `[0]` means `OneHotEncoder()` is applied to the column at index 0
- `'can_i_type_anything'` is just a name for this step; any string works here

In [9]:
# Wrap the ColumnTransformer output `X` in a DataFrame so it renders as a table.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,25.0,20000.0
1,1.0,0.0,0.0,30.0,32000.0
2,0.0,1.0,0.0,45.0,59000.0
3,1.0,0.0,0.0,35.0,60000.0
4,0.0,1.0,0.0,22.0,43000.0
5,0.0,0.0,1.0,36.0,52000.0


---

### `pd.get_dummies`

- Works on text (string) columns; numerical columns are left unchanged.
- If the columns are not specified, it is applied to every column that can be converted.

In [10]:
# Only the string column `Country` is expanded into indicator columns; the
# numeric `Age` and `Salary` columns are kept as they are.
# dtype=int keeps the indicators as 0/1 on every pandas version (pandas >= 2.0
# would otherwise return True/False by default).
data_dum = pd.get_dummies(data, dtype=int)
# get_dummies already returns a DataFrame, so it can be displayed directly.
data_dum

Unnamed: 0,Age,Salary,Country_Australia,Country_Ireland,Country_Taiwan
0,25,20000,0,0,1
1,30,32000,1,0,0
2,45,59000,0,1,0
3,35,60000,1,0,0
4,22,43000,0,1,0
5,36,52000,0,0,1
