### LabelEncoder
* from sklearn import preprocessing
* class sklearn.preprocessing.LabelEncoder
* fit(y)
   * Fit label encoder.
* fit_transform(y)
   * Fit label encoder and return encoded labels.
* get_params([deep])
   * Get parameters for this estimator.
* inverse_transform(y)
   * Transform labels back to original encoding.
* set_params(**params)
   * Set the parameters of this estimator.
* transform(y)
   * Transform labels to normalized encoding.
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html?highlight=labelencoder#sklearn.preprocessing.LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

# Sample category values to encode.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

# Create the LabelEncoder, then learn the classes and encode the list in one call.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)

# Integer codes to map back to their original string values.
codes_to_decode = [4, 5, 2, 0, 1, 1, 3, 3]
decoded = encoder.inverse_transform(codes_to_decode)

print('인코딩 변환값:', labels)
print('인코딩 클래스:', encoder.classes_)
print('디코딩 원본 값:', decoded)

인코딩 변환값: [0 1 4 5 3 3 2 2]
인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']
디코딩 원본 값: ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


In [8]:
import pandas as pd

# Load the passenger training CSV into a DataFrame.
# NOTE(review): absolute Windows path — adjust to wherever train.csv lives on your machine.
ti = pd.read_csv('C:/rcode/train.csv')

In [9]:
# Preview the first rows of the loaded frame.
ti.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Remove the columns that are not used in the encoding examples.
unused_columns = ['PassengerId', 'Pclass', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']
ti = ti.drop(columns=unused_columns)

In [11]:
# Show a single row to confirm which columns remain after the drop.
ti.head(1)

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,male,22.0,S


In [12]:
# Count missing values per column.
ti.isna().sum()

Survived      0
Sex           0
Age         177
Embarked      2
dtype: int64

In [16]:
# Mean age rounded to two decimals; this is the value used to fill missing ages.
ti['Age'].mean().round(2)

29.7

In [18]:
# Fill missing ages with the column mean rounded to two decimals.
# Computed from the data instead of hard-coding 29.7, so the fill value
# stays correct if the CSV changes.
ti['Age'] = ti['Age'].fillna(ti['Age'].mean().round(2))

In [20]:
# Embarked still contains missing values at this point (see the isna() counts),
# so fill them with 'S' *before* handing the column to LabelEncoder. Otherwise
# NaN reaches the encoder, which, depending on the scikit-learn version, either
# fails on the mixed str/float input or encodes NaN as a class of its own.
item = ti['Embarked'].fillna('S')

In [21]:
# Learn the Embarked classes and convert the column to integer labels in one step.
encoder = LabelEncoder()
labels = encoder.fit_transform(item)

In [37]:
# Overwrite the Embarked column with its integer labels.
ti['Embarked'] = labels

In [36]:
# NOTE(review): in the order shown here, Embarked has already been overwritten
# with LabelEncoder output, so there is presumably nothing left for this fillna
# to fill. Missing ports have to be imputed before the encoder is fitted.
ti['Embarked'] = ti['Embarked'].fillna('S')

In [38]:
# Take the Sex column as the input for label encoding.
sex = ti['Sex']

In [33]:
# Learn the Sex classes and convert the column to integer labels in one step.
encoder = LabelEncoder()
sexs = encoder.fit_transform(sex)

In [34]:
# Overwrite the Sex column with its integer labels.
ti['Sex'] = sexs

In [39]:
# Display the frame after Sex and Embarked were replaced by integer labels.
ti

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,1,22.0,2
1,1,0,38.0,0
2,1,0,26.0,2
3,1,0,35.0,2
4,0,1,35.0,2
...,...,...,...,...
886,0,1,27.0,2
887,1,0,19.0,2
888,0,0,29.7,2
889,1,1,26.0,0


### OneHotEncoder
* from sklearn.preprocessing import OneHotEncoder
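* OneHotEncoder(*, categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None)
   * Note: this is the scikit-learn 1.1 signature; from version 1.2 on, the `sparse` parameter is named `sparse_output`.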
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html?highlight=onehotencoder#sklearn.preprocessing.OneHotEncoder

In [45]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample category values to encode.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

# First convert the strings to integer labels with LabelEncoder, then reshape
# the result into a single-column 2-D array, which is what OneHotEncoder is fed.
encoder = LabelEncoder()
labels = encoder.fit_transform(items).reshape(-1, 1)

# Apply one-hot encoding; the result is converted to a dense array for printing.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)

print('원-핫 인코딩 데이터')
print(oh_labels.toarray())
print('원-핫 인코딩 데이터 차원')
print(oh_labels.shape)

원-핫 인코딩 데이터
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
원-핫 인코딩 데이터 차원
(8, 6)


In [43]:
# Inspect the 2-D integer label array that is passed to OneHotEncoder.
labels

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]])

### pandas.get_dummies
* pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [47]:
import pandas as pd

# Reload the raw CSV and drop the columns that are not needed for the
# get_dummies examples.
ti = pd.read_csv('C:/rcode/train.csv')
unused_columns = ['PassengerId', 'Pclass', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']
ti = ti.drop(columns=unused_columns)

# Impute missing values: the mean age for Age, 'S' for Embarked.
mean_age = ti['Age'].mean()
ti['Age'] = ti['Age'].fillna(mean_age)
ti['Embarked'] = ti['Embarked'].fillna('S')

In [48]:
# Pull out the Embarked column as a Series.
data = ti['Embarked']

In [49]:
# Display the Embarked Series.
data

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [50]:
# One-hot encode Embarked into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas 2.0 changed the default
# dummy dtype to bool, which would otherwise display True/False instead of 0/1.
dm_em = pd.get_dummies(data, dtype=int)

In [51]:
# Display the indicator columns created for Embarked.
dm_em

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [52]:
# One-hot encode Sex and Embarked together; the new columns are prefixed with
# the source column name (Sex_*, Embarked_*).
# dtype=int pins the indicators to 0/1 integers: pandas 2.0 changed the default
# dummy dtype to bool, which would otherwise display True/False instead of 0/1.
dm_data = pd.get_dummies(ti[['Sex', 'Embarked']], dtype=int)

In [53]:
# Display the indicator columns created for Sex and Embarked.
dm_data

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
886,0,1,0,0,1
887,1,0,0,0,1
888,1,0,0,0,1
889,0,1,1,0,0


In [57]:
# Place the indicator columns next to the original columns (column-wise concat).
frames = [ti, dm_data]
td_data = pd.concat(frames, axis='columns')

In [58]:
# Display the combined frame (original columns plus indicator columns).
td_data

Unnamed: 0,Survived,Sex,Age,Embarked,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,male,22.000000,S,0,1,0,0,1
1,1,female,38.000000,C,1,0,1,0,0
2,1,female,26.000000,S,1,0,0,0,1
3,1,female,35.000000,S,1,0,0,0,1
4,0,male,35.000000,S,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
886,0,male,27.000000,S,0,1,0,0,1
887,1,female,19.000000,S,1,0,0,0,1
888,0,female,29.699118,S,1,0,0,0,1
889,1,male,26.000000,C,0,1,1,0,0


In [59]:
# The string columns are now represented by their indicator columns, so remove them.
encoded_sources = ['Sex', 'Embarked']
td_data = td_data.drop(columns=encoded_sources)

In [60]:
# Display the final frame containing only numeric columns.
td_data

Unnamed: 0,Survived,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.000000,0,1,0,0,1
1,1,38.000000,1,0,1,0,0
2,1,26.000000,1,0,0,0,1
3,1,35.000000,1,0,0,0,1
4,0,35.000000,0,1,0,0,1
...,...,...,...,...,...,...,...
886,0,27.000000,0,1,0,0,1
887,1,19.000000,1,0,0,0,1
888,0,29.699118,1,0,0,0,1
889,1,26.000000,0,1,1,0,0
