### sklearn.preprocessing.LabelEncoder

* Encode target labels with value between 0 and n_classes-1.

This transformer should be used to encode target values, i.e. y, and not the input X.

* attributes : classes_ : ndarray of shape(n_classes,)

* methods
    - fit : encoding할 label지정
    - fit_transform : label encoding 실행 fit과 transform을 같이 실행
    - get_params 
    - inverse_transform : label encoding된 값을 다시 class로 나타냄
    - set_params
    - transform : fit 후 transform 실행

In [14]:
from sklearn.preprocessing import LabelEncoder


items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Create a LabelEncoder, learn the classes with fit(), then map every item
# to its integer code with transform().
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값:', labels)
print('인코딩 클래스:', encoder.classes_)
# Decode the codes computed above rather than a hand-copied list of integers,
# so the round trip stays correct if `items` is ever edited.
print('디코딩 원본 값:', encoder.inverse_transform(labels))

인코딩 변환값: [0 1 4 5 3 3 2 2]
인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']
디코딩 원본 값: ['TV' '냉장고' '전자렌지' '컴퓨터' '선풍기' '선풍기' '믹서' '믹서']


### sklearn.preprocessing.OneHotEncoder

* class sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None)

* parameters 
    - categories:
    - drop:
        - None : retain all features (the default).

        - ‘first’ : drop the first category in each feature. If only one category is present, the feature will be dropped entirely.

        - ‘if_binary’ : drop the first category in each feature with two categories. Features with 1 or more than 2 categories are left intact.

        - array : drop[i] is the category in feature X[:, i] that should be dropped
    - sparse : if True, return a sparse matrix; otherwise return a dense array (default = True)
    - dtype : 
    - handle_unknown: 다른 값이 들어오면 무시할지 에러처리할지 default = 'error'
    - min_frequency:
    - max_categories: 최대 categories 지정

In [17]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [77]:
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Turn the list into an array and reshape it into a 2-D array before encoding.
# reshape(-1, 1) means the same as reshape(8, 1) here: the column count is
# fixed at 1 and the number of rows is filled in automatically.
items = np.array(items).reshape(-1, 1)

oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(items)
# Densify the encoded result so the cell displays a plain array.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [44]:
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Step 1: label-encode. fit() learns the classes, transform() maps each item
# to its integer code.
encoder = LabelEncoder()
encoder.fit(items)
# Reshape the 1-D codes into a single column (row vector -> column vector)
# so they form 2-D data.
labels = encoder.transform(items).reshape(-1, 1)

# Step 2: apply one-hot encoding to the integer codes.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit(labels).transform(labels)
print('원-핫 인코딩 데이터;')
print(oh_labels.toarray())
print('데이터 차원:', oh_labels.shape)

원-핫 인코딩 데이터;
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
데이터 차원: (8, 6)


In [32]:
import numpy as np
# One column of 38 samples: 5 x "a", 20 x "b", 10 x "c", 3 x "d".
X = np.array(
    ["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3,
    dtype=object,
).reshape(-1, 1)

# Cap the number of output categories at 3 and ask for a dense result.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe = OneHotEncoder(max_categories=3, sparse=False)
ohe.fit(X)

# Evaluated for inspection only; as a non-final expression in the cell its
# value is presumably not displayed.
ohe.infrequent_categories_

ohe.transform([["a"], ["b"]])



array([[0., 0., 1.],
       [1., 0., 0.]])

In [33]:
# Two features per sample: a gender string and a numeric group (1, 2 or 3).
X = [['Male', 1], ['Female', 3], ['Female', 2]]

# handle_unknown='ignore': values not seen during fit are ignored instead of
# raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X)

# Categories learned for each feature.
enc.categories_

# The second row uses group 4, which does not occur in the fitted data.
enc.transform([['Female', 1], ['Male', 4]]).toarray()

# Map one-hot rows back to the original feature values.
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])

# Output column names, using the supplied names for the input features.
enc.get_feature_names_out(['gender', 'group'])


array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
      dtype=object)

### 판다스에서 지원하는 원-핫 인코딩 API get_dummies()
## pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

* Parameters
    - data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.

    - prefix : str, list of str, or dict of str, default None
        String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, prefix can be a dictionary mapping column names to prefixes.

    - prefix_sep : str, default ‘_’
        If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with prefix.

    - dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    - columns  : list-like, default None
        Column names in the DataFrame to be encoded. If columns is None then all the columns with object, string, or category dtype will be converted.

    - sparse : bool, default False
        Whether the dummy-encoded columns should be backed by a SparseArray (True) or a regular NumPy array (False).

    - drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the first level.

    - dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

In [68]:
import pandas as pd
# Single-column frame of item names; get_dummies() then expands that column
# into one indicator column per distinct value.
df = pd.DataFrame(
    ["TV", "냉장고", "전자렌지", "컴퓨터", "선풍기", "선풍기", "믹서", "믹서"],
    columns=['item'],
)
pd.get_dummies(df['item'])

Unnamed: 0,TV,냉장고,믹서,선풍기,전자렌지,컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0
