# 데이터 전처리

## 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample categorical values; the last entry is deliberately duplicated so the
# encoders below see a repeated class.
items = [
    'TV',
    '냉장고',
    '전자레인지',
    '컴퓨터',
    '선풍기',
    '믹서',
    '믹서',
]

In [3]:
# Create the encoder object (not fitted yet).
le=LabelEncoder()

In [9]:
# Fit the encoder: learn the distinct classes present in items.
le.fit(items)

LabelEncoder()

In [10]:
# Encode: replace every item with the integer code of its class
# (duplicate items receive the same code).
labels=le.transform(items)
labels

array([0, 1, 4, 5, 3, 2, 2])

In [11]:
# How it is usually done in practice: fit and encode in one call.
le2=LabelEncoder()
labels=le2.fit_transform(items)  # fits the encoder and returns the encoded labels in a single step
labels

array([0, 1, 4, 5, 3, 2, 2])

In [12]:
# Most compact form: a throwaway encoder created and used in one expression.
# The encoder object is not kept, so it cannot be used for inverse_transform later.
labels=LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 2, 2])

In [13]:
# Decoding: map integer codes back to the original string labels.
le.inverse_transform([2,1,4,3,2,4,2,1,1,2,4,3,2,1,5])

array(['믹서', '냉장고', '전자레인지', '선풍기', '믹서', '전자레인지', '믹서', '냉장고', '냉장고',
       '믹서', '전자레인지', '선풍기', '믹서', '냉장고', '컴퓨터'], dtype='<U5')

## 2. One-Hot encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer labels produced by the label-encoding step.
ohe=OneHotEncoder()
oh_labels=ohe.fit_transform(labels.reshape(-1,1))  # the encoder needs 2-D input, so reshape the 1-D labels into a single column
# Convert the encoder's result to a dense array so the 0/1 matrix can be displayed.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [20]:
from tensorflow.keras.utils import to_categorical
# Keras alternative: one-hot encode the integer labels directly from the 1-D array
# (no reshape needed; the recorded result is a dense float32 array).
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

## 3. 표준화
- 각 feature를 평균이 0, 표준편차가 1이 되도록 변환함 (z-score). 스케일만 바뀔 뿐, 데이터의 분포 모양이 정규분포로 바뀌는 것은 아님

In [22]:
from sklearn.datasets import load_iris
# Load the iris dataset; .data, .target and .feature_names are used in the cells below.
iris=load_iris()

In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Wrap the raw feature matrix in a DataFrame with named columns.
df=pd.DataFrame(iris.data, columns=iris.feature_names)
df.describe()  # summary statistics of the original (unscaled) data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [26]:
# Standardize every feature column (z-score), fitting the scaler on all 150 rows.
iris_std=StandardScaler().fit_transform(iris.data)
# NOTE: this rebinds df, replacing the DataFrame of raw values created earlier.
df=pd.DataFrame(iris_std, columns=iris.feature_names)
# The mean is ~0 (floating-point noise). The reported std is 1.00335 rather than
# exactly 1 because describe() uses the sample std (ddof=1) while StandardScaler
# divides by the population std (ddof=0): sqrt(150/149) ~= 1.00335.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


- 표준화의 장점 : Logistic Regression으로 분류해보기

In [28]:
from sklearn.linear_model import LogisticRegression
# Classifier with default settings; random_state is fixed at 2022.
lrc=LogisticRegression(random_state=2022)


In [35]:
from sklearn.model_selection import train_test_split
# Stratified train/test split of the raw (unscaled) features.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    stratify=iris.target,
    random_state=2022,
)
# On unscaled data the solver reaches its iteration limit before converging.
# This is reported as a convergence warning, not an exception: fit() still
# returns the estimator, and the message suggests raising max_iter or scaling
# the data.
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=2022)

In [39]:
# Same stratified split, this time on the standardized features.
# NOTE(review): iris_std was produced by a scaler fitted on all rows before
# this split, so test-set statistics leak into the preprocessing. For a real
# evaluation, fit the scaler on X_train only and apply it to X_test.
X_train, X_test, y_train, y_test = train_test_split(
    iris_std,
    iris.target,
    stratify=iris.target,
    random_state=2022,
)
# With standardized inputs the solver converges within its iteration limit,
# so the convergence warning seen with the raw data does not appear.
lrc.fit(X_train, y_train)

LogisticRegression(random_state=2022)

In [37]:
# Accuracy of lrc on the held-out test split.
# NOTE(review): this cell's execution count (37) is lower than that of the fit
# cell above (39), so the recorded score may predate the latest fit; re-run to confirm.
lrc.score(X_test, y_test)

0.9473684210526315

## 4. 정규화
- 각 feature를 최솟값이 0, 최댓값이 1이 되도록 변환 (Min-Max scaling)

In [41]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every feature column, fitting the scaler on all rows of the dataset.
iris_mm=MinMaxScaler().fit_transform(iris.data)

In [45]:
# Put the min-max scaled values in a DataFrame with named columns.
df3=pd.DataFrame(iris_mm, columns=iris.feature_names)
df3.describe()  # confirms that every column now has min 0 and max 1

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [46]:
# Repeat the experiment on the min-max scaled features. As with the
# standardized data, the fit completes with no convergence warning in the
# recorded output.
# NOTE(review): iris_mm was scaled using all rows before this split, so
# test-set statistics leak into the preprocessing. For a real evaluation,
# fit the scaler on X_train only and apply it to X_test.
X_train, X_test, y_train, y_test = train_test_split(
    iris_mm,
    iris.target,
    stratify=iris.target,
    random_state=2022,
)
# fit() returns the estimator itself, so creation and fitting can be chained.
lrc2 = LogisticRegression(random_state=2022).fit(X_train, y_train)
lrc2.score(X_test, y_test)

0.9210526315789473