### 데이터 전처리

###### 1. 데이터 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample appliance names used as the categorical data to encode.
# Two of the names ('선풍기', '믹서') appear twice, so there are 8 entries
# but only 6 distinct categories.
items = [
    'TV',
    '냉장고',
    '전자레인지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [3]:
# Create the label-encoder object (it has not been fitted to any data yet)
le = LabelEncoder()

In [4]:
# Fit step: scan the data so the encoder learns which distinct values exist
le.fit(items)

LabelEncoder()

In [6]:
# Run the encoding: convert each item to its integer code using the fitted `le`
labels=le.transform(items)
labels  # bare expression so the notebook displays the resulting array

array([0, 1, 4, 5, 3, 3, 2, 2])

In [7]:
# In practice, fit and transform are done in a single call
le2 = LabelEncoder()
labels = le2.fit_transform(items)
labels  # display the codes; the recorded output matches the two-step version

array([0, 1, 4, 5, 3, 3, 2, 2])

In [8]:
# Experienced users finish it in one line; the encoder is not bound to a name
# here, so only the encoded array is kept
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [9]:
# Map integer codes back to the original item names with the fitted `le`
le.inverse_transform([2,4,3,0,5,1]) # decoding

array(['믹서', '전자레인지', '선풍기', 'TV', '컴퓨터', '냉장고'], dtype='<U5')

###### 2. One-hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
# reshape(-1,1) turns the 1-D array of integer labels into a single column
# (one row per sample) before it is handed to the encoder
oh_labels = ohe.fit_transform(labels.reshape(-1,1))
# Convert the encoder's result to a dense array so every 0/1 entry is displayed
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [12]:
# Alternative one-hot encoding via Keras, applied directly to the integer labels
from tensorflow.keras.utils import to_categorical
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

###### 3. 표준화
 - 각 특성(feature)을 평균 0, 표준편차 1이 되도록 스케일 변환 (데이터가 정규분포를 따르는 경우 표준 정규분포가 됨)

In [13]:
from sklearn.datasets import load_iris
# Load the iris dataset; later cells read iris.data, iris.target and
# iris.feature_names from this object
iris = load_iris()

In [14]:
from sklearn.preprocessing import StandardScaler
# Standardize the feature matrix of the full iris data in one fit_transform call
iris_std = StandardScaler().fit_transform(iris.data)

In [15]:
import pandas as pd

# Summary statistics of the raw (unscaled) iris features, as a baseline to
# compare against the scaled versions
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [16]:
# Summary statistics of the standardized features.
# NOTE: this rebinds `df`, replacing the raw-feature frame built earlier.
# describe() shows std of about 1.00335 rather than exactly 1 because pandas
# reports the sample (n-1) standard deviation, while the scaler normalizes by
# the population one: sqrt(150/149) is about 1.00335.
df = pd.DataFrame(data=iris_std, columns=iris.feature_names)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


- Logistic Regression으로 분류

In [17]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with a fixed random_state (2022)
lrc = LogisticRegression(random_state=2022)

In [18]:
# Author's note: training on data converted to a standard-normal scale reaches
# the target value quickly.
# NOTE(review): iris_std was produced by fitting the scaler on the full dataset
# before this split, so the test rows influenced the scaling statistics.
# Consider splitting first and fitting the scaler on X_train only.
from sklearn.model_selection import train_test_split

# Stratified split so each class keeps its proportion in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    iris_std,
    iris.target,
    stratify=iris.target,
    random_state=2022,
)

In [19]:
# Train the logistic-regression classifier on the training split
lrc.fit(X_train, y_train)

LogisticRegression(random_state=2022)

In [20]:
# Evaluate the fitted classifier on the held-out test split
lrc.score(X_test, y_test)

0.9473684210526315

###### 4. 정규화
 - 각 특성(feature)의 최솟값이 0, 최댓값이 1이 되도록 변환

In [21]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the feature matrix of the full iris data with min-max scaling
iris_mn = MinMaxScaler().fit_transform(iris.data)


In [22]:
# Summary statistics of the min-max scaled features; the min and max rows
# show the range each column was scaled to
df3 = pd.DataFrame(data=iris_mn, columns=iris.feature_names)
df3.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [23]:
# Author's note: training on normalized data reaches the target value quickly,
# so no error occurs (presumably meaning the solver's convergence warning;
# confirm against the unscaled run).
# NOTE(review): iris_mn was produced by fitting the scaler on the full dataset
# before this split, so the test rows influenced the min/max used for scaling.
# Consider splitting first and fitting the scaler on X_train only.
from sklearn.model_selection import train_test_split

# Stratified split so each class keeps its proportion in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    iris_mn,
    iris.target,
    stratify=iris.target,
    random_state=2022,
)

In [24]:
# Build a fresh classifier and train it on the min-max scaled training split.
# fit() returns the estimator itself, so `lrc` ends up bound to the fitted model.
lrc = LogisticRegression(random_state=2022).fit(X_train, y_train)
# Evaluate on the held-out test split
lrc.score(X_test, y_test)

0.9210526315789473