# preprocessing : 데이터 전처리 모듈

- 표준화 클래스 : StandardScaler
- 정규화 클래스 : MinMaxScaler

## 표준화 클래스 : StandardScaler

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Put the feature matrix into a DataFrame, one named column per feature.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Show per-feature summary statistics of the raw (unscaled) data.
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of the iris data.
scaler = StandardScaler()
scaler.fit(iris_df)  # fit: gather the information the standardization needs
iris_scaled = scaler.transform(iris_df)  # transform: apply the actual scaling

# The scaler hands back a NumPy array, so wrap it in a DataFrame again,
# reusing the original feature names as column labels.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Show summary statistics of the standardized data
# (scientific notation: e+02 == 10**2 == 100).
iris_df_scaled.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [10]:
# Hold out 30% of the scaled samples for evaluation.
# random_state pins the shuffle so the split -- and therefore the scores
# printed below -- are reproducible from run to run; stratify keeps the class
# proportions of iris.target the same in the training and test subsets.
# NOTE(review): iris_df_scaled was produced by fitting the scaler on the whole
# dataset before this split, so the test rows contributed to the scaling
# statistics; consider fitting the scaler on X_train only.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df_scaled,
    iris.target,
    test_size=0.3,
    random_state=42,
    stratify=iris.target,
)

# Fit a logistic-regression classifier on the training subset.
model = LogisticRegression()
model.fit(X_train, y_train)

# Report the model's score on the training subset and on the held-out subset.
print("훈련 데이터 점수 : {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수 : {}".format(model.score(X_test, y_test)))

훈련 데이터 점수 : 0.9809523809523809
평가 데이터 점수 : 0.9111111111111111


## 정규화 클래스 : MinMaxScaler

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column of the iris data with min-max normalization.
scaler = MinMaxScaler()
scaler.fit(iris_df)  # fit: gather the information the normalization needs
iris_scaled = scaler.transform(iris_df)  # transform: apply the actual scaling

# The scaler hands back a NumPy array, so wrap it in a DataFrame again,
# reusing the original feature names as column labels.
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Show summary statistics of the normalized data
# (minimum == 0 / maximum == 1).
iris_df_scaled.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [16]:
# Hold out 30% of the scaled samples for evaluation.
# random_state pins the shuffle so the split -- and therefore the scores
# printed below -- are reproducible from run to run; stratify keeps the class
# proportions of iris.target the same in the training and test subsets.
# NOTE(review): iris_df_scaled was produced by fitting the scaler on the whole
# dataset before this split, so the test rows contributed to the scaling
# statistics; consider fitting the scaler on X_train only.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df_scaled,
    iris.target,
    test_size=0.3,
    random_state=42,
    stratify=iris.target,
)

# Fit a logistic-regression classifier on the training subset.
model = LogisticRegression()
model.fit(X_train, y_train)

# Report the model's score on the training subset and on the held-out subset.
print("훈련 데이터 점수 : {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수 : {}".format(model.score(X_test, y_test)))

훈련 데이터 점수 : 0.9238095238095239
평가 데이터 점수 : 0.9333333333333333
