In [1]:
#데이터 전처리: 데이터 정규화와 표준화 (비선형 변환 포함)

In [3]:
#학습에 필요한 scikit-learn 설치
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
data = {
    '특성1' : [10,20,30,40,50],
    '특성2': [1,2,3,4,5]
}

In [15]:
df = pd.DataFrame(data)

In [25]:
# Min-Max 정규화
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [27]:
normalized_df

Unnamed: 0,특성1,특성2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


In [30]:
#데이터 표준화(Standardization)
#표준화(Standardization)는 데이터를 평균이 0, 표준편차가 1이 되도록 변환하는 과정.
#이는 정규분포를 가정한 많은 분석 기법에 유리

In [32]:
#Z-점수 표준화 : 데이터에서 평균을 뺴고 표준편차로 나누어, 모든 데이터가 표준 정규분포(평균 0, 표준편차 1)를 따르도록 만듦.

In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
standardized_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [42]:
standardized_df

Unnamed: 0,특성1,특성2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


In [44]:
standardized_df.describe()

Unnamed: 0,특성1,특성2
count,5.0,5.0
mean,0.0,0.0
std,1.118034,1.118034
min,-1.414214,-1.414214
25%,-0.707107,-0.707107
50%,0.0,0.0
75%,0.707107,0.707107
max,1.414214,1.414214


In [52]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(url)
age_data = titanic[['Age']]

#결측치 제거
age_data = age_data.dropna()

In [54]:
age_data

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
885,39.0
886,27.0
887,19.0
889,26.0


In [56]:
scaler = StandardScaler()
age_scaled = scaler.fit_transform(age_data)

In [60]:
age_scaled_df = pd.DataFrame(age_scaled, columns=['age_scaled'])

In [62]:
age_scaled_df

Unnamed: 0,age_scaled
0,-0.530377
1,0.571831
2,-0.254825
3,0.365167
4,0.365167
...,...
709,0.640719
710,-0.185937
711,-0.737041
712,-0.254825


In [64]:
age_scaled_df.describe()

Unnamed: 0,age_scaled
count,714.0
mean,2.338621e-16
std,1.000701
min,-2.016979
25%,-0.6595416
50%,-0.1170488
75%,0.571831
max,3.465126


In [67]:
#비선형 변환(Non-linear Transformation)

In [69]:
import numpy as np

In [71]:
df['특성1_log'] = np.log(df['특성1'])

In [73]:
df

Unnamed: 0,특성1,특성2,특성1_log
0,10,1,2.302585
1,20,2,2.995732
2,30,3,3.401197
3,40,4,3.688879
4,50,5,3.912023


In [75]:
df.describe()

Unnamed: 0,특성1,특성2,특성1_log
count,5.0,5.0,5.0
mean,30.0,3.0,3.260083
std,15.811388,1.581139,0.635509
min,10.0,1.0,2.302585
25%,20.0,2.0,2.995732
50%,30.0,3.0,3.401197
75%,40.0,4.0,3.688879
max,50.0,5.0,3.912023


In [78]:
#제곱근 변환

In [80]:
df['특성1_sqrt'] = np.sqrt(df['특성1'])

In [82]:
df

Unnamed: 0,특성1,특성2,특성1_log,특성1_sqrt
0,10,1,2.302585,3.162278
1,20,2,2.995732,4.472136
2,30,3,3.401197,5.477226
3,40,4,3.688879,6.324555
4,50,5,3.912023,7.071068


In [85]:
#박스-콕스(Box-Cox) 변환
#박스-콕스 변환은 다양한 형태의 데이터 분포를 정규분포에 가깝게 변환하기 위해 사용.
#이 방법은 양수 데이터에서만 사용 가능.

In [93]:
#scipy 라이브러리 설치
!pip install scipy



In [95]:
from scipy.stats import boxcox

In [97]:
# 박스-콕스 변환
df['특성1_boxcox'], _ = boxcox(df['특성1'])

In [99]:
df

Unnamed: 0,특성1,특성2,특성1_log,특성1_sqrt,특성1_boxcox
0,10,1,2.302585,3.162278,5.651388
1,20,2,2.995732,4.472136,10.008082
2,30,3,3.401197,5.477226,13.708444
3,40,4,3.688879,6.324555,17.038108
4,50,5,3.912023,7.071068,20.116745


In [101]:
from sklearn.preprocessing import RobustScaler

In [103]:
scaler = RobustScaler()

In [105]:
scaled_df = scaler.fit_transform(df)

In [107]:
scaled_df = pd.DataFrame(scaled_df, columns = df.columns)
scaled_df

Unnamed: 0,특성1,특성2,특성1_log,특성1_sqrt,특성1_boxcox
0,-1.0,-1.0,-1.584963,-1.249689,-1.146092
1,-0.5,-0.5,-0.584963,-0.542582,-0.526365
2,0.0,0.0,0.0,0.0,0.0
3,0.5,0.5,0.415037,0.457418,0.473635
4,1.0,1.0,0.736966,0.860411,0.911561
