In [7]:
# 정규화 (Normalization) 
# 정규화는 데이터의 값들을 [0,1] 사이의 값이 되게 변환

In [8]:
import pandas as pd
# Small demo frame: column A spans [-1, 1], column B spans [2, 18].
lst = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]
df = pd.DataFrame(data=lst, columns=['A', 'B'])
df

Unnamed: 0,A,B
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [9]:
# 1) 수식을 이용해서 직접 구현: (X - Min) / (Max - Min)
def normalize(s):
    """Min-max scale ``s`` so that its values fall in the range [0, 1].

    Computes ``(s - min) / (max - min)``.

    Parameters
    ----------
    s : pandas.Series or pandas.DataFrame
        Numeric data. A DataFrame is scaled column by column.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Scaled data of the same shape as ``s``. Constant data
        (``max == min``) maps to 0.0 rather than NaN.
    """
    lo = s.min()
    span = s.max() - lo
    # A zero range would turn the division into 0/0 -> NaN. Divide by 1
    # instead so that constant data comes out as 0.
    if getattr(span, "ndim", 0) == 0:
        # Series input: min/max are scalars.
        span = span if span != 0 else 1
    else:
        # DataFrame input: min/max are per-column Series.
        span = span.mask(span == 0, 1)
    return (s - lo) / span
    
# normalize() calls .min()/.max() on its argument, so pass the Series
# themselves rather than plain Python lists.
dic = {col: normalize(df[col]) for col in ('A', 'B')}
df1 = pd.DataFrame(data=dic)
df1

Unnamed: 0,A,B
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,1.0,1.0


In [10]:
# 2) MinMaxScaler 이용
from sklearn.preprocessing import MinMaxScaler
# Fit on df to learn each column's min/max, then rescale that same frame.
scaler = MinMaxScaler().fit(df)
arr = scaler.transform(df)
# transform() gives back a plain array; wrap it so the column labels return.
df1 = pd.DataFrame(data=arr, columns=df.columns)
df1

Unnamed: 0,A,B
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,1.0,1.0


In [11]:
# Per-column statistics learned by fit(): minimum, maximum and range (max - min).
(
    scaler.data_min_,
    scaler.data_max_,
    scaler.data_range_,
)

(array([-1.,  2.]), array([ 1., 18.]), array([ 2., 16.]))

In [12]:
# Re-display the input frame: it still holds the original, unscaled values.
df

Unnamed: 0,A,B
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [None]:
# 표준화 (Standardization)
# 표준화는 데이터의 값들을 평균은 0, 분산은 1이 되게 변환하는 것 (Z점수 정규화라고도 함)

In [13]:
# Two columns with the same spread but different centres (B = A + 100).
df = pd.DataFrame(
    data={
        'A': list(range(1, 7)),
        'B': list(range(101, 107)),
    }
)
df

Unnamed: 0,A,B
0,1,101
1,2,102
2,3,103
3,4,104
4,5,105
5,6,106


In [14]:
# 1) 수식을 직접 구현 : (X - 평균) / 표준편차
import numpy as np

def standardization(s):
    """Z-score standardize ``s``: ``(s - mean) / std``.

    The population standard deviation (``ddof=0``) is used, i.e. the data
    is treated as the whole population rather than a sample.

    Parameters
    ----------
    s : pandas.Series or pandas.DataFrame
        Numeric data. A DataFrame is standardized column by column.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Data of the same shape as ``s`` with mean 0 and variance 1.
        Constant data (standard deviation 0) maps to 0.0 rather than NaN.
    """
    centered = s - s.mean()
    sigma = s.std(ddof=0)  # population std: divide by N, not N - 1
    # A zero std would turn the division into 0/0 -> NaN. Divide by 1
    # instead so that constant data comes out as 0.
    if getattr(sigma, "ndim", 0) == 0:
        # Series input: mean/std are scalars.
        sigma = sigma if sigma != 0 else 1
    else:
        # DataFrame input: mean/std are per-column Series.
        sigma = sigma.mask(sigma == 0, 1)
    return centered / sigma

# Standardize each column independently and collect the results in a new frame.
df1 = pd.DataFrame(
    data={col: standardization(df[col]) for col in ('A', 'B')}
)
df1

Unnamed: 0,A,B
0,-1.46385,-1.46385
1,-0.87831,-0.87831
2,-0.29277,-0.29277
3,0.29277,0.29277
4,0.87831,0.87831
5,1.46385,1.46385


In [15]:
from sklearn.preprocessing import StandardScaler
# 2) The same standardization done with scikit-learn's StandardScaler.
scaler = StandardScaler().fit(df)
arr = scaler.transform(df)
# transform() gives back a plain array; wrap it so the column labels return.
df1 = pd.DataFrame(data=arr, columns=df.columns)
df1

Unnamed: 0,A,B
0,-1.46385,-1.46385
1,-0.87831,-0.87831
2,-0.29277,-0.29277
3,0.29277,0.29277
4,0.87831,0.87831
5,1.46385,1.46385


In [16]:
# Per-column mean and variance learned by the fitted scaler.
(
    scaler.mean_,
    scaler.var_,
)

(array([  3.5, 103.5]), array([2.91666667, 2.91666667]))

In [17]:
# Cross-check with pandas: column means and population variance (ddof=0).
(
    df.mean(),
    df.var(ddof=0),
)

(A      3.5
 B    103.5
 dtype: float64,
 A    2.916667
 B    2.916667
 dtype: float64)

In [2]:
import pandas as pd

In [3]:
# Load heart.csv (path is relative to the working directory) and display it.
df = pd.read_csv('heart.csv')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale the 'age' column to [0, 1].
scaler = MinMaxScaler()
# Double brackets keep 'age' as a one-column DataFrame (2-D input for the scaler).
scaler.fit(df[['age']])
arr = scaler.transform(df[['age']])
# transform() returns a bare array; reuse df's index so the scaled column
# stays row-aligned with df.
df1 = pd.DataFrame(arr, columns=['age'], index=df.index)
df1

Unnamed: 0,age
0,0.708333
1,0.166667
2,0.250000
3,0.562500
4,0.583333
...,...
298,0.583333
299,0.333333
300,0.812500
301,0.583333


In [5]:
# Mean of the min-max scaled 'age' column
df1.loc[:, 'age'].mean()

0.5284653465346535

In [6]:
# Standardize every numeric column, then show the result next to the
# remaining (unscaled) columns.
from sklearn.preprocessing import StandardScaler

num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
other_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

df_tgt = df[num_cols]
scaler = StandardScaler()
scaler.fit(df_tgt)
arr = scaler.transform(df_tgt)
# transform() returns a bare array, so re-attach df's index: pd.concat(axis=1)
# aligns rows on index labels and would otherwise mis-pair them (introducing
# NaN) whenever df does not carry the default 0..n-1 index.
df1 = pd.DataFrame(arr, columns=df_tgt.columns, index=df_tgt.index)
df1 = pd.concat([df1, df[other_cols]], axis=1)
df1

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex,cp,fbs,restecg,exang,slope,ca,thal,target
0,0.952197,0.763956,-0.256334,0.015443,1.087338,1,3,1,0,0,0,0,1,1
1,-1.915313,-0.092738,0.072199,1.633471,2.122573,1,2,0,1,0,0,0,2,1
2,-1.474158,-0.092738,-0.816773,0.977514,0.310912,0,1,0,0,0,2,0,2,1
3,0.180175,-0.663867,-0.198357,1.239897,-0.206705,1,1,0,1,0,2,0,2,1
4,0.290464,-0.663867,2.082050,0.583939,-0.379244,0,0,0,1,1,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.290464,0.478391,-0.101730,-1.165281,-0.724323,0,0,0,1,1,1,0,3,0
299,-1.033002,-1.234996,0.342756,-0.771706,0.138373,1,3,0,1,0,1,0,3,0
300,1.503641,0.706843,-1.029353,-0.378132,2.036303,1,0,1,1,0,1,2,3,0
301,0.290464,-0.092738,-2.227533,-1.515125,0.138373,1,0,0,1,1,1,1,3,0
