### 모듈 불러오기

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import boxcox
from scipy.stats import yeojohnson
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import seaborn as sns

### 데이터 불러오기

In [2]:
df = pd.read_csv("C:/Users/gun67/전력 프로젝트/preprocessing_data/2015_preprocessing_data", encoding = "utf-8-sig")
df

Unnamed: 0,Year-Month-Date,Time,Watt,Temp('C),Humidity(%)
0,2015-01-01,[ 00:00 ],190.0,6.0,76.0
1,2015-01-01,[ 00:01 ],189.0,6.0,76.0
2,2015-01-01,[ 00:02 ],190.0,6.0,76.0
3,2015-01-01,[ 00:03 ],190.0,6.0,76.0
4,2015-01-01,[ 00:04 ],189.0,6.0,76.0
...,...,...,...,...,...
525595,2015-12-31,[ 23:55 ],2612.0,7.0,76.0
525596,2015-12-31,[ 23:56 ],2613.0,7.0,76.0
525597,2015-12-31,[ 23:57 ],2608.0,7.0,76.0
525598,2015-12-31,[ 23:58 ],2618.0,7.0,76.0


### Min-Max 스케일링  
##### 데이터의 최솟값을 0, 최댓값을 1로 변환 ; 데이터의 분포를 유지하면서 스케일을 변경

In [3]:
scaler = MinMaxScaler()
df["Watt_minmax"] = scaler.fit_transform(df[["Watt"]])
df["Temp('C)_minmax"] = scaler.fit_transform(df[["Temp('C)"]])
df["Humidity(%)_minmax"] = scaler.fit_transform(df[["Humidity(%)"]])
df_minmax = df[["Year-Month-Date", "Time", "Watt_minmax", "Temp('C)_minmax", "Humidity(%)_minmax"]]
print(df_minmax)

       Year-Month-Date       Time  Watt_minmax  Temp('C)_minmax  \
0           2015-01-01  [ 00:00 ]     0.015171         0.222222   
1           2015-01-01  [ 00:01 ]     0.015041         0.222222   
2           2015-01-01  [ 00:02 ]     0.015171         0.222222   
3           2015-01-01  [ 00:03 ]     0.015171         0.222222   
4           2015-01-01  [ 00:04 ]     0.015041         0.222222   
...                ...        ...          ...              ...   
525595      2015-12-31  [ 23:55 ]     0.329227         0.250000   
525596      2015-12-31  [ 23:56 ]     0.329357         0.250000   
525597      2015-12-31  [ 23:57 ]     0.328709         0.250000   
525598      2015-12-31  [ 23:58 ]     0.330005         0.250000   
525599      2015-12-31  [ 23:59 ]     0.035140         0.250000   

        Humidity(%)_minmax  
0                  0.66605  
1                  0.66605  
2                  0.66605  
3                  0.66605  
4                  0.66605  
...                  

### 로버스트 스케일링

##### 중앙값(median)을 0, IQR을 1로 변환 ; 이상치에 덜 민감

In [4]:
scaler = RobustScaler()
df["Watt_robust"] = scaler.fit_transform(df[["Watt"]])
df["Temp('C)_robust"] = scaler.fit_transform(df[["Temp('C)"]])
df["Humidity(%)_robust"] = scaler.fit_transform(df[["Humidity(%)"]])
df_robust = df[["Year-Month-Date", "Time", "Watt_robust", "Temp('C)_robust", "Humidity(%)_robust"]]
print(df_robust)

       Year-Month-Date       Time  Watt_robust  Temp('C)_robust  \
0           2015-01-01  [ 00:00 ]    -0.399123        -1.000000   
1           2015-01-01  [ 00:01 ]    -0.403509        -1.000000   
2           2015-01-01  [ 00:02 ]    -0.399123        -1.000000   
3           2015-01-01  [ 00:03 ]    -0.399123        -1.000000   
4           2015-01-01  [ 00:04 ]    -0.403509        -1.000000   
...                ...        ...          ...              ...   
525595      2015-12-31  [ 23:55 ]    10.223684        -0.857143   
525596      2015-12-31  [ 23:56 ]    10.228070        -0.857143   
525597      2015-12-31  [ 23:57 ]    10.206140        -0.857143   
525598      2015-12-31  [ 23:58 ]    10.250000        -0.857143   
525599      2015-12-31  [ 23:59 ]     0.276316        -0.857143   

        Humidity(%)_robust  
0                     0.17  
1                     0.17  
2                     0.17  
3                     0.17  
4                     0.17  
...                  

### 로그 변환

##### 데이터의 분포를 정규분포에 가깝게 만듦, 값이 0보다 클 때만 사용 가능  
##### 온도 데이터는 값이 0보다 작거나 같을 경우가 존재하므로 다른 정규화를 사용

In [5]:
df["Watt_log"] = np.log(df["Watt"] + 1)
df["Humidity(%)_log"] = np.log(df["Humidity(%)"] + 1)

# 전력값과 습도는 log 변환, 온도는 minmax 변환
df_log_minmax = df[["Year-Month-Date", "Time", "Watt_log", "Temp('C)_minmax", "Humidity(%)_log"]]
print(df_log_minmax)

# 전력값과 습도는 log 변환, 온도는 robust 변환
df_log_robust = df[["Year-Month-Date", "Time", "Watt_log", "Temp('C)_robust", "Humidity(%)_log"]]
print(df_log_robust)

       Year-Month-Date       Time  Watt_log  Temp('C)_minmax  Humidity(%)_log
0           2015-01-01  [ 00:00 ]  5.252273         0.222222         4.343805
1           2015-01-01  [ 00:01 ]  5.247024         0.222222         4.343805
2           2015-01-01  [ 00:02 ]  5.252273         0.222222         4.343805
3           2015-01-01  [ 00:03 ]  5.252273         0.222222         4.343805
4           2015-01-01  [ 00:04 ]  5.247024         0.222222         4.343805
...                ...        ...       ...              ...              ...
525595      2015-12-31  [ 23:55 ]  7.868254         0.250000         4.343805
525596      2015-12-31  [ 23:56 ]  7.868637         0.250000         4.343805
525597      2015-12-31  [ 23:57 ]  7.866722         0.250000         4.343805
525598      2015-12-31  [ 23:58 ]  7.870548         0.250000         4.343805
525599      2015-12-31  [ 23:59 ]  5.843544         0.250000         4.343805

[525600 rows x 5 columns]
       Year-Month-Date       Time  Wa

### 제곱근 변환

##### 데이터의 분포를 정규분포에 가깝게 만듦, 값이 0보다 클 때만 사용 가능  
##### 이 또한 온도 데이터는 다른 정규화를 사용

In [6]:
df["Watt_sqrt"] = np.sqrt(df["Watt"])
df["Humidity(%)_sqrt"] = np.sqrt(df["Watt"])

# 전력값과 습도는 sqrt 변환, 온도는 minmax 변환
df_sqrt_minmax = df[["Year-Month-Date", "Time", "Watt_sqrt", "Temp('C)_minmax", "Humidity(%)_sqrt"]]
print(df_sqrt_minmax)

# 전력값과 습도는 sqrt 변환, 온도는 robust 변환
df_sqrt_robust = df[["Year-Month-Date", "Time", "Watt_sqrt", "Temp('C)_robust", "Humidity(%)_sqrt"]]
print(df_sqrt_robust)

       Year-Month-Date       Time  Watt_sqrt  Temp('C)_minmax  \
0           2015-01-01  [ 00:00 ]  13.784049         0.222222   
1           2015-01-01  [ 00:01 ]  13.747727         0.222222   
2           2015-01-01  [ 00:02 ]  13.784049         0.222222   
3           2015-01-01  [ 00:03 ]  13.784049         0.222222   
4           2015-01-01  [ 00:04 ]  13.747727         0.222222   
...                ...        ...        ...              ...   
525595      2015-12-31  [ 23:55 ]  51.107729         0.250000   
525596      2015-12-31  [ 23:56 ]  51.117512         0.250000   
525597      2015-12-31  [ 23:57 ]  51.068581         0.250000   
525598      2015-12-31  [ 23:58 ]  51.166395         0.250000   
525599      2015-12-31  [ 23:59 ]  18.547237         0.250000   

        Humidity(%)_sqrt  
0              13.784049  
1              13.747727  
2              13.784049  
3              13.784049  
4              13.747727  
...                  ...  
525595         51.107729  
525

In [7]:
df_minmax.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_minmax_data", index = False, encoding = "utf-8-sig")

In [8]:
df_robust.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_robust_data", index = False, encoding = "utf-8-sig")

In [9]:
df_log_minmax.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_log_minmax_data", index = False, encoding = "utf-8-sig")

In [10]:
df_log_robust.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_log_robust_data", index = False, encoding = "utf-8-sig")

In [11]:
df_sqrt_minmax.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_sqrt_minmax_data", index = False, encoding = "utf-8-sig")

In [12]:
df_sqrt_robust.to_csv("C:/Users/gun67/전력 프로젝트/2015_normalization_data/2015_sqrt_robust_data", index = False, encoding = "utf-8-sig")