In [299]:
import numpy as numpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [300]:
# Read train_preprocessed.csv, then split it into the label column and the feature columns.
train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])

# The test file is loaded as-is; nothing is dropped from it.
test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

In [301]:
# Snapshot both feature frames so that later cells can restart from these copies.
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()

In [302]:
def load_data():
    """Return fresh copies of the saved train/test feature frames.

    Reads the module-level ``train_x_saved`` and ``test_x_saved`` and calls
    ``.copy()`` on each, so callers receive new objects rather than the saved
    ones themselves.

    Returns:
        tuple: ``(train_x, test_x)`` -- the two copies, in that order.
    """
    fresh_train = train_x_saved.copy()
    fresh_test = test_x_saved.copy()
    return fresh_train, fresh_test

In [303]:
# Names of the numeric columns that the scaling / transformation cells operate on
num_cols = [
    'age',
    'height',
    'weight',
    'amount',
    'medical_info_a1',
    'medical_info_a2',
    'medical_info_a3',
    'medical_info_b1',
]

In [304]:
# Rebind train_x / test_x to fresh copies of the saved feature frames before transforming them.
train_x, test_x = load_data()

### 3.4.1 標準化(standardization)

In [305]:
from sklearn.preprocessing import StandardScaler

In [306]:
# Define standardization for several columns at once, fitted on the training data only.
scaler = StandardScaler()
scaler.fit(train_x.loc[:, num_cols])

In [307]:
# Apply the scaler fitted on the training data to both frames, overwriting the numeric columns.
for frame in (train_x, test_x):
    frame[num_cols] = scaler.transform(frame[num_cols])

↑ fitメソッドで学習データから計算した平均と標準偏差を用いて，学習データとテストデータの両方を変換している

### 3.4.2 Min-Max スケーリング

In [308]:
from sklearn.preprocessing import MinMaxScaler

In [309]:
# Start again from the saved features: train_x / test_x were overwritten in place by the
# standardization cells, so without this reset the scaler would be fitted on z-scores and
# its stored minimum / maximum would not describe the original columns.
train_x, test_x = load_data()

# Define Min-Max scaling for several columns at once, fitted on the training data only.
scaler = MinMaxScaler()
scaler.fit(train_x[num_cols])

In [310]:
# Rescale the numeric columns of both frames with the scaler fitted on the training data.
for frame in (train_x, test_x):
    frame[num_cols] = scaler.transform(frame[num_cols])

### 3.4.3 非線形変換

In [311]:
# Sample values spanning several orders of magnitude
x = np.array([1e0, 1e1, 1e2, 1e3, 1e4])

# Plain natural logarithm
x1 = np.log(x)

# Add 1 first, then take the logarithm: log(x + 1)
x2 = np.log1p(x)

# Take the logarithm of the absolute value, then restore the original sign
magnitude = np.abs(x)
x3 = np.sign(x) * np.log(magnitude)

↓ Box-Cox変換

In [312]:
# Reset the features through the notebook's own helper instead of re-reading both CSV files:
# load_data() returns copies of the frames saved right after the initial load, so this undoes
# the in-place scaling above without repeating the disk reads. `train` and `train_y` from the
# initial load cell are left as they are.
train_x, test_x = load_data()

In [313]:
# Show the per-column maximum of the numeric training features.
train_x[num_cols].max(axis=0)

age                7.900000e+01
height             1.970077e+02
weight             1.139179e+02
amount             1.000000e+07
medical_info_a1    7.060000e+02
medical_info_a2    6.560000e+02
medical_info_a3    9.000000e+00
medical_info_b1    1.900000e+01
dtype: float64

In [314]:
# Collect the columns whose values are strictly positive in BOTH train and test;
# only these are used as Box-Cox targets.
# Note: to also accept columns containing missing values, the check has to be written
# as (~(train_x[c] <= 0.0)).all() or similar, because NaN > 0.0 evaluates to False.
pos_cols = []
for col in num_cols:
    if (train_x[col] > 0.0).all() and (test_x[col] > 0.0).all():
        pos_cols.append(col)

In [315]:
from sklearn.preprocessing import PowerTransformer

# Fit a Box-Cox transform for the strictly positive columns, using the training data only.
pt = PowerTransformer(method='box-cox').fit(train_x[pos_cols])

# Overwrite those columns in both frames with their transformed values.
for frame in (train_x, test_x):
    frame[pos_cols] = pt.transform(frame[pos_cols])

↓ Yeo-Johnson 変換

In [316]:
# Rebind train_x / test_x to fresh copies of the saved feature frames, discarding the changes made above.
train_x, test_x = load_data()

In [317]:
# Fit a Yeo-Johnson transform for all numeric columns, using the training data only.
pt = PowerTransformer(method='yeo-johnson').fit(train_x[num_cols])

# Overwrite those columns in both frames with their transformed values.
for frame in (train_x, test_x):
    frame[num_cols] = pt.transform(frame[num_cols])