## 3.5 カテゴリ変数の変換

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the training table and separate the label column from the feature columns.
train = pd.read_csv('../input/sample-data/train.csv')
train_y = train['target']
train_x = train.drop(columns=['target'])

# The test table is used as features as-is (no 'target' column is split off here).
test_x = pd.read_csv('../input/sample-data/test.csv')

In [3]:
# Keep pristine copies of the feature frames so every encoding example below
# can start again from the same unmodified data.
train_x_saved, test_x_saved = train_x.copy(), test_x.copy()

In [4]:
def load_data():
    """Return fresh copies of the saved train and test feature frames.

    Copies are returned so that callers can modify the result freely without
    touching ``train_x_saved`` / ``test_x_saved``.

    Returns:
        tuple: ``(train_x, test_x)`` copied from the module-level saved frames.
    """
    return train_x_saved.copy(), test_x_saved.copy()

In [5]:
# Columns treated as categorical variables by every encoding example in this section.
cat_cols = ['sex', 'product', 'medical_info_b2', 'medical_info_b3']

### 3.5.1 one-hot encoding

In [6]:
# Start from unmodified copies of the features for the get_dummies one-hot example.
train_x, test_x = load_data()

In [7]:
# Remember where the training rows end before the frames are stacked.
n_train = train_x.shape[0]

# One-hot encode train and test together with get_dummies so that both parts
# end up with the same set of dummy columns.
all_x = pd.get_dummies(pd.concat([train_x, test_x]), columns=cat_cols)

# Split the stacked frame back into its training and test parts.
train_x = all_x.iloc[:n_train, :].reset_index(drop=True)
test_x = all_x.iloc[n_train:, :].reset_index(drop=True)

In [8]:
# Reset to unmodified copies so the OneHotEncoder example does not start from
# the frames already transformed by the get_dummies example.
train_x, test_x = load_data()

In [9]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding with OneHotEncoder, with the categories learned from the
# training data only.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one on older releases (which reject the new keyword with TypeError).
try:
    ohe = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    ohe = OneHotEncoder(sparse=False, categories='auto')
ohe.fit(train_x[cat_cols])

# Build the dummy column names as "<column>_<category>", in the same order as
# the encoder's output columns (column by column, category by category).
columns = []
for i, c in enumerate(cat_cols):
    columns += [f'{c}_{v}' for v in ohe.categories_[i]]

# Turn the encoded arrays into data frames. Reusing the source frame's index
# keeps rows aligned in the axis=1 concat below even if the index is not a
# plain 0..n-1 range.
# NOTE: with the encoder's default handle_unknown='error', a category that
# appears only in test_x makes transform raise.
dummy_vals_train = pd.DataFrame(
    ohe.transform(train_x[cat_cols]), columns=columns, index=train_x.index
)
dummy_vals_test = pd.DataFrame(
    ohe.transform(test_x[cat_cols]), columns=columns, index=test_x.index
)

# Replace the original categorical columns with their dummy columns.
train_x = pd.concat([train_x.drop(cat_cols, axis=1), dummy_vals_train], axis=1)
test_x = pd.concat([test_x.drop(cat_cols, axis=1), dummy_vals_test], axis=1)



### 3.5.2 label encoding

In [10]:
# Start the label-encoding example from unmodified copies of the features.
train_x, test_x = load_data()

In [11]:
from sklearn.preprocessing import LabelEncoder

# Label-encode each categorical column in turn.
for c in cat_cols:
    # The label mapping is defined from the training data only and then
    # applied to both the training and the test column.
    le = LabelEncoder().fit(train_x[c])
    train_x[c] = le.transform(train_x[c])
    test_x[c] = le.transform(test_x[c])

### 3.5.3 feature hashing

In [12]:
# Start the feature-hashing example from unmodified copies of the features.
train_x, test_x = load_data()

In [13]:
from sklearn.feature_extraction import FeatureHasher

# Number of hashed output columns generated per categorical variable.
n_hash_features = 5

# Apply feature hashing to each categorical column in turn.
for c in cat_cols:
    # FeatureHasher is used a little differently from the other encoders:
    # transform is called directly, with no fit step.
    fh = FeatureHasher(n_features=n_hash_features, input_type='string')
    hash_cols = [f'{c}_{i}' for i in range(n_hash_features)]

    # Convert the column to strings before hashing; each row is passed as a
    # one-element sequence of strings.
    hash_train = fh.transform(train_x[[c]].astype(str).values)
    hash_test = fh.transform(test_x[[c]].astype(str).values)

    # Convert the sparse results to data frames. Reusing the source index
    # keeps rows aligned in the axis=1 concat below.
    hash_train = pd.DataFrame(hash_train.toarray(), columns=hash_cols, index=train_x.index)
    hash_test = pd.DataFrame(hash_test.toarray(), columns=hash_cols, index=test_x.index)

    # Append the hashed columns to the feature frames.
    train_x = pd.concat([train_x, hash_train], axis=1)
    test_x = pd.concat([test_x, hash_test], axis=1)

# Drop the original categorical columns now that their hashed versions exist;
# otherwise the raw categoricals would remain alongside the hashed features.
train_x = train_x.drop(cat_cols, axis=1)
test_x = test_x.drop(cat_cols, axis=1)

### 3.5.4 frequency encoding

In [14]:
# Start the frequency-encoding example from unmodified copies of the features.
train_x, test_x = load_data()

In [15]:
# Frequency encoding: loop over the categorical columns.
for c in cat_cols:
    # Occurrence count of each category, measured on the training data.
    freq = train_x[c].value_counts()
    # Replace every category by its training-set count in both frames.
    train_x[c], test_x[c] = train_x[c].map(freq), test_x[c].map(freq)

### 3.5.5 target encoding

In [16]:
# Start the target-encoding example from unmodified copies of the features.
train_x, test_x = load_data()

In [17]:
from sklearn.model_selection import KFold

# Target encoding, one categorical column at a time.
for c in cat_cols:
    # Pair the column with the target so per-category target means can be taken.
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})

    # Test rows are encoded with the per-category target mean computed over
    # the whole training set.
    test_x[c] = test_x[c].map(data_tmp.groupby(c)['target'].mean())

    # Buffer for the encoded training values, filled in fold by fold.
    tmp = np.repeat(np.nan, train_x.shape[0])

    # Training rows are encoded out-of-fold: each held-out part (idx_2) gets
    # the category means computed on the remaining rows (idx_1) only.
    kf = KFold(n_splits=4, shuffle=True, random_state=72)
    for idx_1, idx_2 in kf.split(train_x):
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)

    # Replace the original column with its encoded values.
    train_x[c] = tmp