## 例5-1. One-Hotエンコーディングとダミーコーディングを利用した線形回帰モデリング

In [1]:
import pandas as pd
from sklearn import linear_model

# Toy dataset: apartment rents observed in three cities, three listings per city.
df = pd.DataFrame(
    data={
        'City': ['SF'] * 3 + ['NYC'] * 3 + ['Seattle'] * 3,
        'Rent': [
            3999, 4000, 4001,  # SF
            3499, 3500, 3501,  # NYC
            2499, 2500, 2501,  # Seattle
        ],
    },
)

# Overall mean rent across all nine listings.
df['Rent'].mean()

3333.3333333333335

In [2]:
# Apply one-hot encoding to the categorical City column: one 0/1 indicator
# column per city, prefixed with "city_".
# The generated columns are the features, and Rent the target, of the linear
# regression model fitted in the next cell.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return True/False booleans.
one_hot_df = pd.get_dummies(df, prefix=['city'], dtype=int)
one_hot_df

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1,0
1,4000,0,1,0
2,4001,0,1,0
3,3499,1,0,0
4,3500,1,0,0
5,3501,1,0,0
6,2499,0,0,1
7,2500,0,0,1
8,2501,0,0,1


In [6]:
# Fit a linear regression with all three one-hot columns as features and
# Rent as the target; fit() returns the estimator itself, so the calls chain.
model = linear_model.LinearRegression().fit(
    one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']],
    one_hot_df['Rent'],
)
# One coefficient per feature column, in the order NYC, SF, Seattle.
model.coef_

array([ 166.66666667,  666.66666667, -833.33333333])

In [7]:
# Intercept of the one-hot model fitted in the previous cell. It comes out as
# the overall mean rent (3333.33...), and each coefficient is that city's mean
# rent minus the overall mean.
model.intercept_

3333.3333333333335

In [8]:
# Dummy coding: like one-hot encoding, but drop_first=True drops the indicator
# of the first category (NYC), which becomes the reference category.
# The remaining columns are the features of the linear regression model fitted
# in the next cell.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return True/False booleans.
dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True, dtype=int)
dummy_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1,0
1,4000,1,0
2,4001,1,0
3,3499,0,0
4,3500,0,0
5,3501,0,0
6,2499,0,1
7,2500,0,1
8,2501,0,1


In [9]:
# Refit the existing estimator on the dummy-coded features (NYC is the
# dropped reference category); Rent is the target.
model.fit(
    X=dummy_df[['city_SF', 'city_Seattle']],
    y=dummy_df['Rent'],
)
# One coefficient per feature column, in the order SF, Seattle.
model.coef_

array([  500., -1000.])

In [10]:
# Intercept of the dummy-coded model fitted in the previous cell. It comes out
# as the mean rent of the reference category NYC (3500), and each coefficient
# is that city's mean rent relative to NYC.
model.intercept_

3500.0

## 例5-2. Effectコーディングを用いた線形回帰モデル

In [11]:
# Effect coding: start from the dummy coding and mark the reference category
# (NYC, row labels 3 through 5 inclusive) with -1 in every indicator column.
# astype() returns a new frame, so dummy_df is left untouched. The indicators
# are cast to float explicitly because writing -1.0 into bool/integer columns
# relies on implicit upcasting, which recent pandas versions warn about.
effect_df = dummy_df.astype({'city_SF': float, 'city_Seattle': float})
effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0
effect_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1.0,0.0
1,4000,1.0,0.0
2,4001,1.0,0.0
3,3499,-1.0,-1.0
4,3500,-1.0,-1.0
5,3501,-1.0,-1.0
6,2499,0.0,1.0
7,2500,0.0,1.0
8,2501,0.0,1.0


In [12]:
# Refit the existing estimator on the effect-coded features; Rent is the target.
model.fit(
    X=effect_df[['city_SF', 'city_Seattle']],
    y=effect_df['Rent'],
)
# One coefficient per feature column, in the order SF, Seattle.
model.coef_

array([ 666.66666667, -833.33333333])

In [13]:
# Intercept of the effect-coded model fitted in the previous cell. It comes out
# as the overall mean rent (3333.33...), the same value as with one-hot encoding.
model.intercept_

3333.3333333333335