# 範例 : (Kaggle)房價預測
***
- 以下用房價預測資料, 觀察均值編碼的效果

# [教學目標]
- 以下用房價預測資料, 觀察均值編碼的效果

# [範例重點]
- 觀察標籤編碼與均值編碼, 在特徵數量 / 線性迴歸分數 / 線性迴歸時間上, 分別有什麼影響 (In[3], Out[3], In[12], Out[12])
- 觀察標籤編碼與均值編碼, 在特徵數量 / 梯度提升樹分數 / 梯度提升樹時間上, 分別有什麼影響 (In[13], Out[13], In[14], Out[14])

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder

# Directory holding the compressed train / test CSV files.
data_path = '../data/'
df_train = pd.read_csv(data_path + 'house_train.csv.gz')
df_test = pd.read_csv(data_path + 'house_test.csv.gz')

# Keep the test ids and the log1p-transformed target before those columns are dropped.
ids = df_test['Id']
train_Y = np.log1p(df_train['SalePrice'])

# Remove the identifier (and, for train, the target) so both frames share the same feature columns.
df_train = df_train.drop(columns=['Id', 'SalePrice'])
df_test = df_test.drop(columns=['Id'])

# Stack train on top of test so later encodings see every category value.
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [2]:
# Collect the names of the categorical (object-dtype) columns into object_features.
object_features = [feature for feature, dtype in df.dtypes.items() if dtype == 'object']
# These are object-dtype columns, so label them as such (the previous label said "Numeric").
print(f'{len(object_features)} Object Features : {object_features}\n')

# Keep only the categorical columns and replace missing values with the literal string 'None'.
df = df[object_features]
df = df.fillna('None')
# Number of training rows: the first train_num rows of df come from the training set.
train_num = train_Y.shape[0]
df.head()

43 Numeric Features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']



Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [3]:
# Baseline: label encoding + linear regression.
# Each categorical column gets its own LabelEncoder, fitted on train and test rows together.
df_temp = pd.DataFrame({c: LabelEncoder().fit_transform(df[c]) for c in df.columns})
# Only the first train_num rows have a target, so only those are used for scoring.
train_X = df_temp.iloc[:train_num]
estimator = LinearRegression()
# The timer covers the shape print and the 5-fold cross-validation.
start = time.time()
print(f'shape : {train_X.shape}')
print(f'score : {cross_val_score(estimator, train_X, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.66156068668513
time : 0.07907223701477051 sec


In [4]:
# List the categorical column names that remain in df.
df.columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [5]:
# Put the training rows of the categorical features side by side with the log-transformed target.
data = pd.concat([df.iloc[:train_num], train_Y], axis='columns')
data

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.247699
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,RFn,TA,TA,Y,,,,WD,Normal,12.109016
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.317171
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Unf,TA,TA,Y,,,,WD,Abnorml,11.849405
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.429220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.072547
1456,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,Unf,TA,TA,Y,,MnPrv,,WD,Normal,12.254868
1457,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal,12.493133
1458,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Unf,TA,TA,Y,,,,WD,Normal,11.864469


In [6]:
# Group the rows of data by the categories found in LotShape and average SalePrice within each group.
c = 'LotShape'
data.groupby([c])['SalePrice'].mean()

LotShape
IR1    12.163471
IR2    12.318455
IR3    12.205419
Reg    11.936101
Name: SalePrice, dtype: float64

In [7]:
# Build a small lookup table: one row per LotShape category, with the category as a regular column
# (not the index) next to its mean SalePrice.
mean_df = data.groupby(['LotShape'], as_index=False)['SalePrice'].mean()
mean_df

Unnamed: 0,LotShape,SalePrice
0,IR1,12.163471
1,IR2,12.318455
2,IR3,12.205419
3,Reg,11.936101


In [8]:
# Show the column labels before and after renaming the averaged target column to 'LotShape_mean'.
print(mean_df.columns)
mean_df = mean_df.rename(columns={'SalePrice': 'LotShape_mean'})
print(mean_df.columns)
mean_df

Index(['LotShape', 'SalePrice'], dtype='object')
Index(['LotShape', 'LotShape_mean'], dtype='object')


Unnamed: 0,LotShape,LotShape_mean
0,IR1,12.163471
1,IR2,12.318455
2,IR3,12.205419
3,Reg,11.936101


In [9]:
# Left-join the lookup table onto data using the shared 'LotShape' key column;
# every row of data is kept and gains the matching LotShape_mean value.
data = data.merge(mean_df, on='LotShape', how='left')
data

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice,LotShape_mean
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,TA,TA,Y,,,,WD,Normal,12.247699,11.936101
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,TA,TA,Y,,,,WD,Normal,12.109016,11.936101
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,TA,TA,Y,,,,WD,Normal,12.317171,12.163471
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,TA,TA,Y,,,,WD,Abnorml,11.849405,12.163471
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,TA,TA,Y,,,,WD,Normal,12.429220,12.163471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,TA,TA,Y,,,,WD,Normal,12.072547,11.936101
1456,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,TA,TA,Y,,MnPrv,,WD,Normal,12.254868,11.936101
1457,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,TA,TA,Y,,GdPrv,Shed,WD,Normal,12.493133,11.936101
1458,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,TA,TA,Y,,,,WD,Normal,11.864469,11.936101


In [10]:
# The original category column is no longer needed once LotShape_mean has been merged in.
data = data.drop(columns=['LotShape'])
data

Unnamed: 0,MSZoning,Street,Alley,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice,LotShape_mean
0,RL,Pave,,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,TA,Y,,,,WD,Normal,12.247699,11.936101
1,RL,Pave,,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,TA,Y,,,,WD,Normal,12.109016,11.936101
2,RL,Pave,,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,TA,Y,,,,WD,Normal,12.317171,12.163471
3,RL,Pave,,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,TA,TA,Y,,,,WD,Abnorml,11.849405,12.163471
4,RL,Pave,,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,TA,TA,Y,,,,WD,Normal,12.429220,12.163471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,TA,TA,Y,,,,WD,Normal,12.072547,11.936101
1456,RL,Pave,,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,TA,TA,Y,,MnPrv,,WD,Normal,12.254868,11.936101
1457,RL,Pave,,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,TA,TA,Y,,GdPrv,Shed,WD,Normal,12.493133,11.936101
1458,RL,Pave,,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,TA,TA,Y,,,,WD,Normal,11.864469,11.936101


In [11]:
# Remove the target column so that data holds features only.
data = data.drop(columns=['SalePrice'])
data

Unnamed: 0,MSZoning,Street,Alley,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,LotShape_mean
0,RL,Pave,,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,11.936101
1,RL,Pave,,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,11.936101
2,RL,Pave,,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.163471
3,RL,Pave,,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Unf,TA,TA,Y,,,,WD,Abnorml,12.163471
4,RL,Pave,,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,12.163471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,RFn,TA,TA,Y,,,,WD,Normal,11.936101
1456,RL,Pave,,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,Unf,TA,TA,Y,,MnPrv,,WD,Normal,11.936101
1457,RL,Pave,,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal,11.936101
1458,RL,Pave,,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,Unf,TA,TA,Y,,,,WD,Normal,11.936101


In [12]:
# Mean (target) encoding + linear regression.
# NOTE(review): the category means are computed from all training rows before cross-validation,
# so each fold's validation rows contribute to their own encoding; the score is likely optimistic.
data = pd.concat([df[:train_num], train_Y], axis=1)
for c in df.columns:
    # Mean target per category of column c, as a two-column lookup table.
    mean_df = data.groupby([c])['SalePrice'].mean().reset_index()
    mean_df.columns = [c, f'{c}_mean']
    # Left join keeps every row of data and attaches the matching category mean.
    data = pd.merge(data, mean_df, on=c, how='left')
    # Drop the raw category column; only its mean-encoded version remains.
    data = data.drop([c], axis=1)
data = data.drop(['SalePrice'], axis=1)
estimator = LinearRegression()
start = time.time()
# Report the shape of the mean-encoded matrix that is actually scored below
# (previously this printed train_X, the label-encoded matrix left over from an earlier cell).
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')
data

shape : (1460, 43)
score : 0.7624230403716951
time : 0.025021791458129883 sec


Unnamed: 0,MSZoning_mean,Street_mean,Alley_mean,LotShape_mean,LandContour_mean,Utilities_mean,LotConfig_mean,LandSlope_mean,Neighborhood_mean,Condition1_mean,...,GarageType_mean,GarageFinish_mean,GarageQual_mean,GarageCond_mean,PavedDrive_mean,PoolQC_mean,Fence_mean,MiscFeature_mean,SaleType_mean,SaleCondition_mean
0,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,12.163647,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040
1,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.034748,12.020571,12.344180,11.818342,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040
2,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.001906,12.020571,12.163647,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040
3,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.027452,12.020571,12.206664,12.042923,...,11.765651,11.818982,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,11.788783
4,12.085891,12.025529,12.037682,12.163471,12.0227,12.024189,12.034748,12.020571,12.676003,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,12.155809,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040
1456,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,12.130614,12.042923,...,12.160783,11.818982,12.068546,12.070489,12.059901,12.022122,11.850604,12.030119,11.991068,12.005040
1457,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,12.206664,12.042923,...,12.160783,12.172678,12.068546,12.070489,12.059901,12.022122,12.057119,11.868394,11.991068,12.005040
1458,12.085891,12.025529,12.037682,11.936101,12.0227,12.024189,12.001906,12.020571,11.868052,12.042923,...,12.160783,11.818982,12.068546,12.070489,12.059901,12.022122,12.058605,12.030119,11.991068,12.005040


In [13]:
# Baseline: label encoding + gradient boosting trees.
# Each categorical column gets its own LabelEncoder, fitted on train and test rows together.
df_temp = pd.DataFrame({c: LabelEncoder().fit_transform(df[c]) for c in df.columns})
# Only the first train_num rows have a target, so only those are used for scoring.
train_X = df_temp.iloc[:train_num]
estimator = GradientBoostingRegressor()
# The timer covers the shape print and the 5-fold cross-validation.
start = time.time()
print(f'shape : {train_X.shape}')
print(f'score : {cross_val_score(estimator, train_X, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.7776993714795516
time : 0.7060317993164062 sec


In [14]:
# Mean (target) encoding + gradient boosting trees.
# NOTE(review): the category means are computed from all training rows before cross-validation,
# so each fold's validation rows contribute to their own encoding; the score is likely optimistic.
data = pd.concat([df[:train_num], train_Y], axis=1)
for c in df.columns:
    # Mean target per category of column c, as a two-column lookup table.
    mean_df = data.groupby([c])['SalePrice'].mean().reset_index()
    mean_df.columns = [c, f'{c}_mean']
    # Left join keeps every row of data and attaches the matching category mean.
    data = pd.merge(data, mean_df, on=c, how='left')
    # Drop the raw category column; only its mean-encoded version remains.
    data = data.drop([c], axis=1)
data = data.drop(['SalePrice'], axis=1)
estimator = GradientBoostingRegressor()
start = time.time()
# Report the shape of the mean-encoded matrix that is actually scored below
# (previously this printed train_X, the label-encoded matrix left over from the previous cell).
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.8056375372615273
time : 0.6670510768890381 sec


# 作業1
* 請仿照範例，將鐵達尼範例中的類別型特徵改用均值編碼實作一次

# 作業2
* 觀察鐵達尼生存預測中，均值編碼與標籤編碼兩者比較，哪一個效果比較好? 可能的原因是什麼?