# 範例 : (Kaggle)房價預測
***
- 以下用房價預測資料, 觀察均值編碼的效果

In [1]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder

# Read the gzip-compressed train and test CSV files from the data folder.
data_path = 'data/'
df_train = pd.read_csv(data_path + 'house_train.csv.gz')
df_test = pd.read_csv(data_path + 'house_test.csv.gz')

# The regression target is log(1 + SalePrice); the test Ids are kept aside in `ids`.
train_Y = np.log1p(df_train['SalePrice'])
ids = df_test['Id']
# Remove the identifier column (and the target from the training frame).
df_train = df_train.drop(columns=['Id', 'SalePrice'])
df_test = df_test.drop(columns=['Id'])
# Stack the training rows on top of the test rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# test rows reuse the training rows' labels, and any later index-aligned operation
# on the full frame would hit duplicate labels.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [2]:
# Collect the names of the categorical (object-dtype) columns in object_features.
object_features = [
    feature for dtype, feature in zip(df.dtypes, df.columns) if dtype == 'object'
]
# These are object-dtype columns, so the printed label names them as such.
print(f'{len(object_features)} Object Features : {object_features}\n')

# Keep only the categorical columns; missing values become their own 'None' category.
df = df[object_features]
df = df.fillna('None')
# Number of training rows, used later to slice the training part off the top of df.
train_num = train_Y.shape[0]
df.head()

43 Numeric Features : ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']



Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [3]:
# Baseline: label encoding + linear regression
# Every column gets its own LabelEncoder, fitted on all rows of df.
df_temp = pd.DataFrame({
    col: LabelEncoder().fit_transform(df[col]) for col in df.columns
})
# Only the first train_num rows are used for fitting and scoring.
train_X = df_temp.iloc[:train_num]
estimator = LinearRegression()
# The timer covers the shape print and the 5-fold cross-validation below.
start = time.time()
print(f'shape : {train_X.shape}')
print(f'score : {cross_val_score(estimator, train_X, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.6615606866851301
time : 0.029255151748657227 sec


In [6]:
# Mean (target) encoding + linear regression
# Each categorical column is replaced by the mean target value of its category,
# stored in a column named '<column>_mean'.
# NOTE: the category means are computed from all training rows before
# cross-validation splits them, so every validation fold's own targets feed its
# features; the reported score is therefore optimistic.
train_cat = df[:train_num]
data = pd.DataFrame({
    f'{c}_mean': train_Y.groupby(train_cat[c]).transform('mean')
    for c in train_cat.columns
})
estimator = LinearRegression()
start = time.time()
# Report the shape of the frame that is actually fitted (data, not train_X).
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

     Street Alley LotShape LandContour Utilities LotConfig LandSlope  \
0      Pave  None      Reg         Lvl    AllPub    Inside       Gtl   
1      Pave  None      Reg         Lvl    AllPub       FR2       Gtl   
2      Pave  None      IR1         Lvl    AllPub    Inside       Gtl   
3      Pave  None      IR1         Lvl    AllPub    Corner       Gtl   
4      Pave  None      IR1         Lvl    AllPub       FR2       Gtl   
5      Pave  None      IR1         Lvl    AllPub    Inside       Gtl   
6      Pave  None      Reg         Lvl    AllPub    Inside       Gtl   
7      Pave  None      IR1         Lvl    AllPub    Corner       Gtl   
8      Pave  None      Reg         Lvl    AllPub    Inside       Gtl   
9      Pave  None      Reg         Lvl    AllPub    Corner       Gtl   
10     Pave  None      Reg         Lvl    AllPub    Inside       Gtl   
11     Pave  None      IR1         Lvl    AllPub    Inside       Gtl   
12     Pave  None      IR2         Lvl    AllPub    Inside      

     HouseStyle RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType  \
0        2Story     Gable  CompShg     VinylSd     VinylSd    BrkFace   
1        1Story     Gable  CompShg     MetalSd     MetalSd       None   
2        2Story     Gable  CompShg     VinylSd     VinylSd    BrkFace   
3        2Story     Gable  CompShg     Wd Sdng     Wd Shng       None   
4        2Story     Gable  CompShg     VinylSd     VinylSd    BrkFace   
5        1.5Fin     Gable  CompShg     VinylSd     VinylSd       None   
6        1Story     Gable  CompShg     VinylSd     VinylSd      Stone   
7        2Story     Gable  CompShg     HdBoard     HdBoard      Stone   
8        1.5Fin     Gable  CompShg     BrkFace     Wd Shng       None   
9        1.5Unf     Gable  CompShg     MetalSd     MetalSd       None   
10       1Story       Hip  CompShg     HdBoard     HdBoard       None   
11       2Story       Hip  CompShg     WdShing     Wd Shng      Stone   
12       1Story       Hip  CompShg     HdBoard     

     BsmtExposure BsmtFinType1 BsmtFinType2 Heating HeatingQC CentralAir  \
0              No          GLQ          Unf    GasA        Ex          Y   
1              Gd          ALQ          Unf    GasA        Ex          Y   
2              Mn          GLQ          Unf    GasA        Ex          Y   
3              No          ALQ          Unf    GasA        Gd          Y   
4              Av          GLQ          Unf    GasA        Ex          Y   
5              No          GLQ          Unf    GasA        Ex          Y   
6              Av          GLQ          Unf    GasA        Ex          Y   
7              Mn          ALQ          BLQ    GasA        Ex          Y   
8              No          Unf          Unf    GasA        Gd          Y   
9              No          GLQ          Unf    GasA        Ex          Y   
10             No          Rec          Unf    GasA        Ex          Y   
11             No          GLQ          Unf    GasA        Ex          Y   
12          

     GarageFinish GarageQual GarageCond PavedDrive PoolQC  Fence MiscFeature  \
0             RFn         TA         TA          Y   None   None        None   
1             RFn         TA         TA          Y   None   None        None   
2             RFn         TA         TA          Y   None   None        None   
3             Unf         TA         TA          Y   None   None        None   
4             RFn         TA         TA          Y   None   None        None   
5             Unf         TA         TA          Y   None  MnPrv        Shed   
6             RFn         TA         TA          Y   None   None        None   
7             RFn         TA         TA          Y   None   None        Shed   
8             Unf         Fa         TA          Y   None   None        None   
9             RFn         Gd         TA          Y   None   None        None   
10            Unf         TA         TA          Y   None   None        None   
11            Fin         TA         TA 

In [5]:
# Baseline: label encoding + gradient boosting trees
# Every column gets its own LabelEncoder, fitted on all rows of df.
df_temp = pd.DataFrame({
    c: LabelEncoder().fit_transform(df[c]) for c in df.columns
})
# Only the first train_num rows are used for fitting and scoring.
train_X = df_temp[:train_num]
# A fixed random_state makes the score reproducible: the trees permute features at
# every split, so an unseeded run can give a different result each time.
estimator = GradientBoostingRegressor(random_state=42)
start = time.time()
print(f'shape : {train_X.shape}')
print(f'score : {cross_val_score(estimator, train_X, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.7783758280436521
time : 0.5922949314117432 sec


In [6]:
# Mean (target) encoding + gradient boosting trees
# Each categorical column is replaced by the mean target value of its category,
# stored in a column named '<column>_mean'.
# NOTE: the category means are computed from all training rows before
# cross-validation splits them, so every validation fold's own targets feed its
# features; the reported score is therefore optimistic.
train_cat = df[:train_num]
data = pd.DataFrame({
    f'{c}_mean': train_Y.groupby(train_cat[c]).transform('mean')
    for c in train_cat.columns
})
# A fixed random_state makes the score reproducible: the trees permute features at
# every split, so an unseeded run can give a different result each time.
estimator = GradientBoostingRegressor(random_state=42)
start = time.time()
# Report the shape of the frame that is actually fitted (data, not train_X).
print(f'shape : {data.shape}')
print(f'score : {cross_val_score(estimator, data, train_Y, cv=5).mean()}')
print(f'time : {time.time() - start} sec')

shape : (1460, 43)
score : 0.8064165648022528
time : 0.5679852962493896 sec


# 作業1
* 請仿照範例，將鐵達尼範例中的類別型特徵改用均值編碼實作一次

# 作業2
* 觀察鐵達尼生存預測中，均值編碼與標籤編碼兩者比較，哪一個效果比較好? 可能的原因是什麼?