# Pipelines

In [20]:
# Bring in pandas and the train/validation splitting helper
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the training and test tables, indexed by the 'Id' column
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Discard rows that have no target value ('SalePrice')
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

# Separate the target from the predictors
y = X_full.SalePrice
X_full = X_full.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Show the dtype of every predictor column
print(X_train_full.dtypes)

# Object-typed columns with fewer than 10 distinct values in the training split
categorical_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype == "object" and X_train_full[name].nunique() < 10
]

# Columns stored as int64 or float64
numerical_cols = [
    name
    for name, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Keep only the selected columns: low-cardinality categoricals followed by numerics.
# Each subset is copied so later edits do not touch the *_full frames.
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object


In [21]:
# Preview the first rows of the column-filtered training frame
X_train.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
871,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
93,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
818,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
303,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  #欠損値代入のためのSimpleImputer
from sklearn.preprocessing import OneHotEncoder  #カテゴリカル変数処理ようのOneHotEncoder
from sklearn.ensemble import RandomForestRegressor  #ランダムフォレスト
from sklearn.metrics import mean_absolute_error  #評価用のMAE

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' is passed so categories unseen during fit do not raise.
categorical_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numeric columns: fill gaps with a constant (fill_value left at its default)
numerical_transformer = SimpleImputer(strategy='constant')

# Estimator: random forest with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Chain preprocessing and the estimator; each step is a (name, object) tuple
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split, then predict the validation split
clf.fit(X_train, y_train)
preds = clf.predict(X_valid)

# Report the mean absolute error on the validation split
print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 17861.780102739725


In [24]:
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams in notebook output
set_config(display='diagram')   
# Display the pipeline built in the previous cell
clf

# Step 1: Improve the performance

## Part A

Now, it's your turn! In the code cell below, define your own preprocessing steps and random forest model. Fill in values for the following variables:

- `numerical_transformer`
- `categorical_transformer`
- `model`

To pass this part of the exercise, you need only define valid preprocessing steps and a random forest model.

In [29]:
# Look at the columns that contain at least one missing value before choosing imputers
cols_with_missing = X_train.columns[X_train.isnull().any().to_numpy()].tolist()
X_train_missing = X_train[cols_with_missing]
X_train_missing

Unnamed: 0_level_0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature,LotFrontage,MasVnrArea,GarageYrBlt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
619,,BrkFace,Ex,TA,Av,GLQ,Unf,SBrkr,Gd,Attchd,Unf,TA,TA,,,,90.0,452.0,2007.0
871,,,TA,TA,No,Unf,Unf,SBrkr,,Detchd,Unf,TA,TA,,,,60.0,0.0,1962.0
93,Grvl,,Gd,TA,No,ALQ,Unf,SBrkr,,Detchd,Unf,TA,TA,,,,80.0,0.0,1921.0
818,,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,,,,,148.0,2002.0
303,,BrkFace,Gd,TA,No,Unf,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,,118.0,150.0,2001.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,,,,82.0,673.0,1999.0
836,,,Gd,TA,No,BLQ,Unf,SBrkr,,Attchd,Unf,TA,TA,,,,60.0,0.0,1996.0
1217,,,,,,,,SBrkr,,Attchd,Unf,TA,TA,,,,68.0,0.0,1978.0
560,,BrkFace,Gd,TA,Gd,Unf,Unf,SBrkr,TA,Attchd,Fin,TA,TA,,,,,18.0,2003.0


In [30]:
# Show the dtype of each column that has missing values
X_train_missing.dtypes

Alley            object
MasVnrType       object
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
LotFrontage     float64
MasVnrArea      float64
GarageYrBlt     float64
dtype: object

In [70]:
from pandas import concat

# Numeric columns: fill missing values with each column's most frequent value
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Alternative imputer for LotFrontage: fill with the largest observed value.
# Series.max() skips NaN; the builtin max() compares element by element and
# returns NaN whenever the first row happens to be missing.
# NOTE(review): this transformer is not referenced by `preprocessor` below.
lotfrontage_transformer = SimpleImputer(
    strategy='constant',
    fill_value=X_train['LotFrontage'].max(),
)

# Boolean flag marking rows where LotFrontage is missing, appended as a new column
lotfrontage_is_na = X_train['LotFrontage'].isna()
lotfrontage_is_na.name = 'lotfrontageisna'
X_train_2 = concat([X_train, lotfrontage_is_na], axis=1)
print(lotfrontage_is_na)
print(X_train_2)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
# Encoder intended for the boolean indicator column.
# NOTE(review): not referenced by `preprocessor` below.
bool_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its transformer. Only numerical_cols and
# categorical_cols are listed, so the 'lotfrontageisna' column is not used.
# TODO: wire in the indicator/LotFrontage transformers if they are meant to be used.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Estimator: random forest with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the estimator; each step is a (name, object) tuple
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on X_train so the fit and predict frames have the same columns.
# Fitting on X_train_2 added a column that X_valid does not have.
clf.fit(X_train, y_train)
# Predict the validation split
preds = clf.predict(X_valid)
# Report the mean absolute error on the validation split
print('MAE:', mean_absolute_error(y_valid, preds))


Id
619     False
871     False
93      False
818      True
303     False
        ...  
764     False
836     False
1217    False
560      True
685     False
Name: lotfrontageisna, Length: 1168, dtype: bool
     MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope  \
Id                                                                              
619        RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
871        RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
93         RL   Pave  Grvl      IR1         HLS    AllPub    Inside       Gtl   
818        RL   Pave   NaN      IR1         Lvl    AllPub   CulDSac       Gtl   
303        RL   Pave   NaN      IR1         Lvl    AllPub    Corner       Gtl   
...       ...    ...   ...      ...         ...       ...       ...       ...   
764        RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
836        RL   Pave   NaN      Reg         Lvl    AllPub    Insi