In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the training and test sets, using the 'Id' column as the row index
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Target variable: discard rows whose SalePrice is missing, then separate
# SalePrice from the predictor columns
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep only the columns whose dtype is not 'object' (i.e. drop non-numeric inputs)
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Split into training and validation data (80% / 20%, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [2]:
# Preview the first rows of the training split
X_train.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,774,0,108,0,0,260,0,0,7,2007
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,308,0,0,0,0,0,0,0,8,2009
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,432,0,0,44,0,0,0,0,8,2009
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,857,150,59,0,0,0,0,0,7,2008
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,843,468,81,0,0,0,0,0,1,2006


# Step 1: Preliminary investigation

In [3]:
# Report the size of the training data as (rows, columns)
print(X_train.shape)
# Count the missing values in each column of the training data
missing_val_count_by_column = X_train.isna().sum()
# Show only the columns that contain at least one missing value
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [4]:
# ランダムフォレストとMAE呼び出し
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Fit a random forest on the training data and return its MAE on the validation data
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Train a random forest regressor and score it on held-out data.

    Parameters
    ----------
    X_train, y_train
        Features and target passed to ``model.fit``.
    X_valid, y_valid
        Features passed to ``model.predict`` and the target the
        predictions are compared against.
    n_estimators : int, default 100
        Number of trees in the forest. The default keeps the value that
        was previously hard-coded.
    random_state : int, default 0
        Seed forwarded to the regressor so repeated calls give the same
        score. The default keeps the value that was previously hard-coded.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the predictions.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

# Step 2: Drop columns with missing values

In [14]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both the training and the validation features
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
17837.82570776256


# Step 3:Imputation
## Part A

In [23]:
# Import SimpleImputer
from sklearn.impute import SimpleImputer
# Create the imputer (with no arguments it fills in the column mean)
my_imputer = SimpleImputer()
# fit_transform / transform return plain arrays, so wrap them in DataFrames.
# Both the column names AND the Id index are lost in that round trip, so
# restore them at construction time; otherwise the imputed frames would sit
# on a fresh 0..n-1 index while y_train / y_valid stay indexed by Id.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns, index=X_train.index)
# The validation data is only transformed, reusing the values fitted on X_train
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns, index=X_valid.index)
# Show the MAE
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
18062.894611872147


データセットに含まれる欠損値は非常に少ないため、列を丸ごと削除するよりも補完（インピュテーション）の方が良い結果になると予想されます。しかし実際には、列を削除した方がわずかに良い結果になりました。これはデータセットのノイズに起因する部分もあると考えられますが、もう1つの可能性として、今回の補完方法がこのデータセットにあまり合っていないことが挙げられます。つまり、平均値で埋める代わりに、すべての欠損値を0にする、最頻値で埋める、あるいは他の方法を使う方が理にかなっているのかもしれません。例えば、GarageYrBlt（ガレージが建てられた年）の列を考えてみましょう。この値が欠けている場合、その家にはガレージがない可能性があります。この場合、列の中央値で埋める方が理にかなっているのでしょうか。それとも、列の最小値で埋めた方が良い結果が得られるのでしょうか。どれが最善かは明確ではありませんが、すぐに除外できる選択肢もあります。例えば、この列の欠損値を0に設定すると、建築年が0年として扱われるため、ひどい結果になる可能性が高いでしょう。

# Step 4: Generate test predictions
ここからはオリジナルコード

In [30]:
# Overview of the training data
X_train

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,774,0,108,0,0,260,0,0,7,2007
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,308,0,0,0,0,0,0,0,8,2009
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,432,0,0,44,0,0,0,0,8,2009
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,857,150,59,0,0,0,0,0,7,2008
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,843,468,81,0,0,0,0,0,1,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,60,82.0,9430,8,5,1999,1999,673.0,1163,0,...,856,0,128,0,0,180,0,0,7,2009
836,20,60.0,9600,4,7,1950,1995,0.0,442,0,...,436,290,0,0,0,0,0,0,2,2010
1217,90,68.0,8930,6,5,1978,1978,0.0,0,0,...,539,0,0,0,0,0,0,0,4,2010
560,120,,3196,7,5,2003,2004,18.0,0,0,...,420,143,20,0,0,0,0,0,10,2006


In [32]:
# Boolean value for each element: whether it is NA
X_train.isna()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
871,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
93,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
818,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
303,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
836,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1217,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
560,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Boolean NA flags, reduced along each axis
# Per column: True when the column contains at least one NA
print(X_train.isnull().any(axis=0))
# Per row: True when the row contains at least one NA
print(X_train.isnull().any(axis=1))

MSSubClass       False
LotFrontage       True
LotArea          False
OverallQual      False
OverallCond      False
YearBuilt        False
YearRemodAdd     False
MasVnrArea        True
BsmtFinSF1       False
BsmtFinSF2       False
BsmtUnfSF        False
TotalBsmtSF      False
1stFlrSF         False
2ndFlrSF         False
LowQualFinSF     False
GrLivArea        False
BsmtFullBath     False
BsmtHalfBath     False
FullBath         False
HalfBath         False
BedroomAbvGr     False
KitchenAbvGr     False
TotRmsAbvGrd     False
Fireplaces       False
GarageYrBlt       True
GarageCars       False
GarageArea       False
WoodDeckSF       False
OpenPorchSF      False
EnclosedPorch    False
3SsnPorch        False
ScreenPorch      False
PoolArea         False
MiscVal          False
MoSold           False
YrSold           False
dtype: bool
Id
619     False
871     False
93      False
818      True
303     False
        ...  
764     False
836     False
1217    False
560      True
685     False
Len

In [42]:
# Collect the training columns that contain missing values and view only those
cols_with_missing = list(X_train.columns[X_train.isna().any(axis=0)])
X_train_missing = X_train.loc[:, cols_with_missing]
X_train_missing

Unnamed: 0_level_0,LotFrontage,MasVnrArea,GarageYrBlt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
619,90.0,452.0,2007.0
871,60.0,0.0,1962.0
93,80.0,0.0,1921.0
818,,148.0,2002.0
303,118.0,150.0,2001.0
...,...,...,...
764,82.0,673.0,1999.0
836,60.0,0.0,1996.0
1217,68.0,0.0,1978.0
560,,18.0,2003.0


In [43]:
# Summary statistics of X_train_missing
X_train_missing.describe()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt
count,956.0,1162.0,1110.0
mean,69.614017,103.481067,1978.140541
std,22.946069,182.676225,24.877265
min,21.0,0.0,1900.0
25%,59.0,0.0,1961.0
50%,69.0,0.0,1979.0
75%,80.0,167.75,2002.0
max,313.0,1600.0,2010.0


LotFrontage: Linear feet of street connected to property
物件に接する道路の長さ（フィート単位）
MasVnrArea: Masonry veneer area in square feet
石積みベニヤ（化粧張り）の面積（平方フィート単位）
GarageYrBlt: Year garage was built
ガレージが建てられた年

In [48]:
# Try imputing with the most frequent value instead
my_imputer_freq = SimpleImputer(strategy='most_frequent')
# Impute. The imputer returns plain arrays, which drop both the column names
# and the Id index, so restore them when rebuilding the DataFrames; otherwise
# the frames would sit on a fresh 0..n-1 index while y_train / y_valid stay
# indexed by Id.
final_X_train = pd.DataFrame(my_imputer_freq.fit_transform(X_train),
                             columns=X_train.columns, index=X_train.index)
# The validation data is only transformed, reusing the values fitted on X_train
final_X_valid = pd.DataFrame(my_imputer_freq.transform(X_valid),
                             columns=X_valid.columns, index=X_valid.index)
# Show the MAE
print("MAE from Approach 3 (Imputation by frequency):")
print(score_dataset(final_X_train, final_X_valid, y_train, y_valid))

MAE from Approach 3 (Imputation by frequency):
17956.065479452056


In [52]:
## Apply this approach to the full training data and predict on the test data
# Impute missing values in the full training data with the most frequent value.
# The imputer returns a plain array, so restore the column names and Id index;
# this keeps final_X aligned with y and lets the model see named features.
final_X = pd.DataFrame(my_imputer_freq.fit_transform(X),
                       columns=X.columns, index=X.index)
# Fill the test data with the same values that were fitted on the training data
final_X_test = pd.DataFrame(my_imputer_freq.transform(X_test),
                            columns=X_test.columns, index=X_test.index)
# Use the model settings that scored well in part 1 (per the author's note)
final_model = RandomForestRegressor(n_estimators=100, random_state=1, criterion='absolute_error')
# Train on final_X
final_model.fit(final_X, y)
# Compute the predictions
preds_test = final_model.predict(final_X_test)
# Pair each prediction with its Id
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
# Write the submission file
output.to_csv('submission_20221011.csv', index=False)