In [None]:
import os
import sys
sys.path.append("../")

from datetime import datetime as dt

import numpy as np
import pandas as pd

from src.utils import DataPreprocessor as DP
from src.utils import Postprocessor as PP
from src.models import ManshionBrothers as MB

In [None]:
## Load the data
# Read the train and test CSVs with identical options, train first.
df_train, df_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("../data/train_set.csv", "../data/test_set.csv")
)

In [None]:
# ## Outlier removal (disabled)
# df_train = df_train.drop(df_train[(df_train["PRICE"]>=100000000) | (df_train["PRICE"]<=500)].index)

In [None]:
## Fix outliers
# Map specific STORIES values to corrected ones. No corrected value is itself a key,
# so one mapping per frame gives the same result as replacing the values one by one.
df_train["STORIES"] = df_train["STORIES"].replace({826: 4, 250: 2.5, 25: 2.5, 20: 2})

df_test["STORIES"] = df_test["STORIES"].replace({275: 2.5, 25: 2.5, 9: 2})

In [None]:
## Fill missing AYB and YR_RMDL
# AYB falls back to EYB first, so that YR_RMDL can then fall back to the completed AYB.
df_train["AYB"] = df_train["AYB"].fillna(df_train["EYB"])
df_train["YR_RMDL"] = df_train["YR_RMDL"].fillna(df_train["AYB"])

# Same two-step fallback for the test frame.
df_test["AYB"] = df_test["AYB"].fillna(df_test["EYB"])
df_test["YR_RMDL"] = df_test["YR_RMDL"].fillna(df_test["AYB"])

In [None]:
## Fill missing STORIES from STYLE
def _stories_from_style(style):
    """Parse a story count out of STYLE labels (e.g. "2.5 Story Fin").

    Spaces and the tokens "Story", "Fin" and "Unfin" are stripped, in that
    order, and what remains is converted to float64.

    Parameters
    ----------
    style : pandas.Series
        STYLE labels; missing entries are allowed.

    Returns
    -------
    pandas.Series
        float64 values on the same index as ``style``. Entries that are missing
        or that do not reduce to a number become NaN instead of raising, so one
        unexpected label cannot abort the whole cell.
    """
    cleaned = style
    for token in (" ", "Story", "Fin", "Unfin"):
        cleaned = cleaned.replace(token, "", regex=True)
    # errors="coerce" turns unparseable labels into NaN; the explicit astype keeps
    # the float64 result type of the previous implementation.
    return pd.to_numeric(cleaned, errors="coerce").astype(np.float64)


# train
missing_train = df_train["STORIES"].isnull()
story_tmp = _stories_from_style(df_train.loc[missing_train, "STYLE"])
df_train.loc[missing_train, "STORIES"] = story_tmp

# test
missing_test = df_test["STORIES"].isnull()
story_tmp = _stories_from_style(df_test.loc[missing_test, "STYLE"])
df_test.loc[missing_test, "STORIES"] = story_tmp

In [None]:
## Other missing-value handling
# Remove the training row(s) whose Id is 26034.
df_train = df_train.drop(index=df_train.loc[df_train["Id"].eq(26034)].index)
# Set KITCHENS to 1 for the test row(s) whose Id is 76393.
df_test.loc[df_test["Id"].eq(76393), "KITCHENS"] = 1

In [None]:
## New features
# SALE_YEAR is the year of SALEDATE; SALE2AYB and SALE2RMDL are that year minus
# AYB and YR_RMDL respectively.
df_train["SALE_YEAR"] = pd.to_datetime(df_train["SALEDATE"]).dt.year
df_train["SALE2AYB"] = df_train["SALE_YEAR"].sub(df_train["AYB"])
df_train["SALE2RMDL"] = df_train["SALE_YEAR"].sub(df_train["YR_RMDL"])

# Reorder columns so that PRICE comes last
cols = df_train.columns.tolist()
cols.append(cols.pop(cols.index("PRICE")))
df_train = df_train[cols]

# Same derived columns for the test frame
df_test["SALE_YEAR"] = pd.to_datetime(df_test["SALEDATE"]).dt.year
df_test["SALE2AYB"] = df_test["SALE_YEAR"].sub(df_test["AYB"])
df_test["SALE2RMDL"] = df_test["SALE_YEAR"].sub(df_test["YR_RMDL"])

In [None]:
## Feature selection
# nc_* lists are passed as num_cols and cc_* lists as cat_cols to the preprocessors.
# Commented-out entries are columns that are currently excluded.

# Residential: numeric columns
nc_resi = [
    "AYB",
    "BATHRM",
    "BEDRM",
    "BLDG_NUM",
    # "CENSUS_TRACT",
    "EYB",
    "FIREPLACES",
    "GBA",
    "HF_BATHRM",
    "KITCHENS",
    "LANDAREA",
    "LATITUDE",
    "LONGITUDE",
    "NUM_UNITS",
    "ROOMS",
    "STORIES",
    "YR_RMDL",
    "SALE_YEAR",
    "SALE2AYB",
    "SALE2RMDL",
]

# Residential: categorical columns
cc_resi = [
    "AC",
    "ASSESSMENT_NBHD",
    "CNDTN",
    "EXTWALL",
    "GRADE",
    "HEAT",
    "INTWALL",
    "QUALIFIED",
    "ROOF",
    "STRUCT",
    # "STYLE",
    "WARD",
    "SALE_NUM",
    "USECODE",
    "ZIPCODE",
]

# Condominium: numeric columns
nc_cond = [
    "AYB",
    "BATHRM",
    "BEDRM",
    # "CENSUS_TRACT",
    # "CMPLX_NUM",
    "EYB",
    "FIREPLACES",
    "HF_BATHRM",
    "LANDAREA",
    "LATITUDE",
    "LIVING_GBA",
    "LONGITUDE",
    "ROOMS",
    # "X",
    # "Y",
    "YR_RMDL",
    "SALE_YEAR",
    "SALE2AYB",
    "SALE2RMDL",
]

# Condominium: categorical columns
cc_cond = [
    "AC",
    "ASSESSMENT_NBHD",
    "HEAT",
    "QUADRANT",
    "QUALIFIED",
    "WARD",
    "SALE_NUM",
    "USECODE",
    "ZIPCODE",
]

## Preprocessing @ Residential
# New preprocessor fed with the Residential rows of the train and test frames.
dp_resi = DP()
train_resi = df_train.query('SOURCE=="Residential"')
test_resi = df_test.query('SOURCE=="Residential"')
dp_resi.load_dataframe(train_resi, test_resi)
dp_resi.set_cols(num_cols=nc_resi, cat_cols=cc_resi)
dp_resi.compile()

## Preprocessing @ Condominium
# New preprocessor fed with the Condominium rows of the train and test frames.
dp_cond = DP()
train_cond = df_train.query('SOURCE=="Condominium"')
test_cond = df_test.query('SOURCE=="Condominium"')
dp_cond.load_dataframe(train_cond, test_cond)
dp_cond.set_cols(num_cols=nc_cond, cat_cols=cc_cond)
dp_cond.compile()

## Model setup @ Residential
# New MB instance compiled with the model identifiers "LGB" and "XGB"
# (presumably LightGBM and XGBoost; confirm in src.models).
mb_resi = MB()
mb_resi.compile(models=["LGB", "XGB"])
# Alternative configurations kept for reference: a [name, params] pair in place of a bare name.
# mb_resi.compile(models=[["LGB",{"n_estimators":10000}]])
# mb_resi.compile(models=[["LGB",{"n_estimators":10000}], "XGB"])

## Model setup @ Condominium
# Separate MB instance for the Condominium subset, compiled with the same identifiers.
mb_cond = MB()
mb_cond.compile(models=["LGB", "XGB"])
# Alternative configurations kept for reference: a [name, params] pair in place of a bare name.
# mb_cond.compile(models=[["LGB",{"n_estimators":10000}]])
# mb_cond.compile(models=[["LGB",{"n_estimators":10000}], "XGB"])

In [None]:
# ## Box-Cox transform? (experiment, currently disabled)
# from scipy.special import boxcox1p

# skewed_features = ['LANDAREA',
# 'KITCHENS',
# 'NUM_UNITS',
# 'GBA',
# 'FIREPLACES',
# 'ROOMS',
# 'BEDRM',
# 'EYB',
# 'BATHRM',
# 'AYB',
# 'HF_BATHRM',
# 'LONGITUDE',
# 'LATITUDE',
# 'YR_RMDL']

# lam = 0.15
# for feat in skewed_features:
#     dp_resi.x_train[feat] = boxcox1p(dp_resi.x_train[feat], lam)
#     dp_resi.x_test[feat] = boxcox1p(dp_resi.x_test[feat], lam)


In [None]:
## Validation @ Residential
x_tr_resi, x_vl_resi, y_tr_resi, y_vl_resi = dp_resi.get_validation_data()
# Train on log1p-transformed targets and predict for the validation features
log_y_tr_resi = np.log1p(y_tr_resi)
mb_resi.train(x_tr_resi, log_y_tr_resi)
y_pre_resi = mb_resi.predict(x_vl_resi)
# Evaluate against the log1p-transformed validation targets
print("Residential")
log_y_vl_resi = np.log1p(y_vl_resi)
_ = mb_resi.evaluate(y_pre_resi, log_y_vl_resi)

print("~" * 30)

## Validation @ Condominium
x_tr_cond, x_vl_cond, y_tr_cond, y_vl_cond = dp_cond.get_validation_data()
# Train on log1p-transformed targets and predict for the validation features
log_y_tr_cond = np.log1p(y_tr_cond)
mb_cond.train(x_tr_cond, log_y_tr_cond)
y_pre_cond = mb_cond.predict(x_vl_cond)
# Evaluate against the log1p-transformed validation targets
print("Condominium")
log_y_vl_cond = np.log1p(y_vl_cond)
_ = mb_cond.evaluate(y_pre_cond, log_y_vl_cond)

In [None]:
## Preprocessing @ Residential
# Rebuild the preprocessor from the Residential rows of the train and test frames.
dp_resi = DP()
train_resi = df_train.query('SOURCE=="Residential"')
test_resi = df_test.query('SOURCE=="Residential"')
dp_resi.load_dataframe(train_resi, test_resi)
dp_resi.set_cols(num_cols=nc_resi, cat_cols=cc_resi)
dp_resi.compile()

## Preprocessing @ Condominium
# Rebuild the preprocessor from the Condominium rows of the train and test frames.
dp_cond = DP()
train_cond = df_train.query('SOURCE=="Condominium"')
test_cond = df_test.query('SOURCE=="Condominium"')
dp_cond.load_dataframe(train_cond, test_cond)
dp_cond.set_cols(num_cols=nc_cond, cat_cols=cc_cond)
dp_cond.compile()

## Model setup @ Residential
# New MB instance compiled with the model identifiers "LGB" and "XGB"
# (presumably LightGBM and XGBoost; confirm in src.models).
mb_resi = MB()
mb_resi.compile(models=["LGB","XGB"])

## Model setup @ Condominium
# Separate MB instance for the Condominium subset, compiled with the same identifiers.
mb_cond = MB()
mb_cond.compile(models=["LGB","XGB"])

In [None]:
## Output location
# Create the submission directory up front: a missing or unwritable directory then
# fails before the training below instead of after it, when to_csv is reached.
submit_dir = "../data/submit/"
os.makedirs(submit_dir, exist_ok=True)

## Train for submission
# Models are fit on log1p-transformed targets using the full training data held by
# the preprocessors.
mb_resi.train(dp_resi.x_train, np.log1p(dp_resi.y_train))
mb_cond.train(dp_cond.x_train, np.log1p(dp_cond.y_train))

## Predict for submission
y_pred_resi = mb_resi.predict(dp_resi.x_test)
y_pred_cond = mb_cond.predict(dp_cond.x_test)

## Build the submission frame
# expm1 undoes the log1p transform applied to the training targets.
submit_resi = PP.make_submit_csv(np.expm1(y_pred_resi), dp_resi.x_test)
submit_cond = PP.make_submit_csv(np.expm1(y_pred_cond), dp_cond.x_test)
submit = pd.concat([submit_resi, submit_cond], axis=0)
# NOTE(review): this assignment aligns on the index, so it is only correct if
# PP.make_submit_csv keeps the original df_test index labels. Confirm in src.utils.
submit["Id"] = df_test["Id"]
submit = submit[["Id", "PRICE"]]

## Save the result
# The file name is the current local timestamp, e.g. 20240131_235959.csv.
fn = dt.now().strftime("%Y%m%d_%H%M%S") + ".csv"
submit.to_csv(submit_dir + fn, header=True, index=False)