In [1]:
from datetime import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb

In [2]:
## Load the training and test sets from CSV
df_train, df_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("../data/train_set.csv", "../data/test_set.csv")
)

In [3]:
## Select the required columns; categorical variables are one-hot (dummy) encoded
def extract_columns(df_train, df_test, num_var=None, cat_var=None):
    """Build aligned train/test feature frames from the selected columns.

    The two frames are stacked row-wise before encoding so that both outputs
    end up with the same dummy columns, even when a category level occurs in
    only one of them.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Source frames. The columns named in ``num_var`` and ``cat_var`` are
        looked up in the stacked frame.
    num_var : list of str, optional
        Columns copied through unchanged. ``None`` means no such columns.
    cat_var : list of str, optional
        Columns passed to ``pd.get_dummies``. ``None`` means no such columns.

    Returns
    -------
    (df_train_x, df_test_x) : tuple of pandas.DataFrame
        The first ``len(df_train)`` rows of the encoded frame and the
        remaining rows. Index labels of the inputs are kept.
    """
    # The defaults are None; indexing a frame with None raises KeyError, so
    # map None to "no columns" instead.
    num_cols = [] if num_var is None else num_var
    cat_cols = [] if cat_var is None else cat_var

    n_train = df_train.shape[0]
    df_all = pd.concat([df_train, df_test], axis=0)

    parts = [df_all[num_cols]]
    if len(cat_cols) > 0:
        # Only encode when there is something to encode.
        parts.append(pd.get_dummies(df_all[cat_cols]))
    df_x = pd.concat(parts, axis=1)

    # Split positionally: the stacked frame may carry duplicate index labels.
    df_train_x = df_x.iloc[:n_train, :]
    df_test_x = df_x.iloc[n_train:, :]
    return df_train_x, df_test_x

## Define the train-and-predict helper
def train_and_predict(x_train, y_train, x_test, mode="lgb"):
    """Fit a gradient-boosting regressor on log(y_train) and predict x_test.

    Parameters
    ----------
    x_train : array-like or DataFrame
        Feature matrix used for fitting.
    y_train : array-like, Series or single-column DataFrame
        Target values. Their natural log is taken before fitting.
    x_test : array-like or DataFrame
        Feature matrix to predict.
    mode : str, default "lgb"
        "lgb" builds a LightGBM regressor; any other value builds an
        XGBoost regressor.

    Returns
    -------
    (clf, y_pred) : tuple
        The fitted model and its predictions for ``x_test``. Because the
        model is fitted on log targets, the predictions are on the log scale;
        apply ``np.exp`` to return to the scale of ``y_train``.
    """
    ## Build the model
    if mode == "lgb":
        clf = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                 learning_rate=0.05, n_estimators=2000,
                 max_bin = 55, bagging_fraction = 0.8,
                 bagging_freq = 5, feature_fraction = 0.2319,
                 feature_fraction_seed=9, bagging_seed=9,
                 min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
    else:
        clf = xgb.XGBRegressor(
            learning_rate=0.01,
            n_estimators=2000,
            max_depth=6)

    ## Fit
    # Flatten the log target to 1-D: a single-column DataFrame would otherwise
    # reach fit() as an (n, 1) column vector instead of the expected (n,) array.
    log_target = np.ravel(np.log(y_train))
    clf.fit(x_train, log_target)
    ## Predict (log scale)
    y_pred = clf.predict(x_test)
    return clf, y_pred

In [4]:
## Split each dataset into Residential and Condominium rows (SOURCE column)
df_train_resi, df_train_cond = (
    df_train.query('SOURCE == "Residential"'),
    df_train.query('SOURCE == "Condominium"'),
)
df_test_resi, df_test_cond = (
    df_test.query('SOURCE == "Residential"'),
    df_test.query('SOURCE == "Condominium"'),
)

In [5]:
## Prepare x (features)
# Residential: numeric columns are used as-is, categorical ones are dummy-encoded.
num_var_resi = [
    "BATHRM", "EYB", "LONGITUDE", "FIREPLACES",
    "SALE_NUM", "CMPLX_NUM", "CENSUS_TRACT", "LIVING_GBA",
]
cat_var_resi = ["QUADRANT", "CNDTN", "WARD", "AC", "QUALIFIED"]
# iloc[:, :-1] drops the last training column before train and test are stacked
# (presumably the target, PRICE -- TODO confirm against the CSV layout).
df_train_resi_x, df_test_resi_x = extract_columns(
    df_train_resi.iloc[:, :-1],
    df_test_resi,
    num_var=num_var_resi,
    cat_var=cat_var_resi,
)

# Condominium: currently the same column selection, kept as separate lists.
num_var_cond = [
    "BATHRM", "EYB", "LONGITUDE", "FIREPLACES",
    "SALE_NUM", "CMPLX_NUM", "CENSUS_TRACT", "LIVING_GBA",
]
cat_var_cond = ["QUADRANT", "CNDTN", "WARD", "AC", "QUALIFIED"]
df_train_cond_x, df_test_cond_x = extract_columns(
    df_train_cond.iloc[:, :-1],
    df_test_cond,
    num_var=num_var_cond,
    cat_var=cat_var_cond,
)

## Prepare y (target), kept as single-column DataFrames
df_train_resi_y = df_train_resi.loc[:, ["PRICE"]]
df_train_cond_y = df_train_cond.loc[:, ["PRICE"]]

In [9]:
#### Hold-out validation (Residential only)
## Set aside 20% of the Residential training rows as a validation set
x_train_resi, x_valid_resi, y_train_resi, y_valid_resi = train_test_split(
    df_train_resi_x,
    df_train_resi_y,
    test_size=0.2,
    random_state=123,
)

## Fit on the remaining rows and predict the held-out ones
## (default mode is LightGBM; pass mode="xgb" to try XGBoost instead)
clf_valid, y_pred = train_and_predict(x_train_resi, y_train_resi, x_valid_resi)

## Score: the model is fitted on log(PRICE), so y_pred is compared with log(PRICE)
mse = mean_squared_error(y_true=np.log(y_valid_resi), y_pred=y_pred)
print("MSE", mse)

## Predictions converted back to the price scale, indexed like the validation rows,
## so they can be put side by side with y_valid_resi for inspection
df_valid_pred = pd.DataFrame({"PRED": np.exp(y_pred)}, index=x_valid_resi.index)

MSE 0.2256695971920721


In [None]:
## Fit on all Residential training rows and predict the Residential test set.
## NOTE(review): the original cell called train_and_predict(x_train) with an
## undefined name and without y_train / x_test; this completion is the presumed
## intent -- confirm. Predictions are on the log scale (np.exp gives prices).
clf_resi, y_pred_resi = train_and_predict(df_train_resi_x, df_train_resi_y, df_test_resi_x)