In [33]:
import pandas as pd
import xgboost as xgb
import category_encoders as ce
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import time
import datetime

In [41]:
dt = datetime.datetime.now()

In [18]:
"""
1.与えられたデータを使えるように読み込む
"""
#　sample_submissionの読み込み
sample_submission = pd.read_csv(r"data/sample_submission.csv", dtype = {'ID':'object', '取引価格（総額）_log':'float'})

# test.csvの読み込み
test_df = pd.read_csv('data/test.csv')

# train.csvの読み込み
train_files = glob.glob('data/train/*.csv') # 条件に当てはまるすべてのパスを抜き取り、リストに格納する
train_df_list = [] # 空のリストを作成し、読み込んだdfを格納していく(最終的に結合するため)
for file in train_files: #　それぞれのファイルに対して
    df = pd.read_csv(file) # ファイルを読み込み
    train_df_list.append(df) #　リストに格納
train_df = pd.concat(train_df_list) #　一つのまとまったデータとしてほしいので結合する。

In [19]:
'''
2.データがどのようなものなのか確認する
'''
# 各カラムに含まれている値を表示するための関数
def check_df_value(df):
    print(train_df.columns)
    print('======================================\n')
    for col in train_df.columns:
        print('{}:\n{}'.format(col, train_df[col].value_counts().keys()))
        print('======================================\n')
check_df_value(train_df)

Index(['ID', '種類', '地域', '市区町村コード', '都道府県名', '市区町村名', '地区名', '最寄駅：名称',
       '最寄駅：距離（分）', '間取り', '面積（㎡）', '土地の形状', '間口', '延床面積（㎡）', '建築年', '建物の構造',
       '用途', '今後の利用目的', '前面道路：方位', '前面道路：種類', '前面道路：幅員（ｍ）', '都市計画', '建ぺい率（％）',
       '容積率（％）', '取引時点', '改装', '取引の事情等', '取引価格（総額）_log'],
      dtype='object')

ID:
Int64Index([44040192, 27022559, 12161066,  7033332, 28161251, 25003237,
            28155110, 12036305, 25027817, 27131114,
            ...
            13297654, 14014480, 13299703, 29001722,  8044541, 29009918,
            15038468, 13444115, 13194462, 22032378],
           dtype='int64', length=677392)

種類:
Index(['中古マンション等'], dtype='object')

地域:
Float64Index([], dtype='float64')

市区町村コード:
Int64Index([13111, 13108, 13112, 13103, 13104, 13109, 13119, 13102, 13120,
            13115,
            ...
             8564, 40604, 21302, 40341, 15108, 40213, 34309, 40212, 40345,
            27147],
           dtype='int64', length=619)

都道府県名:
Index(['東京都', '神奈川県', '大阪府', '兵庫県', '埼玉県

In [20]:
"""
3.データの前処理

訓練用データだけでなくテスト用データも同じように処理する必要があるためtrainとtestは結合して整形する。
"""
train_df['test'] = 0 # test用データでないので0
test_df['test'] = 1 # test用データなので1
df = pd.concat([train_df, test_df]) #　結合

# データ前処理の関数。dfを入れたら前処理を経たdfが出てくる
def data_pre(df):
    nonnull_list = [] #　空のリスト作成
    for col in df.columns: # dfのカラムの数だけ繰り返し
        nonnull = df[col].count() #nonnull = あるカラムの個数をカウントしている
        if nonnull == 0: # もしdf[col].count() == 0なら
            nonnull_list.append(col) # nonnull_listにそのカラムを追加する
    df = df.drop(nonnull_list, axis=1) # nonnull_listに含まれているすべてのカラムを削除する（必要ないため）

    df = df.drop("市区町村名", axis=1) # 評価にあまり影響しないと考え削除(ホントは影響するが、うまく処理するのが難しい)

    df = df.drop("種類", axis=1) # 評価にあまり影響しないと考え削除(ホントは影響するが、うまく処理するのが難しい)

    dis = {
        "30分?60分":45,
        "1H?1H30":75,
        "2H?":120,
        "1H30?2H":105
    } # 文字情報より数値データのほうが分析しやすいためこのような対応の辞書を作成
    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float) #辞書形式の対応表に基づいて置換

    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float) # "2000以上"を一律2000に置換


    built_list = {} # 建築年数を数値で表示するための置換リストの準備
    for i in df["建築年"].value_counts().keys():
        if "平成" in i:
            num = float(i.split("平成")[-1].split("年")[0])
            year = 33 - num
        if "令和" in i:
            num = float(i.split("令和")[-1].split("年")[0])
            year = 3 - num
        if "昭和" in i:
            num = float(i.split("昭和")[-1].split("年")[0])
            year = 96 - num
        built_list[i] = year
    built_list["戦前"] = 76
    df["建築年"] = df["建築年"].replace(built_list) # 作成した辞書をもとに置換

    year = {
        "年第１四半期": ".25",
        "年第２四半期": ".50",
        "年第３四半期": ".75",
        "年第４四半期": ".99"
    }
    year_list = {}
    for i in df["取引時点"].value_counts().keys():
        for k, j in year.items():
            if k in i:
                year_rep = i.replace(k, j)
        year_list[i] = year_rep
    df["取引時点"] = df["取引時点"].replace(year_list).astype(float)
    
    cols = ["都道府県名", "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]
    ce_df = ce.OrdinalEncoder(cols=cols, handle_unknown='impute') # カテゴリーエンコーダーで数値でないものを分析しやすいように変換
    df = ce_df.fit_transform(df) 
    
    return df
    
df = data_pre(df) # 前処理の実行
train_df = df[df['test'] == 0] #train, testの分割
test_df = df[df['test'] == 1]

train_df = train_df.drop("test", axis = 1) #  不必要なカラムの削除
test_df = test_df.drop(["test", "取引価格（総額）_log"], axis = 1)

In [21]:
'''
4.データがどのようなものなのか確認する
'''
check_df_value(train_df)

Index(['ID', '市区町村コード', '都道府県名', '地区名', '最寄駅：名称', '最寄駅：距離（分）', '間取り', '面積（㎡）',
       '建築年', '建物の構造', '用途', '今後の利用目的', '都市計画', '建ぺい率（％）', '容積率（％）', '取引時点',
       '改装', '取引の事情等', '取引価格（総額）_log'],
      dtype='object')

ID:
Int64Index([44040192, 27022559, 12161066,  7033332, 28161251, 25003237,
            28155110, 12036305, 25027817, 27131114,
            ...
            13297654, 14014480, 13299703, 29001722,  8044541, 29009918,
            15038468, 13444115, 13194462, 22032378],
           dtype='int64', length=677392)

市区町村コード:
Int64Index([13111, 13108, 13112, 13103, 13104, 13109, 13119, 13102, 13120,
            13115,
            ...
             8564, 40604, 21302, 40341, 15108, 40213, 34309, 40212, 40345,
            27147],
           dtype='int64', length=619)

都道府県名:
Int64Index([44, 36, 27, 37, 42, 47, 10,  1, 34, 26, 38, 11, 15, 33, 35, 19, 13,
             8, 28,  3, 40, 18, 16,  6, 45,  9,  5,  4, 24, 39, 32, 17, 31, 14,
             7, 29, 23, 43, 30, 12,  2, 20, 21, 46

In [6]:
'''
5.複数のモデルを作成する
'''

col = "取引価格（総額）_log"
df_X = train_df.drop(col, axis = 1)
y = train_df[col]

def return_n_models_xgboost_KFold(df_X, y, n_split):
    # 学習データとテストデータに分ける　->すべてがひとかたまりの場合この操作をする
    X_train, X_test, y_train, y_test = train_test_split(df_X, y,
                                                        test_size=0.2,
                                                        random_state=0)
    # 学習データを、学習用と検証用に分ける
    X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train,
                                                        test_size=0.2,
                                                        random_state=2)

    # データを格納する
    xgb_test = xgb.DMatrix(X_test, label=y_test)
    # fold CVモデルの学習,モデルを保存するリストの初期化
    models = []
    #accuracyを保存するNumPy配列の初期化
    accuracies = np.array([])

    # 学習データの数だけの数列（0行から最終行まで連番）
    row_no_list = list(range(len(y_train)))

    # KFoldクラスをインスタンス化（これを使って5分割する）
    K_fold = KFold(n_splits=n_split, shuffle=True,  random_state=42)

    # KFoldクラスで分割した回数だけ実行（ここでは5回）
    for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
        # ilocで取り出す行を指定
        X_train_cv = X_train.iloc[train_cv_no, :]
        y_train_cv = pd.Series(y_train).iloc[train_cv_no]
        X_eval_cv = X_train.iloc[eval_cv_no, :]
        y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]

        # 学習用
        xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)

        # 検証用
        xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)


        # パラメータを設定
        xgb_params = {
            "objective": "reg:squarederror",
            'eval_metric': "mae"
            }

        # 学習
        evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
        evaluation_results = {}                            # 学習の経過を保存する箱
        bst = xgb.train(xgb_params,                        # 上記で設定したパラメーター
                        xgb_train,                         # 使用するデータセット
                        num_boost_round=500,               # 学習の回数
                        early_stopping_rounds=10,          # アーリーストッピング
                        evals=evals,                       # 上記で設定した検証用データ
                        evals_result=evaluation_results,   # 上記で設定した箱
                        verbose_eval=1                     # 学習の経過の表示(非表示)
                        ) 

        # テストデータで予測する
        y_pred = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)    

        # Accuracy を計算する
        accuracy = np.sqrt(mean_squared_error(y_test, y_pred))
        print('accuracy:', accuracy)

        # 学習が終わったモデルをリストに入れておく
        models.append(bst) 

        # 学習結果をNumPy配列に入れておく
        accuracies = np.append(accuracies, accuracy)

    print('accuracy_ave: ', np.mean(accuracies))
    return models

xgb_models = return_n_models_xgboost_KFold(df_X, y, 5)

[0]	train-mae:4.70625	eval-mae:4.70462
[1]	train-mae:3.29465	eval-mae:3.29359
[2]	train-mae:2.30648	eval-mae:2.30579
[3]	train-mae:1.61481	eval-mae:1.61428
[4]	train-mae:1.13099	eval-mae:1.13078
[5]	train-mae:0.79405	eval-mae:0.79412
[6]	train-mae:0.56010	eval-mae:0.56012
[7]	train-mae:0.39876	eval-mae:0.39858
[8]	train-mae:0.28993	eval-mae:0.28966
[9]	train-mae:0.21915	eval-mae:0.21899
[10]	train-mae:0.17492	eval-mae:0.17479
[11]	train-mae:0.14869	eval-mae:0.14850
[12]	train-mae:0.13374	eval-mae:0.13354
[13]	train-mae:0.12372	eval-mae:0.12359
[14]	train-mae:0.11841	eval-mae:0.11836
[15]	train-mae:0.11510	eval-mae:0.11514
[16]	train-mae:0.11298	eval-mae:0.11305
[17]	train-mae:0.11148	eval-mae:0.11164
[18]	train-mae:0.11053	eval-mae:0.11070
[19]	train-mae:0.10942	eval-mae:0.10959
[20]	train-mae:0.10866	eval-mae:0.10887
[21]	train-mae:0.10757	eval-mae:0.10784
[22]	train-mae:0.10702	eval-mae:0.10732
[23]	train-mae:0.10612	eval-mae:0.10644
[24]	train-mae:0.10569	eval-mae:0.10606
[25]	train

[203]	train-mae:0.08225	eval-mae:0.08622
[204]	train-mae:0.08218	eval-mae:0.08617
[205]	train-mae:0.08215	eval-mae:0.08614
[206]	train-mae:0.08211	eval-mae:0.08611
[207]	train-mae:0.08206	eval-mae:0.08607
[208]	train-mae:0.08201	eval-mae:0.08604
[209]	train-mae:0.08198	eval-mae:0.08603
[210]	train-mae:0.08195	eval-mae:0.08601
[211]	train-mae:0.08193	eval-mae:0.08601
[212]	train-mae:0.08190	eval-mae:0.08599
[213]	train-mae:0.08184	eval-mae:0.08596
[214]	train-mae:0.08179	eval-mae:0.08592
[215]	train-mae:0.08176	eval-mae:0.08591
[216]	train-mae:0.08172	eval-mae:0.08589
[217]	train-mae:0.08170	eval-mae:0.08588
[218]	train-mae:0.08169	eval-mae:0.08588
[219]	train-mae:0.08161	eval-mae:0.08583
[220]	train-mae:0.08157	eval-mae:0.08580
[221]	train-mae:0.08153	eval-mae:0.08578
[222]	train-mae:0.08145	eval-mae:0.08573
[223]	train-mae:0.08142	eval-mae:0.08571
[224]	train-mae:0.08134	eval-mae:0.08566
[225]	train-mae:0.08128	eval-mae:0.08561
[226]	train-mae:0.08122	eval-mae:0.08558
[227]	train-mae:

[403]	train-mae:0.07554	eval-mae:0.08255
[404]	train-mae:0.07551	eval-mae:0.08254
[405]	train-mae:0.07548	eval-mae:0.08252
[406]	train-mae:0.07545	eval-mae:0.08250
[407]	train-mae:0.07542	eval-mae:0.08249
[408]	train-mae:0.07540	eval-mae:0.08249
[409]	train-mae:0.07539	eval-mae:0.08249
[410]	train-mae:0.07537	eval-mae:0.08248
[411]	train-mae:0.07535	eval-mae:0.08248
[412]	train-mae:0.07532	eval-mae:0.08247
[413]	train-mae:0.07528	eval-mae:0.08244
[414]	train-mae:0.07526	eval-mae:0.08243
[415]	train-mae:0.07521	eval-mae:0.08240
[416]	train-mae:0.07518	eval-mae:0.08239
[417]	train-mae:0.07515	eval-mae:0.08237
[418]	train-mae:0.07513	eval-mae:0.08236
[419]	train-mae:0.07512	eval-mae:0.08237
[420]	train-mae:0.07511	eval-mae:0.08236
[421]	train-mae:0.07509	eval-mae:0.08235
[422]	train-mae:0.07508	eval-mae:0.08235
[423]	train-mae:0.07505	eval-mae:0.08233
[424]	train-mae:0.07502	eval-mae:0.08232
[425]	train-mae:0.07500	eval-mae:0.08231
[426]	train-mae:0.07498	eval-mae:0.08231
[427]	train-mae:



accuracy: 0.13334489893838478
[0]	train-mae:4.70609	eval-mae:4.70556
[1]	train-mae:3.29452	eval-mae:3.29438
[2]	train-mae:2.30642	eval-mae:2.30645
[3]	train-mae:1.61480	eval-mae:1.61494
[4]	train-mae:1.13105	eval-mae:1.13122
[5]	train-mae:0.79415	eval-mae:0.79440
[6]	train-mae:0.56015	eval-mae:0.56032
[7]	train-mae:0.39894	eval-mae:0.39908
[8]	train-mae:0.29008	eval-mae:0.29016
[9]	train-mae:0.21899	eval-mae:0.21918
[10]	train-mae:0.17506	eval-mae:0.17550
[11]	train-mae:0.14871	eval-mae:0.14925
[12]	train-mae:0.13400	eval-mae:0.13460
[13]	train-mae:0.12532	eval-mae:0.12599
[14]	train-mae:0.11986	eval-mae:0.12059
[15]	train-mae:0.11613	eval-mae:0.11689
[16]	train-mae:0.11389	eval-mae:0.11461
[17]	train-mae:0.11247	eval-mae:0.11323
[18]	train-mae:0.11158	eval-mae:0.11236
[19]	train-mae:0.11059	eval-mae:0.11140
[20]	train-mae:0.10880	eval-mae:0.10962
[21]	train-mae:0.10736	eval-mae:0.10823
[22]	train-mae:0.10655	eval-mae:0.10739
[23]	train-mae:0.10621	eval-mae:0.10705
[24]	train-mae:0.105

[202]	train-mae:0.08192	eval-mae:0.08624
[203]	train-mae:0.08182	eval-mae:0.08613
[204]	train-mae:0.08178	eval-mae:0.08611
[205]	train-mae:0.08170	eval-mae:0.08604
[206]	train-mae:0.08167	eval-mae:0.08604
[207]	train-mae:0.08160	eval-mae:0.08600
[208]	train-mae:0.08157	eval-mae:0.08598
[209]	train-mae:0.08155	eval-mae:0.08598
[210]	train-mae:0.08151	eval-mae:0.08596
[211]	train-mae:0.08144	eval-mae:0.08591
[212]	train-mae:0.08138	eval-mae:0.08586
[213]	train-mae:0.08132	eval-mae:0.08583
[214]	train-mae:0.08130	eval-mae:0.08583
[215]	train-mae:0.08122	eval-mae:0.08575
[216]	train-mae:0.08117	eval-mae:0.08572
[217]	train-mae:0.08108	eval-mae:0.08565
[218]	train-mae:0.08103	eval-mae:0.08563
[219]	train-mae:0.08099	eval-mae:0.08561
[220]	train-mae:0.08096	eval-mae:0.08560
[221]	train-mae:0.08094	eval-mae:0.08559
[222]	train-mae:0.08090	eval-mae:0.08556
[223]	train-mae:0.08088	eval-mae:0.08556
[224]	train-mae:0.08084	eval-mae:0.08553
[225]	train-mae:0.08082	eval-mae:0.08551
[226]	train-mae:

[402]	train-mae:0.07546	eval-mae:0.08281
[403]	train-mae:0.07543	eval-mae:0.08280
[404]	train-mae:0.07540	eval-mae:0.08279
[405]	train-mae:0.07538	eval-mae:0.08279
[406]	train-mae:0.07534	eval-mae:0.08276
[407]	train-mae:0.07531	eval-mae:0.08273
[408]	train-mae:0.07530	eval-mae:0.08273
[409]	train-mae:0.07527	eval-mae:0.08271
[410]	train-mae:0.07525	eval-mae:0.08270
[411]	train-mae:0.07523	eval-mae:0.08270
[412]	train-mae:0.07522	eval-mae:0.08270
[413]	train-mae:0.07519	eval-mae:0.08269
[414]	train-mae:0.07517	eval-mae:0.08269
[415]	train-mae:0.07516	eval-mae:0.08269
[416]	train-mae:0.07514	eval-mae:0.08268
[417]	train-mae:0.07510	eval-mae:0.08265
[418]	train-mae:0.07508	eval-mae:0.08265
[419]	train-mae:0.07505	eval-mae:0.08264
[420]	train-mae:0.07502	eval-mae:0.08263
[421]	train-mae:0.07500	eval-mae:0.08261
[422]	train-mae:0.07497	eval-mae:0.08260
[423]	train-mae:0.07493	eval-mae:0.08257
[424]	train-mae:0.07491	eval-mae:0.08257
[425]	train-mae:0.07489	eval-mae:0.08255
[426]	train-mae:



accuracy: 0.13336142270459128
[0]	train-mae:4.70575	eval-mae:4.70686
[1]	train-mae:3.29432	eval-mae:3.29496
[2]	train-mae:2.30632	eval-mae:2.30664
[3]	train-mae:1.61473	eval-mae:1.61494
[4]	train-mae:1.13099	eval-mae:1.13093
[5]	train-mae:0.79412	eval-mae:0.79387
[6]	train-mae:0.56012	eval-mae:0.55990
[7]	train-mae:0.39870	eval-mae:0.39855
[8]	train-mae:0.28992	eval-mae:0.28974
[9]	train-mae:0.21920	eval-mae:0.21901
[10]	train-mae:0.17451	eval-mae:0.17444
[11]	train-mae:0.14824	eval-mae:0.14829
[12]	train-mae:0.13295	eval-mae:0.13317
[13]	train-mae:0.12448	eval-mae:0.12478
[14]	train-mae:0.11782	eval-mae:0.11815
[15]	train-mae:0.11453	eval-mae:0.11490
[16]	train-mae:0.11223	eval-mae:0.11265
[17]	train-mae:0.11054	eval-mae:0.11101
[18]	train-mae:0.10954	eval-mae:0.11006
[19]	train-mae:0.10869	eval-mae:0.10924
[20]	train-mae:0.10806	eval-mae:0.10864
[21]	train-mae:0.10746	eval-mae:0.10806
[22]	train-mae:0.10688	eval-mae:0.10752
[23]	train-mae:0.10607	eval-mae:0.10674
[24]	train-mae:0.105

[202]	train-mae:0.08167	eval-mae:0.08604
[203]	train-mae:0.08157	eval-mae:0.08596
[204]	train-mae:0.08155	eval-mae:0.08594
[205]	train-mae:0.08152	eval-mae:0.08594
[206]	train-mae:0.08148	eval-mae:0.08593
[207]	train-mae:0.08144	eval-mae:0.08591
[208]	train-mae:0.08139	eval-mae:0.08589
[209]	train-mae:0.08137	eval-mae:0.08589
[210]	train-mae:0.08136	eval-mae:0.08588
[211]	train-mae:0.08135	eval-mae:0.08588
[212]	train-mae:0.08128	eval-mae:0.08582
[213]	train-mae:0.08123	eval-mae:0.08578
[214]	train-mae:0.08120	eval-mae:0.08577
[215]	train-mae:0.08114	eval-mae:0.08573
[216]	train-mae:0.08110	eval-mae:0.08569
[217]	train-mae:0.08106	eval-mae:0.08567
[218]	train-mae:0.08102	eval-mae:0.08565
[219]	train-mae:0.08097	eval-mae:0.08563
[220]	train-mae:0.08093	eval-mae:0.08561
[221]	train-mae:0.08091	eval-mae:0.08561
[222]	train-mae:0.08089	eval-mae:0.08560
[223]	train-mae:0.08087	eval-mae:0.08559
[224]	train-mae:0.08085	eval-mae:0.08559
[225]	train-mae:0.08082	eval-mae:0.08558
[226]	train-mae:

[402]	train-mae:0.07546	eval-mae:0.08289
[403]	train-mae:0.07544	eval-mae:0.08288
[404]	train-mae:0.07542	eval-mae:0.08288
[405]	train-mae:0.07539	eval-mae:0.08287
[406]	train-mae:0.07538	eval-mae:0.08286
[407]	train-mae:0.07535	eval-mae:0.08284
[408]	train-mae:0.07534	eval-mae:0.08284
[409]	train-mae:0.07532	eval-mae:0.08284
[410]	train-mae:0.07528	eval-mae:0.08282
[411]	train-mae:0.07526	eval-mae:0.08281
[412]	train-mae:0.07522	eval-mae:0.08279
[413]	train-mae:0.07519	eval-mae:0.08277
[414]	train-mae:0.07517	eval-mae:0.08277
[415]	train-mae:0.07515	eval-mae:0.08277
[416]	train-mae:0.07513	eval-mae:0.08276
[417]	train-mae:0.07509	eval-mae:0.08274
[418]	train-mae:0.07508	eval-mae:0.08274
[419]	train-mae:0.07507	eval-mae:0.08274
[420]	train-mae:0.07504	eval-mae:0.08271
[421]	train-mae:0.07504	eval-mae:0.08271
[422]	train-mae:0.07504	eval-mae:0.08271
[423]	train-mae:0.07501	eval-mae:0.08270
[424]	train-mae:0.07498	eval-mae:0.08268
[425]	train-mae:0.07497	eval-mae:0.08268
[426]	train-mae:



accuracy: 0.1337541727566486
[0]	train-mae:4.70589	eval-mae:4.70593
[1]	train-mae:3.29439	eval-mae:3.29424
[2]	train-mae:2.30631	eval-mae:2.30610
[3]	train-mae:1.61468	eval-mae:1.61449
[4]	train-mae:1.13094	eval-mae:1.13081
[5]	train-mae:0.79400	eval-mae:0.79383
[6]	train-mae:0.56002	eval-mae:0.55989
[7]	train-mae:0.39861	eval-mae:0.39851
[8]	train-mae:0.28988	eval-mae:0.28995
[9]	train-mae:0.21872	eval-mae:0.21890
[10]	train-mae:0.17469	eval-mae:0.17492
[11]	train-mae:0.14848	eval-mae:0.14875
[12]	train-mae:0.13339	eval-mae:0.13383
[13]	train-mae:0.12443	eval-mae:0.12498
[14]	train-mae:0.11936	eval-mae:0.11992
[15]	train-mae:0.11632	eval-mae:0.11691
[16]	train-mae:0.11433	eval-mae:0.11497
[17]	train-mae:0.11288	eval-mae:0.11350
[18]	train-mae:0.11153	eval-mae:0.11220
[19]	train-mae:0.11042	eval-mae:0.11115
[20]	train-mae:0.10930	eval-mae:0.11003
[21]	train-mae:0.10826	eval-mae:0.10899
[22]	train-mae:0.10762	eval-mae:0.10836
[23]	train-mae:0.10716	eval-mae:0.10795
[24]	train-mae:0.1059

[202]	train-mae:0.08182	eval-mae:0.08647
[203]	train-mae:0.08179	eval-mae:0.08646
[204]	train-mae:0.08177	eval-mae:0.08645
[205]	train-mae:0.08173	eval-mae:0.08642
[206]	train-mae:0.08167	eval-mae:0.08638
[207]	train-mae:0.08162	eval-mae:0.08636
[208]	train-mae:0.08156	eval-mae:0.08631
[209]	train-mae:0.08147	eval-mae:0.08623
[210]	train-mae:0.08141	eval-mae:0.08618
[211]	train-mae:0.08134	eval-mae:0.08613
[212]	train-mae:0.08130	eval-mae:0.08611
[213]	train-mae:0.08127	eval-mae:0.08611
[214]	train-mae:0.08124	eval-mae:0.08609
[215]	train-mae:0.08123	eval-mae:0.08609
[216]	train-mae:0.08120	eval-mae:0.08606
[217]	train-mae:0.08117	eval-mae:0.08604
[218]	train-mae:0.08113	eval-mae:0.08603
[219]	train-mae:0.08110	eval-mae:0.08600
[220]	train-mae:0.08106	eval-mae:0.08598
[221]	train-mae:0.08101	eval-mae:0.08594
[222]	train-mae:0.08095	eval-mae:0.08589
[223]	train-mae:0.08092	eval-mae:0.08589
[224]	train-mae:0.08088	eval-mae:0.08586
[225]	train-mae:0.08084	eval-mae:0.08584
[226]	train-mae:

[402]	train-mae:0.07544	eval-mae:0.08322
[403]	train-mae:0.07541	eval-mae:0.08319
[404]	train-mae:0.07539	eval-mae:0.08318
[405]	train-mae:0.07537	eval-mae:0.08318
[406]	train-mae:0.07533	eval-mae:0.08314
[407]	train-mae:0.07530	eval-mae:0.08314
[408]	train-mae:0.07528	eval-mae:0.08312
[409]	train-mae:0.07525	eval-mae:0.08311
[410]	train-mae:0.07522	eval-mae:0.08310
[411]	train-mae:0.07519	eval-mae:0.08308
[412]	train-mae:0.07517	eval-mae:0.08307
[413]	train-mae:0.07514	eval-mae:0.08306
[414]	train-mae:0.07511	eval-mae:0.08305
[415]	train-mae:0.07509	eval-mae:0.08305
[416]	train-mae:0.07507	eval-mae:0.08305
[417]	train-mae:0.07505	eval-mae:0.08304
[418]	train-mae:0.07502	eval-mae:0.08304
[419]	train-mae:0.07499	eval-mae:0.08303
[420]	train-mae:0.07498	eval-mae:0.08303
[421]	train-mae:0.07495	eval-mae:0.08302
[422]	train-mae:0.07492	eval-mae:0.08300
[423]	train-mae:0.07489	eval-mae:0.08299
[424]	train-mae:0.07487	eval-mae:0.08298
[425]	train-mae:0.07484	eval-mae:0.08298
[426]	train-mae:



accuracy: 0.13381968043593534
[0]	train-mae:4.70580	eval-mae:4.70687
[1]	train-mae:3.29431	eval-mae:3.29514
[2]	train-mae:2.30624	eval-mae:2.30686
[3]	train-mae:1.61465	eval-mae:1.61506
[4]	train-mae:1.13094	eval-mae:1.13127
[5]	train-mae:0.79404	eval-mae:0.79439
[6]	train-mae:0.56000	eval-mae:0.56055
[7]	train-mae:0.39862	eval-mae:0.39962
[8]	train-mae:0.28979	eval-mae:0.29115
[9]	train-mae:0.21904	eval-mae:0.22063
[10]	train-mae:0.17511	eval-mae:0.17684
[11]	train-mae:0.14903	eval-mae:0.15087
[12]	train-mae:0.13410	eval-mae:0.13598
[13]	train-mae:0.12526	eval-mae:0.12714
[14]	train-mae:0.11972	eval-mae:0.12168
[15]	train-mae:0.11510	eval-mae:0.11707
[16]	train-mae:0.11295	eval-mae:0.11492
[17]	train-mae:0.11159	eval-mae:0.11357
[18]	train-mae:0.11056	eval-mae:0.11257
[19]	train-mae:0.10975	eval-mae:0.11183
[20]	train-mae:0.10869	eval-mae:0.11084
[21]	train-mae:0.10796	eval-mae:0.11013
[22]	train-mae:0.10708	eval-mae:0.10925
[23]	train-mae:0.10668	eval-mae:0.10888
[24]	train-mae:0.106

[202]	train-mae:0.08186	eval-mae:0.08733
[203]	train-mae:0.08183	eval-mae:0.08732
[204]	train-mae:0.08177	eval-mae:0.08729
[205]	train-mae:0.08172	eval-mae:0.08725
[206]	train-mae:0.08168	eval-mae:0.08723
[207]	train-mae:0.08164	eval-mae:0.08722
[208]	train-mae:0.08155	eval-mae:0.08713
[209]	train-mae:0.08145	eval-mae:0.08705
[210]	train-mae:0.08138	eval-mae:0.08700
[211]	train-mae:0.08132	eval-mae:0.08696
[212]	train-mae:0.08124	eval-mae:0.08689
[213]	train-mae:0.08119	eval-mae:0.08685
[214]	train-mae:0.08116	eval-mae:0.08683
[215]	train-mae:0.08113	eval-mae:0.08681
[216]	train-mae:0.08108	eval-mae:0.08677
[217]	train-mae:0.08105	eval-mae:0.08676
[218]	train-mae:0.08101	eval-mae:0.08673
[219]	train-mae:0.08095	eval-mae:0.08669
[220]	train-mae:0.08092	eval-mae:0.08669
[221]	train-mae:0.08088	eval-mae:0.08667
[222]	train-mae:0.08084	eval-mae:0.08666
[223]	train-mae:0.08081	eval-mae:0.08662
[224]	train-mae:0.08080	eval-mae:0.08662
[225]	train-mae:0.08077	eval-mae:0.08661
[226]	train-mae:

[402]	train-mae:0.07541	eval-mae:0.08376
[403]	train-mae:0.07538	eval-mae:0.08374
[404]	train-mae:0.07536	eval-mae:0.08372
[405]	train-mae:0.07534	eval-mae:0.08372
[406]	train-mae:0.07533	eval-mae:0.08372
[407]	train-mae:0.07531	eval-mae:0.08371
[408]	train-mae:0.07530	eval-mae:0.08371
[409]	train-mae:0.07528	eval-mae:0.08371
[410]	train-mae:0.07525	eval-mae:0.08370
[411]	train-mae:0.07525	eval-mae:0.08370
[412]	train-mae:0.07523	eval-mae:0.08369
[413]	train-mae:0.07521	eval-mae:0.08369
[414]	train-mae:0.07518	eval-mae:0.08367
[415]	train-mae:0.07514	eval-mae:0.08365
[416]	train-mae:0.07511	eval-mae:0.08364
[417]	train-mae:0.07508	eval-mae:0.08363
[418]	train-mae:0.07505	eval-mae:0.08359
[419]	train-mae:0.07502	eval-mae:0.08357
[420]	train-mae:0.07500	eval-mae:0.08357
[421]	train-mae:0.07497	eval-mae:0.08356
[422]	train-mae:0.07495	eval-mae:0.08355
[423]	train-mae:0.07494	eval-mae:0.08355
[424]	train-mae:0.07492	eval-mae:0.08355
[425]	train-mae:0.07489	eval-mae:0.08354
[426]	train-mae:



accuracy: 0.13364180222275318
accuracy_ave:  0.13358439541166262


In [42]:
'''
6.作成した複数のモデルは問題ないか試しに一つのモデルで提出して精度を確認する
この時点での精度：0.0869
'''
def predict_and_submit(test_df, model):
    predict_X = xgb.DMatrix(test_df)
    y_pred = model.predict(predict_X)
    columns = sample_submission.columns
    for i in range(len(y_pred)):
        sample_submission[columns[1]][i] = y_pred[i]
    return sample_submission

sample_submission = predict_and_submit(test_df, xgb_models[0])
sample_submission.to_csv('submission_{}{}_{}{}.csv'.format(dt.month, dt.day, dt.hour, dt.minute), index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_submission[columns[1]][i] = y_pred[i]


In [59]:
'''
7.作成した複数のモデルの精度を確認してみる
'''
def check_models_accuracy(df_X, y, models):
    df_X = xgb.DMatrix(df_X, label = y)
    preds = []
    # 結果の検証 
    for model in models:
        preds.append(model.predict(df_X))
    #　各モデル個別の予測精度を平均二乗誤差で確認
    i = 0
    for pred in preds:
        print ("モデル{}の平均2乗誤差:{}".format(i, mean_squared_error(y, pred)))
        i += 1

In [60]:
check_models_accuracy(df_X, y, xgb_models)

モデル0の平均2乗誤差:0.0154910217766461
モデル1の平均2乗誤差:0.015475658233733057
モデル2の平均2乗誤差:0.015516780930203559
モデル3の平均2乗誤差:0.015595420723133042
モデル4の平均2乗誤差:0.015555447558814826


In [75]:
'''
8.作成した複数のモデルを組み合わせてみる
この時点での精度：0.0819<-0.0869
'''
def ensemble_models_xgboost(df_X, y, test_df, models):
    # スタッキングによる予測
    train_df = xgb.DMatrix(df_X)
    test_df = xgb.DMatrix(test_df)

    # 第1段階の予測値(この後、メタモデルの入力に使用)
    preds = []
    for model in models:
        preds.append(model.predict(train_df))

    #第1段階の予測値を積み重ねる
    stack_pred = np.column_stack(preds)

    # メタモデルの学習 
    meta_model = LinearRegression()
    meta_model.fit(stack_pred, y)
    
    # 各モデルの検証データを積み重ねる
    preds_eval = []
    for model in models:
        preds_eval.append(model.predict(test_df))
    stack_test_pred = np.column_stack(preds_eval)

    # スタッキングの検証
    meta_test_pred = meta_model.predict(stack_test_pred)
    return meta_test_pred

In [84]:
test_df[col] = meta_pred
test_df = test_df[['ID', col]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = meta_pred


In [91]:
dt = datetime.datetime.now()
test_df.to_csv('submission_{}{}_{}{}.csv'.format(dt.month, dt.day, dt.hour, dt.minute), index=False)