In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
%matplotlib inline

In [1]:
import os

mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-7.1.0-posix-seh-rt_v5-rev0\\mingw64\\bin'

os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import xgboost as xgb

In [3]:
macro_cols = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]

In [54]:
train = pd.read_csv("cleaned_train.csv", parse_dates=['timestamp'],index_col='id') # index_col='id' if remove bad address, remove id
test = pd.read_csv("cleaned_test.csv", parse_dates=['timestamp'],index_col='id')
df_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols)

train.head()

Unnamed: 0_level_0,timestamp,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,...,area_Jugoistočni,area_Jugozapadni,area_Južni,area_Novomoskovski,area_Sjeverni,area_Sjeveroistočni,area_Sjeverozapadni,area_Troicki,area_Zapadni,area_Zelenogradski
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2011-08-20,43.0,27.0,4.0,,,,,,9,...,0,0,0,0,0,1,0,0,0,0
2,2011-08-23,34.0,19.0,3.0,,,,,,70,...,0,0,1,0,0,0,0,0,0,0
3,2011-08-27,43.0,29.0,2.0,,,,,,129,...,1,0,0,0,0,0,0,0,0,0
4,2011-09-01,89.0,50.0,9.0,,,,,,65,...,0,0,0,0,0,0,1,0,0,0
5,2011-09-05,77.0,77.0,4.0,,,,,,6,...,0,0,0,0,0,0,0,0,0,0


In [55]:
#ylog will be log(1+y), as suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train = np.log1p(train['price_doc'])

train_no_price = train.drop(['price_doc'], axis=1)


#Build df_all = (df_train+df_test).join(df_macro)
train = pd.merge_ordered(train, df_macro, on='timestamp', how='left')
test = pd.merge_ordered(test, df_macro, on='timestamp', how='left')

train.drop('timestamp', axis=1, inplace=True)
test.drop('timestamp', axis=1, inplace=True)

In [56]:
X_train, X_test,y_train, y_test = model_selection.train_test_split(train, ylog_train, test_size=0.4, random_state=0)

In [59]:
for df in [X_train, X_test, y_train, y_test, test]:
    df.fillna(-999999,inplace=True)

In [60]:
dtrain = xgb.DMatrix(X_train, y_train, missing=-999999)
dval = xgb.DMatrix(X_test, y_test,missing=-999999)
dtest = xgb.DMatrix(test,missing=-999999)

In [61]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Uncomment to tune XGB `num_boost_rounds`
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                       early_stopping_rounds=20, verbose_eval=20)

num_boost_round = partial_model.best_iteration

[0]	val-rmse:14.3698
Will train until val-rmse hasn't improved in 20 rounds.
[20]	val-rmse:5.16039
[40]	val-rmse:1.85398
[60]	val-rmse:0.669443
[80]	val-rmse:0.24547
[100]	val-rmse:0.094274
[120]	val-rmse:0.044372
[140]	val-rmse:0.028491
[160]	val-rmse:0.023832
[180]	val-rmse:0.021298
[200]	val-rmse:0.019844
[220]	val-rmse:0.018763
[240]	val-rmse:0.018026
[260]	val-rmse:0.017573
[280]	val-rmse:0.017213
[300]	val-rmse:0.016786
[320]	val-rmse:0.016495
[340]	val-rmse:0.016272
[360]	val-rmse:0.016109
[380]	val-rmse:0.015977
[400]	val-rmse:0.015844
[420]	val-rmse:0.015714
[440]	val-rmse:0.015633
[460]	val-rmse:0.015539
[480]	val-rmse:0.015458
[500]	val-rmse:0.015374
[520]	val-rmse:0.015327
[540]	val-rmse:0.015262
[560]	val-rmse:0.015205
[580]	val-rmse:0.01515
[600]	val-rmse:0.015104
[620]	val-rmse:0.015071
[640]	val-rmse:0.015036
[660]	val-rmse:0.015001
[680]	val-rmse:0.014958
[700]	val-rmse:0.014914
[720]	val-rmse:0.014883
[740]	val-rmse:0.014855
[760]	val-rmse:0.014828
[780]	val-rmse:0.01