In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Macro-economic columns to read from macro.csv. The selection comes from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]

In [4]:
# Load the three competition tables, parsing `timestamp` as a datetime in each.
# Only the timestamp plus the selected macro_cols are read from macro.csv.
timestamp_cols = ['timestamp']

df_train = pd.read_csv("train.csv", parse_dates=timestamp_cols)
df_test = pd.read_csv("test.csv", parse_dates=timestamp_cols)
df_macro = pd.read_csv(
    "macro.csv",
    usecols=timestamp_cols + macro_cols,
    parse_dates=timestamp_cols,
)

# Preview the first training rows
df_train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43.0,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34.0,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43.0,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89.0,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77.0,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [5]:
# Target is log(1 + price_doc), as suggested by
# https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

# NOTE: these drops mutate df_train/df_test in place, so this cell can only be
# re-run after the CSV-loading cell has been re-run.
df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = df_train stacked on top of df_test; the first num_train rows are
# the training rows. The merge with df_macro below is currently commented out.
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
#df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year count: number of rows (train + test) sharing the same year/month
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count: number of rows (train + test) sharing the same year/week.
# `Series.dt.weekofyear` no longer exists in pandas 2.x; use `isocalendar().week`
# when it is available and fall back to `weekofyear` on older pandas.
if hasattr(df_all.timestamp.dt, 'isocalendar'):
    week_of_year = df_all.timestamp.dt.isocalendar().week
else:
    week_of_year = df_all.timestamp.dt.weekofyear
week_year = (week_of_year + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Ratio features. A zero denominator is masked to NaN so the ratio becomes a
# missing value instead of +/-inf.
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float).replace(0, np.nan)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float).replace(0, np.nan)

# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp'], axis=1, inplace=True)

(38132, 290)


In [6]:
# Encode categorical (object-dtype) columns as integer codes and put them back
# next to the untouched non-object columns.
df_obj = df_all.select_dtypes(include=['object']).copy()
df_numeric = df_all.select_dtypes(exclude=['object'])

for c in df_obj.columns:
    # pd.factorize returns (codes, uniques); only the integer codes are kept
    codes = pd.factorize(df_obj[c])[0]
    df_obj[c] = codes

# Column order: non-object columns first, then the encoded object columns
df_values = pd.concat([df_numeric, df_obj], axis=1)

In [7]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

# Create a validation set, with last 20% of the training rows
num_val = int(num_train * 0.2)
num_fit = num_train - num_val  # number of rows left for fitting

X_train_all = X_all[:num_train]
X_train = X_all[:num_fit]
X_val = X_all[num_fit:num_train]

# Slice the labels with the same positive bounds as the features. Negative
# slicing (`[:-num_val]` / `[-num_val:]`) silently inverts when num_val == 0:
# it yields an empty training target and a full-length validation target.
ylog_train = ylog_train_all[:num_fit]
ylog_val = ylog_train_all[num_fit:num_train]

# Everything after the training rows is the test set
X_test = X_all[num_train:]

# Column names, in the same order as the columns of X_all
df_columns = df_values.columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

(38132, 295)
('X_train_all shape is', (30470, 295))
('X_train shape is', (24376, 295))
('y_train shape is', (24376,))
('X_val shape is', (6094, 295))
('y_val shape is', (6094,))
('X_test shape is', (7662, 295))


In [8]:
# Overwrite every NaN with the sentinel -99999, in place, in each feature and
# target array (same arrays and same order as before).
for arr in (X_train_all, ylog_train_all,
            X_train, ylog_train,
            X_val, ylog_val,
            X_test):
    nan_mask = np.isnan(arr)
    arr[nan_mask] = -99999

In [11]:
# Wrap each split in an xgboost DMatrix; -99999 is declared as the missing-value
# marker for the features. The test matrix has no labels.
missing_marker = -99999

dtrain_all = xgb.DMatrix(X_train_all, label=ylog_train_all, missing=missing_marker)
dtrain = xgb.DMatrix(X_train, label=ylog_train, missing=missing_marker)
dval = xgb.DMatrix(X_val, label=ylog_val, missing=missing_marker)
dtest = xgb.DMatrix(X_test, missing=missing_marker)

In [12]:
# Booster parameters shared by the tuning run and the final fit
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Tune the number of boosting rounds: fit on the training split only and stop
# once the validation RMSE has not improved for 30 consecutive rounds.
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                       early_stopping_rounds=30, verbose_eval=20)

# `best_iteration` is the 0-based index of the best round, so the number of
# rounds needed to reproduce that model is one more than the index.
num_boost_round = partial_model.best_iteration + 1

Will train until val error hasn't decreased in 30 rounds.
[0]	val-rmse:14.476415
[1]	val-rmse:13.759510
[2]	val-rmse:13.077585
[3]	val-rmse:12.430661
[4]	val-rmse:11.816481
[5]	val-rmse:11.231199
[6]	val-rmse:10.676834
[7]	val-rmse:10.150159
[8]	val-rmse:9.649253
[9]	val-rmse:9.173671
[10]	val-rmse:8.722279
[11]	val-rmse:8.292276
[12]	val-rmse:7.883994
[13]	val-rmse:7.496890
[14]	val-rmse:7.128224
[15]	val-rmse:6.777260
[16]	val-rmse:6.445666
[17]	val-rmse:6.130590
[18]	val-rmse:5.831418
[19]	val-rmse:5.546637
[20]	val-rmse:5.276663
[21]	val-rmse:5.020744
[22]	val-rmse:4.776811
[23]	val-rmse:4.545947
[24]	val-rmse:4.325988
[25]	val-rmse:4.117352
[26]	val-rmse:3.919406
[27]	val-rmse:3.731042
[28]	val-rmse:3.552185
[29]	val-rmse:3.382589
[30]	val-rmse:3.220534
[31]	val-rmse:3.067304
[32]	val-rmse:2.922258
[33]	val-rmse:2.784370
[34]	val-rmse:2.652771
[35]	val-rmse:2.528780
[36]	val-rmse:2.411322
[37]	val-rmse:2.298965
[38]	val-rmse:2.192153
[39]	val-rmse:2.091488
[40]	val-rmse:1.996162
[

In [30]:
# Final fit on all training rows, using the round count chosen by the
# early-stopping run; same parameters as before except silent=0.
final_params = dict(xgb_params)
final_params['silent'] = 0
model = xgb.train(params=final_params, dtrain=dtrain_all, num_boost_round=num_boost_round)

Unnamed: 0,sub_area,key,lat,lon,tolerance_m
0,Bibirevo,1.42239141:10.9185867:13.10061764:Bibirevo,55.891007,37.604844,5
1,Nagatinskij Zaton,9.503405157:3.103995954:6.444333466:Nagatinski...,55.676999,37.673135,5
2,Tekstil'shhiki,5.60479992:2.927487097:6.963402995:Tekstil'shhiki,55.702946,37.741159,10
3,Mitino,2.677824281:14.60650078:17.45719794:Mitino,55.84387,37.34892,10
4,Basmannoe,11.61665314:1.721833675:0.046809568:Basmannoe,55.763482,37.657421,10


In [11]:
# The model predicts log(1 + price); np.expm1 is the matching inverse of the
# np.log1p transform applied to the target (instead of a hand-rolled exp(x) - 1).
ylog_pred = model.predict(dtest)
y_pred = np.expm1(ylog_pred)

# Submission frame: one predicted price per test id
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})

5     16672
10     7162
25     6152
50      484
Name: tolerance_m, dtype: int64

In [33]:
# Write the submission file without the DataFrame index column
df_sub.to_csv(path_or_buf='submission.csv', index=False)