In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score as cv_score
from sklearn.decomposition import PCA, FastICA
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics 
from sklearn.ensemble import RandomForestRegressor
import datetime
import copy
%matplotlib inline

In [42]:
def error(y_pred, y_true):
    y_pred = np.log1p(y_pred)
    y_true = np.log1p(y_true)
    err0_ = ((y_pred - y_true) ** 2).mean() ** (0.5)
    return np.array([err0_])

# Read & Prepare

In [43]:
train = pd.read_csv('train.csv', parse_dates=['timestamp'])
test = pd.read_csv('test.csv', parse_dates=['timestamp'])
macro = pd.read_csv('macro.csv', parse_dates=['timestamp'])
id_test = test.id


#clear
bad_index = train[train.life_sq > train.full_sq].index
train.loc[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.loc[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.loc[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.loc[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.loc[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.loc[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.loc[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.loc[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.loc[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.loc[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = [23584]
train.loc[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.loc[bad_index, "state"] = np.NaN
test.state.value_counts()

train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]

In [44]:
# time features
month_year = (train['timestamp'].dt.month + train['timestamp'].dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)

month_year = (test['timestamp'].dt.month + test['timestamp'].dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)

week_year = (train['timestamp'].dt.weekofyear + train['timestamp'].dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)

week_year = (test['timestamp'].dt.weekofyear + test['timestamp'].dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)

train['month'] = train['timestamp'].dt.month
train['dow'] = train['timestamp'].dt.dayofweek

test['month'] = test['timestamp'].dt.month
test['dow'] = test['timestamp'].dt.dayofweek

# some usefull features
train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)

test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)

train['apartment_name']=train.sub_area + train['metro_km_avto'].astype(str)
test['apartment_name'] =test.sub_area + train['metro_km_avto'].astype(str)

train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

In [45]:
x_train = train.drop(["id", "timestamp", "price_doc"], axis=1)
y_train = train["price_doc"]
x_test = test.drop(["id", "timestamp"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))

x_train = x_all[:num_train]
x_test = x_all[num_train:]

In [46]:
x_train.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,sport_count_5000,market_count_5000,month_year_cnt,week_year_cnt,month,dow,rel_floor,rel_kitch_sq,apartment_name,room_size
0,43.0,27.0,4.0,,,,,,,0,...,52,4,3,1,8,5,,,873,
1,34.0,19.0,3.0,,,,,,,0,...,66,14,3,2,8,1,,,9179,
2,43.0,29.0,2.0,,,,,,,0,...,67,10,3,2,8,5,,,16475,
3,89.0,50.0,9.0,,,,,,,0,...,26,3,39,1,9,3,,,8583,
4,77.0,77.0,4.0,,,,,,,0,...,195,14,39,5,9,0,,,590,


In [47]:
x_test.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,sport_count_5000,market_count_5000,month_year_cnt,week_year_cnt,month,dow,rel_floor,rel_kitch_sq,apartment_name,room_size
0,39.0,20.7,2.0,9.0,1.0,1998.0,1.0,8.9,3.0,0,...,14,1,396,59,7,2,0.222222,0.228205,5066,20.7
1,79.2,,8.0,17.0,1.0,,3.0,,1.0,1,...,12,1,396,59,7,2,0.470588,,12889,
2,40.5,25.1,3.0,5.0,2.0,1960.0,2.0,4.8,2.0,0,...,71,11,396,59,7,2,0.6,0.118519,11443,12.55
3,62.8,36.0,17.0,17.0,1.0,2016.0,2.0,,3.0,1,...,2,0,396,59,7,2,1.0,,13380,18.0
4,40.0,40.0,17.0,17.0,1.0,,1.0,,1.0,1,...,11,1,396,59,7,2,1.0,,12931,40.0


# XGBoost

In [48]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

In [49]:
dtrain = xgb.DMatrix(x_train, (y_train))
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=True)

[0]	train-rmse:8.18112e+06+24020.7	test-rmse:8.18874e+06+52006.2
[10]	train-rmse:5.36485e+06+16101.6	test-rmse:5.44402e+06+47780
[20]	train-rmse:3.78827e+06+12355.6	test-rmse:3.96189e+06+53371.3
[30]	train-rmse:2.93797e+06+2181.65	test-rmse:3.21812e+06+66801.6
[40]	train-rmse:2.49423e+06+1602.6	test-rmse:2.87221e+06+63442.2
[50]	train-rmse:2.265e+06+940.872	test-rmse:2.71277e+06+56696
[60]	train-rmse:2.14339e+06+3111.33	test-rmse:2.64354e+06+52480
[70]	train-rmse:2.06988e+06+3342.54	test-rmse:2.6044e+06+47288.9
[80]	train-rmse:2.01903e+06+5661.87	test-rmse:2.58184e+06+45239.4
[90]	train-rmse:1.98293e+06+5790.79	test-rmse:2.56681e+06+40755.6
[100]	train-rmse:1.95145e+06+3050.57	test-rmse:2.55663e+06+37876.9
[110]	train-rmse:1.92296e+06+5979.09	test-rmse:2.54863e+06+38244.7
[120]	train-rmse:1.89617e+06+11140.3	test-rmse:2.54147e+06+35830.3
[130]	train-rmse:1.8729e+06+9278.52	test-rmse:2.53564e+06+34694.4
[140]	train-rmse:1.84828e+06+7606.47	test-rmse:2.53009e+06+33487.1
[150]	train-rmse:

In [50]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 330
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

y_predict_xgb = model.predict(dtest)

In [51]:
output = pd.DataFrame({'id': id_test, 'price_doc': y_predict_xgb})
output.to_csv('model1_xgb.csv', index=None)
output.head()

Unnamed: 0,id,price_doc
0,30474,5587276.0
1,30475,8306189.5
2,30476,5483343.5
3,30477,5892326.5
4,30478,4888475.0


# LightGBM

In [52]:
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.04,
    'verbose': 10,
    'num_leaves': 32,
    'bagging_fraction': 0.99,
    'bagging_freq': 10,
    'feature_fraction': 0.7,
    'max_bin': 500,
    'max_depth': 10,
}

In [53]:
ltrain = lgb.Dataset(x_train, (y_train)/1e6)
cv_output = lgb.cv(lgb_params, ltrain, num_boost_round=1000, nfold = 3, early_stopping_rounds=50,
    verbose_eval=50, show_stdv=True)

[50]	cv_agg's rmse: 2.77422 + 0.0262748
[100]	cv_agg's rmse: 2.58877 + 0.0238566
[150]	cv_agg's rmse: 2.53969 + 0.0224513
[200]	cv_agg's rmse: 2.51585 + 0.0232622
[250]	cv_agg's rmse: 2.50514 + 0.0223479
[300]	cv_agg's rmse: 2.49849 + 0.0202024
[350]	cv_agg's rmse: 2.49498 + 0.0195224
[400]	cv_agg's rmse: 2.49417 + 0.0193251
[450]	cv_agg's rmse: 2.49307 + 0.0200161
[500]	cv_agg's rmse: 2.49234 + 0.0191


In [54]:
ltrain = lgb.Dataset(x_train, y_train/1e6)
num_boost_round = 540

model = lgb.train(lgb_params, ltrain, num_boost_round=num_boost_round)

y_predict_lgb = model.predict(x_test)*1e6

In [55]:
output_lgb = pd.DataFrame({'id': id_test, 'price_doc': y_predict_lgb})
output_lgb.to_csv('model1_lgb.csv', index=None)
output_lgb.head()

Unnamed: 0,id,price_doc
0,30474,5391550.0
1,30475,8274639.0
2,30476,5323021.0
3,30477,5769113.0
4,30478,4884380.0


# RandomForest

In [56]:
rf = RandomForestRegressor(n_estimators=200, max_features=120, n_jobs=-1, random_state=128)

x_train_rf = x_train.fillna(x_train.median())
x_test_rf  = x_test.fillna(x_train.median())

rf.fit(x_train_rf, y_train)

y_predict_rf = rf.predict(x_test_rf)

In [57]:
output_rf = pd.DataFrame({'id': id_test, 'price_doc': y_predict_rf})
output_rf.to_csv('model1_rfr.csv', index=None)
output_rf.head()

Unnamed: 0,id,price_doc
0,30474,5330462.82
1,30475,8408136.49
2,30476,5246848.5
3,30477,5942082.89
4,30478,5001212.995


# Combine models

In [58]:
y_res = 0.4*y_predict_xgb + 0.4*y_predict_lgb + 0.2*y_predict_rf
output_rf = pd.DataFrame({'id': id_test, 'price_doc': y_res})
output_rf.to_csv('model1_res.csv', index=None)
output_rf.head()

Unnamed: 0,id,price_doc
0,30474,5457623.0
1,30475,8313959.0
2,30476,5371915.0
3,30477,5852993.0
4,30478,4909385.0


In [None]:
#  private lb  ~ 0.31539 
#  public  lb  ~ 0.31319
#  ~ 110 place in lb