In [168]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
warnings.filterwarnings(action='ignore')

In [169]:
path='input/'
df_train=pd.read_csv(path+'train.csv',parse_dates=['timestamp'])
df_test=pd.read_csv(path+'test.csv',parse_dates=['timestamp'])
df_macro=pd.read_csv(path+'macro.csv')

In [170]:
y_train = df_train['price_doc'].values
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Other feature engineering
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp'], axis=1, inplace=True)

(38133, 389)


In [171]:
threshold=0.85
sel_cols_1=df_all.columns[df_all[:num_train].isnull().mean()<threshold].values
sel_cols_2=df_all[num_train:][sel_cols_1].isnull().mean()<threshold
sel_cols_2=sel_cols_2[sel_cols_2==1].index.values

In [172]:
df = df_all[sel_cols_2].select_dtypes(exclude=['object'])

In [173]:
bool_cols=['culture_objects_top_25',
       'thermal_power_plant_raion', 'incineration_raion',
       'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion',
       'big_market_raion', 'nuclear_reactor_raion',
       'detention_facility_raion', 'water_1line', 'big_road1_1line',
       'railroad_1line']
dummy_cols=['product_type', 'sub_area','ecology']

In [174]:
df_bools=df_all[bool_cols]
df_dummies=df_all[dummy_cols]

In [175]:
for item in dummy_cols:
    df=pd.concat([df,pd.get_dummies(df_dummies[item],prefix=item)],axis=1)
for item in bool_cols:
    df_bools[item] = pd.factorize(df_bools[item])[0]
df=pd.concat([df,df_bools],axis=1)

In [177]:
df.shape,df_test.shape,df_train.shape

((38133, 445), (7662, 290), (30471, 290))

In [178]:
X_train = df[:num_train]
X_test = df[num_train:]

In [179]:
y_train=np.log(y_train)

In [180]:
xgb_params = {
    'eta': 0.03,
    'max_depth': 5,
    'subsample': 0.45,
    'colsample_bytree': 0.55,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'nthread':4,
}

In [181]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

In [182]:
cv_result = xgb.cv(xgb_params, dtrain, nfold=5, num_boost_round=1000, early_stopping_rounds=20,verbose_eval=20)

[0]	train-rmse:14.6686+0.00145341	test-rmse:14.6686+0.00599924
[20]	train-rmse:7.99088+0.00069398	test-rmse:7.9911+0.00594444
[40]	train-rmse:4.3676+0.000281163	test-rmse:4.36783+0.00553101
[60]	train-rmse:2.41114+0.000364092	test-rmse:2.41276+0.004923
[80]	train-rmse:1.36977+0.000608347	test-rmse:1.37392+0.00460517
[100]	train-rmse:0.839029+0.000960514	test-rmse:0.847606+0.00378347
[120]	train-rmse:0.593543+0.000918135	test-rmse:0.608031+0.00296154
[140]	train-rmse:0.494477+0.00108991	test-rmse:0.51443+0.00310962
[160]	train-rmse:0.457566+0.000940481	test-rmse:0.481946+0.00381219
[180]	train-rmse:0.443417+0.00107394	test-rmse:0.471045+0.00413396
[200]	train-rmse:0.436255+0.000850229	test-rmse:0.467168+0.00428193
[220]	train-rmse:0.431711+0.000799217	test-rmse:0.465556+0.00439584
[240]	train-rmse:0.427983+0.000823675	test-rmse:0.464554+0.00453208
[260]	train-rmse:0.424578+0.000656242	test-rmse:0.463923+0.00455773
[280]	train-rmse:0.421553+0.000668553	test-rmse:0.463525+0.00449754
[300]

In [None]:
clf=xgb.train(xgb_params,dtrain,num_boost_round=400)

In [154]:
pd.Series(clf.get_fscore())

(275,)

In [159]:
y_pred = np.exp(clf.predict(dtest))
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
df_sub.to_csv('res/xgb_4_29_v1.csv', index=False)

ValueError: array length 160194 does not match index length 7662

In [162]:
y_pred.shape,id_test.shape

((160194,), (7662,))

In [164]:
X_train.shape

(30471, 445)

In [166]:
df_train.shape,df_test.shape

((30471, 292), (7662, 291))