In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn
import statsmodels.api as sm
import scipy as sc
import xgboost as xgb



In [2]:
def fill_state(df):
    df_state = df[~pd.isnull(df.state) & ~pd.isnull(df.build_year)]
    df_state_grouped = df_state.groupby('build_year')
    maxs={}
    for name,group in df_state_grouped:
        grp = group.groupby('state').count()['full_sq']
        maxs[name] = np.argmax(grp)
    df_state_null = df[pd.isnull(df.state)]
    df_state_null.build_year = df_state_null.build_year.map(maxs)
    df_state_notnull = df[~pd.isnull(df.state)]
    df = pd.concat([df_state_notnull,df_state_null])
    #df = df.drop('latlon')
    return df

In [3]:
def load_data():
    df = pd.read_csv('train.csv')
    df.timestamp = pd.to_datetime(df.timestamp)
    df.lat = df.lat.round(3)
    df.lon = df.lon.round(3)
    df_test = pd.read_csv('test.csv',parse_dates=['timestamp'])
    df = fill_state(df)
    df_test = fill_state(df_test)
    return df,df_test

    

In [4]:
df_train,df_test = load_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [5]:
# ylog will be log(1+y), as suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
#df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')
print(df_all.shape)

# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Other feature engineering
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp'], axis=1, inplace=True)

(38132, 292)


In [6]:
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for c in df_obj: 
    df_obj[c] = pd.factorize(df_obj[c])[0]

df_values = pd.concat([df_numeric, df_obj], axis=1)

In [7]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = X_all[:num_train]
X_train = X_all[:num_train-num_val]
X_val = X_all[num_train-num_val:num_train]
ylog_train = ylog_train_all[:-num_val]
ylog_val = ylog_train_all[-num_val:]

X_test = X_all[num_train:]

df_columns = df_values.columns

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

(38132, 297)
('X_train_all shape is', (30470, 297))
('X_train shape is', (24376, 297))
('y_train shape is', (24376,))
('X_val shape is', (6094, 297))
('y_val shape is', (6094,))
('X_test shape is', (7662, 297))


In [8]:
X_train_all[np.isnan(X_train_all)]=-99999
ylog_train_all[np.isnan(ylog_train_all)]=-99999
X_train[np.isnan(X_train)]=-99999
ylog_train[np.isnan(ylog_train)]=-99999
X_val[np.isnan(X_val)]=-99999
ylog_val[np.isnan(ylog_val)]=-99999
X_test[np.isnan(X_test)]=-99999

In [9]:
dtrain_all = xgb.DMatrix(X_train_all, ylog_train_all,missing=-99999)
dtrain = xgb.DMatrix(X_train, ylog_train,missing=-99999)
dval = xgb.DMatrix(X_val, ylog_val,missing=-99999)
dtest = xgb.DMatrix(X_test,missing=-99999)

In [None]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Uncomment to tune XGB `num_boost_rounds`
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                       early_stopping_rounds=50, verbose_eval=20)

num_boost_round = partial_model.best_iteration

[0]	val-rmse:14.3322
Will train until val-rmse hasn't improved in 50 rounds.
[20]	val-rmse:5.16246
[40]	val-rmse:1.89725
[60]	val-rmse:0.765618
[80]	val-rmse:0.428083
[100]	val-rmse:0.355647
[120]	val-rmse:0.341687
[140]	val-rmse:0.338564
[160]	val-rmse:0.337223
[180]	val-rmse:0.336488
[200]	val-rmse:0.335958


In [None]:
model = xgb.train(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round)