In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print('Loading data ...')

# Three inputs, all read from the local data/ directory: the transactions
# that carry the training target, the per-parcel property attributes, and the
# submission template.
train = pd.read_csv('data/train_2016_v2.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, at the cost of a higher peak memory use while
# parsing.
# NOTE(review): the recorded output of this cell shows the tail of a warning
# right after 'Loading data ...', which looks like pandas' mixed-type
# DtypeWarning; confirm it came from this file and is gone after the change.
prop = pd.read_csv('data/properties_2016.csv', low_memory=False)
sample = pd.read_csv('data/sample_submission.csv')

print('Binding to float32')

# Pick out the float64 columns up front, then replace them one at a time with
# their float32 version; columns of any other dtype are left untouched.
float64_cols = prop.columns[(prop.dtypes == np.float64).values]
for col_name in float64_cols:
    prop[col_name] = prop[col_name].astype(np.float32)

print('Creating training set ...')

# Attach the property attributes to every transaction; the left join keeps
# all rows of `train`, matched on parcelid.
df_train = train.merge(prop, how='left', on='parcelid')

# Target vector first, then the feature frame: the join key, the target
# itself, the transaction date and two further columns are excluded.
y_train = df_train['logerror'].values
non_feature_cols = ['parcelid', 'logerror', 'transactiondate',
                    'propertyzoningdesc', 'propertycountylandusecode']
x_train = df_train.drop(non_feature_cols, axis=1)
print(x_train.shape, y_train.shape)

# Kept so the test frame can be built later with the same columns.
train_columns = x_train.columns

# Turn every object-dtype column into a boolean flag: True only where the
# cell equals True, False for anything else (NaN included).
# NOTE(review): a flag stored as text (e.g. 'Y') would become all-False here;
# confirm none of the remaining object columns is encoded that way.
object_cols = x_train.columns[(x_train.dtypes == object).values]
for c in object_cols:
    x_train[c] = x_train[c].eq(True)

#del df_train; gc.collect()

# random split?
split = 80000
###############

# Positional hold-out: rows [0, split) are used for fitting, rows from `split`
# onward for validation. The validation slices are taken first because the
# next line rebinds x_train / y_train to their truncated versions.
# NOTE(review): the rows are not shuffled before splitting (see the open
# question above).
x_valid, y_valid = x_train.iloc[split:], y_train[split:]
x_train, y_train = x_train.iloc[:split], y_train[:split]

print('Building DMatrix...')

# Wrap both splits in xgboost's DMatrix container with their labels attached.
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

#del x_train, x_valid; gc.collect()

print('Training ...')

# Booster settings: learning rate 0.02, trees of depth 4, squared-error
# regression objective, mean absolute error as the evaluation metric.
# NOTE(review): newer xgboost releases rename 'reg:linear' to
# 'reg:squarederror' and replace 'silent' with 'verbosity'; check the
# installed version before changing either.
params = {
    'eta': 0.02,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'max_depth': 4,
    'silent': 1,
}

# Both sets are scored every round and logged every 10 rounds. Per the
# recorded log, early stopping follows 'valid-mae' and halts once it has not
# improved for 100 rounds, up to a cap of 10000 rounds.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=10,
)

#del d_train, d_valid

print('Building test set ...')

# The submission template names its key 'ParcelId'; add a lower-case alias so
# it can be joined against `prop` on 'parcelid'.
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

#del prop; gc.collect()

# Select the training feature columns, in training order. The explicit
# .copy() makes x_test an independent frame, so the column assignments in the
# loop below no longer raise pandas' SettingWithCopyWarning (visible in the
# recorded output of this cell).
x_test = df_test[train_columns].copy()
# Object-dtype columns become boolean flags: True only where the cell equals
# True, False for anything else (NaN included).
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

#del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

#del x_test; gc.collect()

print('Predicting on test ...')

# NOTE(review): xgb.train returns the booster from the last boosting round,
# not the best one. Depending on the xgboost version, predict() may need
# ntree_limit=clf.best_ntree_limit (or iteration_range) to score with the
# early-stopped iteration; confirm against the installed version.
p_test = clf.predict(d_test)

print('Prediction completed')

#del d_test; gc.collect()

# sub = pd.read_csv('data/sample_submission.csv')
# for c in sub.columns[sub.columns != 'ParcelId']:
#     sub[c] = p_test

Loading data ...


  interactivity=interactivity, compiler=compiler, result=result)


Binding to float32
Creating training set ...
(90275, 55) (90275,)
Building DMatrix...
Training ...
[0]	train-mae:0.488065	valid-mae:0.48112
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.402221	valid-mae:0.395444
[20]	train-mae:0.33268	valid-mae:0.326099
[30]	train-mae:0.276518	valid-mae:0.270132
[40]	train-mae:0.231316	valid-mae:0.225213
[50]	train-mae:0.195059	valid-mae:0.189317
[60]	train-mae:0.166121	valid-mae:0.16072
[70]	train-mae:0.143116	valid-mae:0.138042
[80]	train-mae:0.124973	valid-mae:0.120214
[90]	train-mae:0.11079	valid-mae:0.106351
[100]	train-mae:0.099822	valid-mae:0.095702
[110]	train-mae:0.091454	valid-mae:0.087592
[120]	train-mae:0.085149	valid-mae:0.08158
[130]	train-mae:0.080457	valid-mae:0.077192
[140]	train-mae:0.077015	valid-mae:0.074063
[150]	train-mae:0.07451	valid-mae:0.071827
[160]	train-mae:0.072688	valid-mae:0.070245
[170]	train-mae:0.071374	v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Predicting on test ...
Prediction completed


In [11]:
# Preview only the first ten rows of the training features instead of
# rendering the whole frame.
x_train.head(10)

Unnamed: 0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,1.0,,,2.0,3.0,,4.0,2.0,,,...,,False,122754.0,360170.0,2015.0,237416.0,6735.879883,False,,6.037107e+13
1,,,,3.5,4.0,,,3.5,,,...,,False,346458.0,585529.0,2015.0,239071.0,10153.019531,False,,
2,1.0,,,3.0,2.0,,4.0,3.0,,,...,,False,61994.0,119906.0,2015.0,57912.0,11484.480469,False,,6.037464e+13
3,1.0,,,2.0,2.0,,4.0,2.0,,,...,,False,171518.0,244880.0,2015.0,73362.0,3048.739990,False,,6.037296e+13
4,,,,2.5,4.0,,,2.5,,,...,2.0,False,169574.0,434551.0,2015.0,264977.0,5488.959961,False,,6.059042e+13
5,1.0,,,4.0,4.0,,1.0,4.0,,,...,,False,880650.0,2447951.0,2015.0,1567301.0,27126.570312,False,,6.037621e+13
6,,,,1.0,2.0,,7.0,1.0,,,...,,False,64549.0,111521.0,2015.0,46972.0,2304.969971,False,,6.037542e+13
7,,,,2.5,3.0,,,2.5,,853.0,...,2.0,False,107000.0,306000.0,2015.0,199000.0,3745.500000,False,,6.111003e+13
8,,,,1.0,2.0,,,1.0,,,...,,False,66834.0,210064.0,2015.0,143230.0,2172.879883,False,,6.059042e+13
9,,,,2.0,2.0,,,2.0,,,...,1.0,False,109977.0,190960.0,2015.0,80983.0,1940.260010,False,,6.059063e+13


In [13]:
# Number of rows in the raw transactions frame.
train.shape[0]

90275