# Simple XGBoost starter
https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

In [3]:
print('Loading data...')
train = pd.read_csv('./input/train_2016_v2.csv')
prop = pd.read_csv('./input/properties_2016.csv')
sample = pd.read_csv('./input/sample_submission.csv')

print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)
        
print('Creating training set...')
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

print('Training...')

params = {}
params['eta'] = 0.02
params['objective'] = "reg:linear"
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

print('Building test set...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
    
del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)
del x_test; gc.collect()

print("Predicting on test...")

p_test = clf.predict(d_test)

del d_test; gc.collect()

sub = pd.read_csv('./input/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c]: p_test

print('Writing csv...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f')

Loading data...


  interactivity=interactivity, compiler=compiler, result=result)


Binding to float32
Creating training set...
(90275, 55) (90275,)
Building DMatrix...
Training...
[0]	train-mae:0.488065	valid-mae:0.481119
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.402221	valid-mae:0.395444
[20]	train-mae:0.332679	valid-mae:0.326099
[30]	train-mae:0.276518	valid-mae:0.270132
[40]	train-mae:0.231316	valid-mae:0.225214
[50]	train-mae:0.195057	valid-mae:0.189328
[60]	train-mae:0.166122	valid-mae:0.160725
[70]	train-mae:0.143116	valid-mae:0.13805
[80]	train-mae:0.124973	valid-mae:0.120241
[90]	train-mae:0.110785	valid-mae:0.106352
[100]	train-mae:0.099816	valid-mae:0.095695
[110]	train-mae:0.091452	valid-mae:0.087611
[120]	train-mae:0.085136	valid-mae:0.081602
[130]	train-mae:0.080447	valid-mae:0.077224
[140]	train-mae:0.077012	valid-mae:0.074089
[150]	train-mae:0.074499	valid-mae:0.07184
[160]	train-mae:0.072682	valid-mae:0.070257
[170]	train-mae:0.071361

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Predicting on test...
Writing csv...
