- competition/dataset : [https://www.kaggle.com/c/zillow-prize-1](https://www.kaggle.com/c/zillow-prize-1)
- date : 2021/03/01
- original : [https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655](https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655)

## Simple XGVoost Starter (~0.0655)

**✏ 필사 1회** 

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

import warnings
warnings.filterwarnings('ignore')

In [2]:
print('Loading data ...')
train = pd.read_csv('data/train_2016_v2.csv')
prop = pd.read_csv('data/properties_2016.csv')
sample = pd.read_csv('data/sample_submission.csv')

Loading data ...


In [3]:
train.head(3)

Unnamed: 0,parcelid,logerror,transactiondate
0,11016594,0.0276,2016-01-01
1,14366692,-0.1684,2016-01-01
2,12098116,-0.004,2016-01-01


In [4]:
prop.head(3)

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,


In [5]:
print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


In [6]:
print('Creating training set ...')
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

Creating training set ...
(90275, 55) (90275,)


In [7]:
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train
gc.collect()

59

In [8]:
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

In [9]:
print('Building DMatrix ...')
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, y_valid
gc.collect()

Building DMatrix ...


55

In [10]:
print('Training ...')
params = {
    'eta':0.02,
    'objective':'reg:linear',
    'eval_metric':'mae',
    'max_depth':4,
    'silent':1
}

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

Training ...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mae:0.48807	valid-mae:0.48112
[10]	train-mae:0.40222	valid-mae:0.39544
[20]	train-mae:0.33269	valid-mae:0.32610
[30]	train-mae:0.27652	valid-mae:0.27013
[40]	train-mae:0.23132	valid-mae:0.22521
[50]	train-mae:0.19506	valid-mae:0.18933
[60]	train-mae:0.16612	valid-mae:0.16073
[70]	train-mae:0.14312	valid-mae:0.13805
[80]	train-mae:0.12497	valid-mae:0.12024
[90]	train-mae:0.11078	valid-mae:0.10635
[100]	train-mae:0.09982	valid-mae:0.09570
[110]	train-mae:0.09145	valid-mae:0.08761
[120]	train-mae:0.08514	valid-mae:0.08160
[130]	train-mae:0.08045	valid-mae:0.07722
[140]	train-mae:0.07700	valid-mae:0.07408
[150]	train-mae:0.07450	valid-mae:0.07184
[160]	train-mae:0.07268	valid-mae:0.07025
[1

In [11]:
print('Building test set ...')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop
gc.collect()

Building test set ...


0

In [12]:
x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample
gc.collect()

22

In [13]:
d_test = xgb.DMatrix(x_test)

del x_test
gc.collect()

31

In [14]:
print('Predicting on test ...')
p_test = clf.predict(d_test)

del d_test
gc.collect()

Predicting on test ...


22

In [15]:
sub = pd.read_csv('data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

print('Writing csv ...')
sub.to_csv('data/submission_2_xgb_starter.csv', index=False, float_format='%.4f')

Writing csv ...
