- competition/dataset : [https://www.kaggle.com/c/zillow-prize-1](https://www.kaggle.com/c/zillow-prize-1)
- date : 2021/03/02
- original : [https://www.kaggle.com/aharless/xgboost-lightgbm-and-ols-and-nn](https://www.kaggle.com/aharless/xgboost-lightgbm-and-ols-and-nn)

## XGBoost, LightGBM, and OLS and NN

**✏ 필사 1회** 

In [2]:
# Parameters
FUDGE_FACTOR = 1.1200  # 예측값에 곱함

XGB_WEIGHT = 0.6200
BASELINE_WEIGHT = 0.0100
OLS_WEIGHT = 0.0620
NN_WEIGHT = 0.0800
XGB1_WEIGHT = 0.8000  # 두 XGB 모델의 조합 중 첫 번째 모델의 WEIGHT

BASELINE_PRED = 0.0115

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

### Light GBM

In [4]:
# Read in raw data
print('Reading data from disk ...')
prop = pd.read_csv('data/properties_2016.csv')
train = pd.read_csv('data/train_2016_v2.csv')

Reading data from disk ...


In [5]:
# Process data for lightGBM
print('Processing data for LightGBM ...')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

df_train = train.merge(prop, on='parcelid', how='left')
df_train.fillna(df_train.median(), inplace=True)

x_train = df_train.drop(
    ['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
     'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1
)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

Processing data for LightGBM ...
(90275, 53) (90275,)


In [6]:
# Run lightGBM
params = {
    'max_bin':10,
    'learning_rate':0.0021,
    'boosting_type':'gbdt',
    'objective':'regression',
    'metric':'l1',  # or 'mae'
    'sub_feature':0.345,
    'bagging_fraction':0.85,
    'bagging_freq':40,
    'num_leaves':512,
    'min_data':500,
    'min_hessian':0.05,
    'verbose':0,
    'feature_fraction_seed':2,
    'bagging_seed':3
}

np.random.seed(0)
random.seed(0)

print('Fitting LightGBM model ...')
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()

print('Prepare for LightGBM prediction ...')
print('  Read sample file ...')
sample = pd.read_csv('data/sample_submission.csv')
print('  ...')
sample['parcelid'] = sample['ParcelId']
print('  Merge with property data ...')
df_test = sample.merge(prop, on='parcelid', how='left')
print('  ...')
del sample, prop; gc.collect()
print('  ...')
x_test = df_test[train_columns]
print('  ...')
del df_test; gc.collect()
print('  Preparing x_test ...')
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print('  ...')
x_test = x_test.values.astype(np.float32, copy=False)

print('Start LightGBM prediction ...')
p_test = clf.predict(x_test)
del x_test; gc.collect()

print('Unadjusted LightGBM predictions:')
print(pd.DataFrame(p_test).head())

Fitting LightGBM model ...
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Prepare for LightGBM prediction ...
  Read sample file ...
  ...
  Merge with property data ...
  ...
  ...
  ...
  Preparing x_test ...
  ...
Start LightGBM prediction ...
Unadjusted LightGBM predictions:
          0
0  0.031132
1  0.033375
2  0.010257
3  0.008651
4  0.009660


### XGBoost

In [7]:
# re-read properties file
print('Re-reading properties file ...')
properties = pd.read_csv('data/properties_2016.csv')

Re-reading properties file ...


In [8]:
# process data for xgboost
print('Processing data for XGBoost ...')
for c in properties.columns:
    properties[c].fillna(-1, inplace=True)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))

train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
print('Shape train: {}\nShape test: {}'.format(x_train.shape, y_train.shape))

Processing data for XGBoost ...
Shape train: (90275, 57)
Shape test: (90275,)


In [9]:
# drop out outliers
train_df = train_df[(train_df['logerror'] > -0.4)&(train_df['logerror'] < 0.419)]
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df['logerror'].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))

After removing outliers:
Shape train: (88528, 57)
Shape test: (2985217, 57)


In [10]:
# run xgboost
print('Setting up data for XGBoost ...')
xgb_params = {
    'eta':0.037,
    'max_depth':5,
    'subsample':0.80,
    'objective':'reg:linear',
    'eval_metric':'mae',
    'lambda':0.8,   
    'alpha':0.4, 
    'base_score':y_mean,
    'silent':1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

num_boost_rounds = 250
print('num_boost_rounds=' + str(num_boost_rounds))

print('Training XGBoost ...')
model = xgb.train(xgb_params, dtrain, num_boost_rounds)

print('Predicting with XGBoost ...')
xgb_pred1 = model.predict(dtest)

print('First XGBoost predictions:')
print(pd.DataFrame(xgb_pred1).head())

Setting up data for XGBoost ...
num_boost_rounds=250
Training XGBoost ...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Predicting with XGBoost ...
First XGBoost predictions:
          0
0 -0.030616
1 -0.028188
2  0.026397
3  0.063728
4  0.004398


In [11]:
# run XGBoost again
print('Setting up data for XGBoost')
xgb_params = {
    'eta':0.033,
    'max_depth':6,
    'subsample':0.80,
    'objective':'reg:linear',
    'eval_metric':'mae',
    'base_score':y_mean,
    'silent':1
}
num_boost_rounds = 150
print('num_boost_rounds=' + str(num_boost_rounds))

print('Training XGBoost again ...')
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)

print('Predicting with XGBoost again ...')
xgb_pred2 = model.predict(dtest)

print('Second XGBoost predicitions:')
print(pd.DataFrame(xgb_pred2).head())

Setting up data for XGBoost
num_boost_rounds=150
Training XGBoost again ...
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Predicting with XGBoost again ...
Second XGBoost predicitions:
          0
0 -0.091150
1 -0.034722
2  0.015816
3  0.075518
4  0.029908


In [12]:
# combine XGBoost results
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1 - XGB1_WEIGHT)*xgb_pred2

print('Combined XGBoost predictions:')
print(pd.DataFrame(xgb_pred).head())

del train_df, x_train, x_test, properties, dtest, dtrain, xgb_pred1, xgb_pred2
gc.collect()

Combined XGBoost predictions:
          0
0 -0.042723
1 -0.029495
2  0.024281
3  0.066086
4  0.009500


138

### Neural Network

In [None]:
# read in data for neural network
print('Processing data for Neural Network ...')
print('Loading train, prop and sample data ...')
train = pd.read_csv('data/train_2016_v2.csv', parse_dates=['transactiondate'])
prop = pd.read_csv('data/properties_2016.csv')
sample = pd.read_csv('data/sample_submission.csv')

print('Fitting Label Encoder on properties ...')
for c in prop.columns:
    prop[c].fillna(-1, inplace=True)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        lbl.transform(list(prop[c].values))

print('Creating training set ...')
df_train = train.merge(prop, on='parcelid', how='left')

df_train['transactiondate'] = pd.to_datetime(df_train['transactiondate'])
df_train['transactiondate_year'] = df_train['transactiondate'].dt.year
df_train['transactiondate_month'] = df_train['transactiondate'].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train['transactiondate'] = df_train['transactiondate'].dt.day

print('Filling NA/NaN values ...')
df_train.fillna(-1.0, inplace=True)

print('Creating x_train and y_train from df_train ...')
x_train = df_train.drop(
    ['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
     'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror']

y_mean = np.mean(y_train)
print(x_train.shape, y_train.shape)

Processing data for Neural Network ...
Loading train, prop and sample data ...
Fitting Label Encoder on properties ...


In [None]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = x_train[c] == True

print('Creating df_test ...')
sample['parcelid'] = sample['ParcelId']

print('Merging Sample with property data ...')
df_test = sample.merge(prop, on='parcelid', how='left')

df_test['transactiondate'] = pd.to_datetime('2016-11-15')
df_test['transactiondate_year'] = df_test['transactiondate'].dt.year
df_test['transactiondate_month'] = df_test['transactiondate'].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test['transactiondate'] = df_test['transactiondate'].dt.day
x_test = df_test[train_columns]

print('Shape of x_test:', x_test.shape)
print('Preparing x_test ...')
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = x_test[c] == True

In [None]:
# preprocessing
print('Preprocessing neural network data ...')
imputer = SimpleImputer()
imputer.fit(x_train)
x_train = imputer.transform(x_train)
imputer.fit(x_test)
x_test = imputer.transform(x_test)

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

len_x = int(x_train.shape[1])
print('len_x is:', len_x)

In [None]:
# Neural Network
print('Setting up neural network model ...')
nn = Sequential()
nn.add(Dense(units=400, kernel_initializer='normal', input_dim=len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units=160, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(Dropout(.6))
nn.add(Dense(units=64, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(Dropout(.5))
nn.add(Dense(units=26, kernel_initializer='normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

print('Fitting neural network model ...')
nn.fit(np.array(x_train), np.array(y_train), batch_size=32, epochs=70, verbose=2)

print('Predicting with neural network model ...')
y_pred_ann = nn.predict(x_test)

print('Preparing results for write ...')
nn_pred = y_pred_ann.flatten()
print('Type of nn_pred is', type(nn_pred))
print('Shape of nn_pred is', nn_pred.shape)

print('Neural Network predictions:')
print(pd.DataFrame(nn_pred).head())

del train, prop, sample, x_train, x_test, df_train, df_test, y_pred_add
gc.collect()

### OLS

In [None]:
np.random.seed(17)
random.seed(17)

print('Processing data for OLS ...')
train = pd.read_csv('data/train_2016_v2.csv', parse_dates=['transactiondate'])
prop = pd.read_csv('data/properties_2016.csv')
submission = pd.read_csv('data/sample_submission.csv')
print(len(train), len(properties), len(submission))

In [None]:
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df.fillna(-1.0, inplace=True)
    return df

def MAE(y, ypred):
    return np.sum([abs(y[i] - ypred[i]) for i in range(len(y))]) / len(y)

In [None]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = []

exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror', 'parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
test['transactiondate'] = '2016-01-01'
test = get_features(test[col])

In [None]:
print('Fitting OLS ...')
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y)
print('fit ...')
train = []
y = []

test_dates = [
    '2016-10-01', '2016-11-01', '2016-12-01', '2017-10-01', '2017-11-01', '2017-12-01'
]
test_columns = ['201610', '201611', '201612', '201710', '201711', '201712']

### Combine Predictions

In [None]:
print('Combining XGBoost, LightGBM, NN, and baseline predictions ...')
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT 
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
nn_weight0 = NN_WEIGHT / (1 - OLS_WEIGHT)
pred0 = 0
pred0 += xgb_weight0*xgb_pred
pred0 += baseline_weight0*BASELINE_PRED
pred0 += lgb_weight*p_test
pred0 += nn_weight0*nn_pred

print('Combined XGB/LGB/NN/baseline predictions:')
print(pd.DataFrame(pred0).head())

print('Predicting with OLS and combining with XGB/LGB/NN/baseline predictions: ...')
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = FUDGH_FACTOR * (OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*pred0)
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict ...', i)

print('Combined XGB/LGB/NN/baseline/OLS predictions:')
print(submission.head())

### Write the Results

In [None]:
from datetime import datetime
print('Writing results to disk ...')
submission.to_csv('data/submission_4_sub{}.csv'.format(datetime.now().strftime(%Y%m%d_%H%M%S)), index=False)
print('Finished ...')