In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import random
import datetime as dt
from sklearn.metrics import mean_absolute_error

# Read the two raw CSV inputs (paths are relative to the working directory):
# the feature table first, then the label table.
features, labels = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_features.csv', 'data/train_label.csv')
)

<h3> Preprocessing </h3>

Preprocess our input to make it compatible with XGBoost. We need to replace NaNs
and encode categorical features. Since most of them have many classes,
dummy variables would greatly inflate the feature space, so we use label encoding instead.

We also engineer some features, such as the month extracted from the transaction date field.

In [None]:
def drop_unchecked(df, cols):
    """
    An unchecked version of pandas.DataFrame.drop(cols, axis=1): labels that are not
    columns of ``df`` are silently skipped instead of raising an error.
    Be careful though, as this might hide spelling errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from. It is not modified in place.
    cols : str or iterable of str
        Column label(s) to remove. A bare string is treated as ONE label, as
        pandas.DataFrame.drop does, rather than being split into characters.

    Returns
    -------
    pandas.DataFrame
        A new frame without those requested columns that were present.
    """
    # set('ab') would be {'a', 'b'} and drop unrelated single-letter columns,
    # so wrap a lone string before building the label set.
    wanted = {cols} if isinstance(cols, str) else set(cols)
    # A single drop call avoids copying the whole frame once per removed column;
    # errors='ignore' is what makes the drop "unchecked".
    return df.drop(list(wanted), axis=1, errors='ignore')

# Enable OOP usage: df.drop_unchecked(cols) instead of drop_unchecked(df, cols)
pd.DataFrame.drop_unchecked = drop_unchecked

# Drop some useless or extremely rare features. This happens before the fill/encode
# loop so that no work is spent on columns that are discarded anyway; the resulting
# frame is the same as when dropping afterwards because each column is processed
# independently.
features = features.drop_unchecked(['propertyzoningdesc', 'propertycountylandusecode',
                                    'fireplacecnt', 'fireplaceflag'])

for c in features.columns:
    # Replace NaNs with the sentinel value -1
    features[c] = features[c].fillna(-1)
    if features[c].dtype == 'object':
        # Encode categorical features as integer codes (one fresh encoder per column)
        lbl = LabelEncoder()
        features[c] = lbl.fit_transform(list(features[c].values))

# Create additional features. Month will be used to split into training and validation
labels["transactiondate"] = pd.to_datetime(labels["transactiondate"])
labels["Month"] = labels["transactiondate"].dt.month

**Now we are ready to merge our features with their labels. We only keep properties for which the label is known.**

In [None]:
# Left join driven by the labels table: every labelled row is kept and the
# feature columns are attached through the shared 'parcelid' key.
train = pd.merge(labels, features, how='left', on='parcelid')
train.head()

<h3> Split the dataset into training and validation </h3>


In order to reliably evaluate a model we need to eliminate the effect of overfitting. That is achieved by
evaluating our model on data that were not used during training. For this reason we split the data into
training (first 9 months of transactions) and validation (last 3 months). This is roughly equivalent to a 75 - 25 split.

In [None]:
# ID and date are no longer useful
train = train.drop_unchecked(['parcelid', 'transactiondate'])

# Split into training and validation sets by transaction month:
# months below 10 are used for training, months 10 and above for validation.
is_training_month = train["Month"] < 10
is_validation_month = train["Month"] >= 10
x_train = train.loc[is_training_month]
x_val = train.loc[is_validation_month]

# Drop some outliers (training rows only). This is debatable; arguably records
# with extraordinary log error are the most interesting
x_train = x_train.query('logerror > -0.4 and logerror < 0.4')

# Pull the target out as plain arrays ...
y_train, y_val = x_train['logerror'].values, x_val['logerror'].values

# ... and remove it from the model inputs
x_train, x_val = (part.drop_unchecked(['logerror']) for part in (x_train, x_val))

print('Shape train: {}\nShape validation: {}'.format(x_train.shape, x_val.shape))

<h3> Let's train the model! </h3>

Our input is now ready to train a model! The only thing missing is tuning the model's hyper-parameters.
We start by reusing parameters open sourced on Kaggle - we should later use a more thorough approach such as grid search.

In [None]:
# Baseline prediction: the median of the training targets, i.e. the constant that
# optimizes MAE. (The variable is called `y_mean` but holds the median.)
y_mean = np.median(y_train)

##### RUN XGBOOST
print("\nSetting up data for XGBoost ...")

# Number of boosting iterations to run
num_boost_rounds = 500

# xgboost params
# NOTE(review): newer XGBoost releases rename 'reg:linear' to 'reg:squarederror' and
# replace 'silent' with 'verbosity' - confirm against the installed version.
xgb_params = {
    'eta': 0.037,              # learning rate (step-size shrinkage)
    'max_depth': 5,            # maximum depth of each tree
    'subsample': 0.80,         # fraction of training rows sampled per boosting round
    'objective': 'reg:linear', # squared-error regression objective
    'eval_metric': 'mae',      # metric reported during evaluation
    'lambda': 0.8,             # L2 regularization on weights
    'alpha': 0.4,              # L1 regularization on weights
    'base_score': y_mean,      # initial prediction before any tree is added
    'silent': 1                # suppress training messages
}

# Wrap the inputs in XGBoost's own matrix type. Only the training matrix carries
# labels; the validation targets stay in `y_val` for scoring below.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_val)

# Train the model. Expect your machine to overheat here.
print("\nTraining XGBoost ...")
training_params = xgb_params.copy()
training_params['silent'] = 1
model = xgb.train(params=training_params, dtrain=dtrain, num_boost_round=num_boost_rounds)

print("\nPredicting with XGBoost ...")
prediction = model.predict(dtest)

# Score the held-out predictions against the true validation targets
mae = mean_absolute_error(y_true=y_val, y_pred=prediction)

print("\n##########")
print("Mean Absolute Error is: ", mae)
print("##########")

<h3> These results look promising! </h3>

On the first try we achieve an **MAE of 0.06491**. This is already good enough to put us in the top 50% of competitors. It is clear that XGBoost has a lot to offer; careful tuning and ensembling should probably be our next steps.