In [2]:
import os

import pandas as pd
import numpy as np
from fklearn.training.transformation import onehot_categorizer, custom_transformer
from fklearn.training.regression import xgb_regression_learner
from fklearn.training.pipeline import build_pipeline

from helpers import (display_all, to_snake_case, add_date_parts,
                     tranform_columns_to_categorical, separate_features_by_dtype,
                    to_normalized_string)

In [3]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# modules before each cell executes, so edits to imported project code
# (e.g. `helpers`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [4]:
# Directory holding the competition CSV files, relative to this notebook.
# The trailing slash is significant: the loading cell concatenates the file
# name directly onto this string.
DATA_PATH = "../data/bluebook-for-bulldozers/"

In [21]:
# Read the training set. `saledate` is parsed as a datetime column up front,
# and `low_memory=False` makes pandas read the file in one pass rather than
# inferring dtypes chunk by chunk.
train_csv_path = os.path.join(DATA_PATH, 'Train.csv')
df_raw = pd.read_csv(
    train_csv_path,
    low_memory=False,
    parse_dates=["saledate"],
)

In [22]:
# Apply `to_snake_case` to every column label (the output preview in this
# notebook shows labels such as `sales_id` and `sale_price`).
df_raw = df_raw.rename(columns=to_snake_case)

In [24]:
# Group the column names by dtype using the project helper, then keep the
# group stored under the "object" key as the categorical candidates.
# NOTE(review): presumably the helper returns a mapping of dtype name -> list
# of column names; confirm against `helpers.separate_features_by_dtype`.
columns_dtype = separate_features_by_dtype(df_raw)
categorical_columns = columns_dtype["object"]

In [26]:
# NOTE: this cell overwrites `df_raw` in place, so it is not idempotent:
# running it a second time would take the log of an already-logged price.
# Re-run the loading cells first if this cell needs to be executed again.

# Replace the sale price with its natural logarithm.
log_sale_price = np.log(df_raw["sale_price"])
df_raw["sale_price"] = log_sale_price

# Expand the sale date via the project helper.
# NOTE(review): presumably `drop=True` removes the original date column; confirm.
df_raw = add_date_parts(df_raw, drop=True)


def _normalize_cell(value):
    """Stringify a single cell and pass it through `to_normalized_string`."""
    return to_normalized_string(str(value))


# Normalise every value of the object-typed columns in a single pass.
normalized_categoricals = df_raw[categorical_columns].applymap(_normalize_cell)
df_raw[categorical_columns] = normalized_categoricals

# Convert columns to categoricals, passing an explicit level order for
# `usage_band`.
usage_band_levels = {"usage_band": ["high", "medium", "low"]}
df_raw = tranform_columns_to_categorical(df_raw, ordered=usage_band_levels)

In [30]:
# Preview the first rows, transposed so each column of the wide frame is shown
# as a row of the output.
display_all(df_raw.head().T)

Unnamed: 0,0,1,2,3,4
sales_id,1139246,1139248,1139249,1139251,1139253
sale_price,11.0974,10.9508,9.21034,10.5584,9.30565
machine_id,999089,117657,434808,1026470,1057373
model_id,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneer_id,3,3,3,3,3
year_made,2004,1996,2001,2001,2007
machine_hours_current_meter,68,4640,2838,3486,722
usage_band,low,low,high,high,medium
fi_model_desc,521d,950fii,226,pc120_6e,s175


In [31]:
# Cache the preprocessed frame on disk in Feather format. The directory is
# created first (no error if it already exists) so the write cannot fail on a
# missing folder.
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

### Reload

In [32]:
# Restart point: load the cached frame from 'tmp/bulldozers-raw' and recompute
# the dtype grouping on the reloaded frame.
df_raw = pd.read_feather('tmp/bulldozers-raw')
columns_dtype = separate_features_by_dtype(df_raw)

### Defining the categorical features to use

In [33]:
# Split the category-typed columns by cardinality: columns with more than 100
# distinct values are set aside as high-dimensional, and only the remaining
# ones are kept as categorical features.
categorical_columns = columns_dtype["category"]
high_dimensional_columns = [
    column
    for column in categorical_columns
    if len(df_raw[column].unique()) > 100
]
categorical_columns = [
    column
    for column in categorical_columns
    if column not in high_dimensional_columns
]

### One-Hot Encoder

In [34]:
# Configure the one-hot encoding step for the low-cardinality categorical
# columns. No dataframe is passed here; the result is handed to
# `build_pipeline` in the training cell.
# NOTE(review): per the fklearn docs, `hardcode_nans=True` is expected to add
# an extra indicator column for missing/unseen values and
# `drop_first_column=True` to emit k-1 columns per feature; confirm against
# the installed fklearn version.
onehot_encoder_fn = onehot_categorizer(columns_to_categorize=categorical_columns,
                                       hardcode_nans=True,
                                       drop_first_column=True)

### XGB Training

In [35]:
# Columns excluded from the model inputs: the target, the row identifier and
# the high-cardinality categoricals that are not one-hot encoded.
banned_columns = ["sale_price", "sales_id"] + high_dimensional_columns

# Every remaining column of the frame is used as a feature.
features = [column for column in df_raw.columns if column not in banned_columns]

# Configure the XGBoost regression step that predicts `sale_price`.
# NOTE(review): `encode_extra_cols=True` is presumably what lets the learner
# pick up the columns produced by the one-hot encoder; confirm in fklearn docs.
xgb_learner_fn = xgb_regression_learner(
    features=features,
    target="sale_price",
    encode_extra_cols=True,
)

In [None]:
%pdb

learner = build_pipeline(onehot_encoder_fn, xgb_learner_fn)
predict_fn, training_predictions, logs = learner(df_raw)

Automatic pdb calling has been turned ON


In [None]:
# Display the logs returned by the training pipeline.
logs