In [77]:
import os
import math
from typing import Callable

import pandas as pd
import numpy as np
from fklearn.training.transformation import onehot_categorizer, custom_transformer, count_categorizer
from fklearn.training.regression import xgb_regression_learner, rf_regression_learner
from fklearn.training.pipeline import build_pipeline
from fklearn.validation.evaluators import r2_evaluator, mse_evaluator, combined_evaluators
from fklearn.training.imputation import imputer

from sklearn.model_selection import train_test_split

from helpers import (display_all, to_snake_case, add_date_parts,
                     tranform_columns_to_categorical, separate_features_by_dtype,
                    to_normalized_string, draw_tree)

In [57]:
def print_metrics(df: pd.DataFrame, predict_fn: Callable, target_column: str = "sale_price") -> None:
    """Score ``predict_fn`` on ``df`` and print the RMSE and R2 of its predictions.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to evaluate on. It must contain ``target_column``; it is copied
        before being handed to ``predict_fn``, so the caller's frame is not
        passed to the prediction function directly.
    predict_fn : Callable
        Function that receives a DataFrame and returns it with the prediction
        column the fklearn evaluators look for.
    target_column : str, default "sale_price"
        Name of the column holding the true values. The default keeps the
        behaviour this notebook relies on.

    Notes
    -----
    The RMSE is expressed in the units of ``target_column``. In this notebook
    the target is log-transformed during preprocessing, so the printed RMSE is
    on the log scale.
    """
    evaluator_fn = combined_evaluators(evaluators=[r2_evaluator(target_column=target_column),
                                                   mse_evaluator(target_column=target_column)])
    # Predict on a copy so predict_fn never receives the caller's own frame.
    scored_df = predict_fn(df.copy())
    metrics = evaluator_fn(scored_df)

    # The evaluators are created without an explicit eval_name, so their results
    # are stored under "<evaluator name>__<target column>".
    rmse = math.sqrt(metrics[f"mse_evaluator__{target_column}"])
    r2 = metrics[f"r2_evaluator__{target_column}"]

    print(f"""
RMSE: {rmse}
R2: {r2}
    """)

In [2]:
# Load IPython's autoreload extension and use mode 2, which re-imports modules
# before each cell executes so edits to imported .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline

In [5]:
# Directory (relative to the notebook's working directory) that holds the
# dataset files read below, e.g. Train.csv.
DATA_PATH = "../data/bluebook-for-bulldozers/"

In [4]:
# Read the training CSV; "saledate" is parsed into datetimes at load time, and
# low_memory=False makes pandas read the file in one pass rather than in chunks.
train_csv_path = os.path.join(DATA_PATH, "Train.csv")
df_raw = pd.read_csv(train_csv_path, parse_dates=["saledate"], low_memory=False)

In [5]:
# Apply to_snake_case to every column label (index labels are left untouched).
df_raw = df_raw.rename(columns=to_snake_case)

In [6]:
# Group the column names of df_raw by dtype. The result is indexed by dtype name
# below (presumably a mapping of dtype name -> list of column names; see helpers).
columns_dtype = separate_features_by_dtype(df_raw)
# Columns loaded with the generic "object" dtype; these are the ones that get
# stringified and normalised in the preprocessing cell that follows.
categorical_columns = columns_dtype["object"]

In [7]:
# NOTE: this cell rewrites df_raw in place and is not idempotent -- running it a
# second time without reloading the CSV would apply the log transform twice.

# Work with the natural log of the sale price from here on.
df_raw["sale_price"] = np.log(df_raw["sale_price"])

# Derive date-part columns via the helper (drop=True is forwarded to it;
# presumably it removes the original date column -- see helpers).
df_raw = add_date_parts(df_raw, drop=True)

# Turn every value of the object columns into a string and normalise it,
# element by element.
normalized_categoricals = df_raw[categorical_columns].applymap(
    lambda value: to_normalized_string(str(value))
)
df_raw[categorical_columns] = normalized_categoricals

# Convert columns to categoricals; usage_band is given an explicit category order.
df_raw = tranform_columns_to_categorical(
    df_raw,
    ordered={"usage_band": ["high", "medium", "low"]},
)

In [8]:
# Preview the first rows, transposed so that every column of the wide frame is
# shown as a row.
display_all(df_raw.head().T)

Unnamed: 0,0,1,2,3,4
sales_id,1139246,1139248,1139249,1139251,1139253
sale_price,11.0974,10.9508,9.21034,10.5584,9.30565
machine_id,999089,117657,434808,1026470,1057373
model_id,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneer_id,3,3,3,3,3
year_made,2004,1996,2001,2001,2007
machine_hours_current_meter,68,4640,2838,3486,722
usage_band,low,low,high,high,medium
fi_model_desc,521d,950fii,226,pc120_6e,s175


In [9]:
# Cache the preprocessed frame on disk so a later session can resume from the
# "Reload" section instead of re-reading and re-processing the CSV.
os.makedirs('tmp', exist_ok=True)  # exist_ok: no error if the directory is already there
df_raw.to_feather('tmp/bulldozers-raw')

### Reload

In [40]:
# Restore the cached, preprocessed frame written to 'tmp/bulldozers-raw'.
df_raw = pd.read_feather('tmp/bulldozers-raw')
# Recompute the dtype grouping for the preprocessed frame; the cells below look
# up the "category", "float64" and "int64" groups.
columns_dtype = separate_features_by_dtype(df_raw)

## Processors

### One-Hot Encoder

In [41]:
# Split the categorical columns by cardinality. A column with more than 100
# distinct values (a missing value counts as one) is "high dimensional" and is
# count-encoded later; the remaining ones are one-hot encoded.
category_columns = columns_dtype["category"]
high_dimensional_columns = [
    column
    for column in category_columns
    if df_raw[column].nunique(dropna=False) > 100
]
categorical_columns = [
    column
    for column in category_columns
    if column not in high_dimensional_columns
]

In [42]:
# One-hot encoding step for the low-cardinality categorical columns. Only the
# configuration is bound here; the data is supplied when the pipeline is run.
# Per the fklearn documentation, hardcode_nans=True adds an extra indicator
# column for missing/unseen values and drop_first_column=True drops the first
# generated column of each feature (k-1 columns for k categories).
onehot_encoder_fn = onehot_categorizer(columns_to_categorize=categorical_columns,
                                       hardcode_nans=True,
                                       drop_first_column=True)

### Count encoder for high dimensional features

In [43]:
# Count-encoding step for the categorical columns with more than 100 distinct
# values, which are kept out of the one-hot encoder above.
count_encoder_fn = count_categorizer(columns_to_categorize=high_dimensional_columns)

### Simple Continuous Imputer

In [44]:
# All numeric (float64 and int64) columns of the preprocessed frame.
continuous_features = columns_dtype["float64"] + columns_dtype["int64"]

# Impute every numeric column except the target. "sale_price" is float64 after
# the log transform, so leaving it in would make the fitted imputer require the
# label column at prediction time, and the label should never be filled in.
columns_to_impute = [column for column in continuous_features if column != "sale_price"]

# The strategy is spelled out so the step matches its name ("median" is also
# fklearn's default).
median_imputer_fn = imputer(columns_to_impute=columns_to_impute,
                            impute_strategy="median")

## Create Learners

### Feature Selection

In [45]:
# Columns that must not be fed to the learners: the prediction target and
# sales_id (presumably just a row identifier -- confirm).
banned_columns = ["sale_price", "sales_id"]

# Every remaining column of the preprocessed frame is a candidate feature.
features = [column for column in df_raw.columns if column not in banned_columns]

### XGB Trainer

In [46]:
# XGBoost regression step predicting "sale_price" from `features`. Only the
# configuration is bound here; the training frame is supplied when the pipeline
# is run.
# encode_extra_cols=True: per the fklearn documentation, columns generated by
# the categorizers are also treated as features.
xgb_learner_fn = xgb_regression_learner(features=features,
                                        target="sale_price",
                                        encode_extra_cols=True)

### Random Forest Trainer

In [47]:
# Random-forest regression step predicting "sale_price" from `features`,
# configured like the XGB learner above.
# extra_params is forwarded to the learner; n_jobs=-1 presumably lets the
# underlying forest use all CPU cores -- confirm against the learner.
# NOTE(review): no random seed is passed here, so unless the learner sets one
# internally the RF metrics may differ slightly between runs.
rf_learner_fn = rf_regression_learner(features=features,
                                      target="sale_price",
                                      encode_extra_cols=True,
                                      extra_params={"n_jobs": -1})

## Training

### Train vs Test

In [48]:
# Hold out 12,000 randomly chosen rows for testing; the fixed random_state makes
# the split reproducible.
# NOTE(review): the rows carry sale-date information, so a shuffled split may be
# optimistic compared with a split by time -- confirm this is intended.
df_train, df_test = train_test_split(
    df_raw,
    test_size=12000,
    random_state=111,
    shuffle=True,
)

print(f"Train Shape: {df_train.shape}, Test Shape: {df_test.shape}")

Train Shape: (389125, 64), Test Shape: (12000, 64)


### Training XGB

In [58]:
# XGB pipeline: encode the categoricals, then fit the booster. No imputation
# step is included in this pipeline.
xgb_steps = (onehot_encoder_fn, count_encoder_fn, xgb_learner_fn)
xgb_learner = build_pipeline(*xgb_steps)

# Training returns (prediction function, transformed training frame, logs);
# the transformed frame is not needed here.
predict_fn, _, logs = xgb_learner(df_train)

# Report the hold-out metrics.
print_metrics(df=df_test, predict_fn=predict_fn)


RMSE: 0.25491872149431766
R2: 0.8653347612819027
    


### Training RF

In [59]:
# RF pipeline: same encoders as the XGB one, plus the imputation step ahead of
# the forest.
rf_steps = (onehot_encoder_fn, count_encoder_fn, median_imputer_fn, rf_learner_fn)
rf_learner = build_pipeline(*rf_steps)

# Training returns (prediction function, transformed training frame, logs);
# the transformed frame is not needed here.
predict_fn, _, logs = rf_learner(df_train)

# Report the hold-out metrics.
print_metrics(df=df_test, predict_fn=predict_fn)




RMSE: 0.215890169346216
R2: 0.903413200410449
    


## Single Tree

In [68]:
# Configure the forest as one shallow tree (a single estimator, depth 3, no
# bootstrap sampling) so that it can be drawn and inspected.
params = dict(
    n_jobs=-1,
    n_estimators=1,
    max_depth=3,
    bootstrap=False,
)

single_tree_rf_learner_fn = rf_regression_learner(
    features=features,
    target="sale_price",
    encode_extra_cols=True,
    extra_params=params,
)

# Same preprocessing steps as the full random-forest pipeline.
single_tree_steps = (
    onehot_encoder_fn,
    count_encoder_fn,
    median_imputer_fn,
    single_tree_rf_learner_fn,
)
rf_learner = build_pipeline(*single_tree_steps)

# Keep the transformed training frame (df_s) and the logs: the tree-drawing cell
# below needs both.
predict_fn, df_s, logs = rf_learner(df_train)
print_metrics(df=df_test, predict_fn=predict_fn)


RMSE: 0.5285679147680293
R2: 0.4210332327787686
    


In [78]:
# Fetch the fitted forest and the feature names it was trained on from the
# pipeline logs of the single-tree run.
rf_log = logs["rf_regression_learner"]
model_object = rf_log["model_object"]
variables = rf_log["features"]

# Draw the only estimator of the forest against the transformed training frame.
# Rendering needs the Graphviz `dot` executable on the PATH; without it the cell
# reports ExecutableNotFound.
single_tree = model_object.estimators_[0]
draw_tree(single_tree, df_s[variables], precision=3)

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x7f37b777aa10>