In [1]:
import os

import pandas as pd
import numpy as np
from fklearn.training.transformation import onehot_categorizer

from helpers import display_all, to_snake_case, add_date_parts, tranform_columns_to_categorical, separate_features_by_dtype



In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Relative path of the directory that holds Train.csv. The trailing slash is
# required: the file name is appended to this value by plain string formatting.
DATA_PATH = "../data/bluebook-for-bulldozers/"

In [33]:
# Load the training set. `saledate` is converted to datetimes while reading,
# and low_memory=False turns off chunked parsing so column dtypes are inferred
# from the whole file rather than piece by piece.
df_raw = pd.read_csv(
    f'{DATA_PATH}Train.csv',
    parse_dates=["saledate"],
    low_memory=False,
)

In [34]:
# Pass every column label through the project helper `to_snake_case`; the
# resulting labels look like `sales_id` and `machine_hours_current_meter`.
df_raw = df_raw.rename(columns=to_snake_case)

In [35]:
# (rows, columns) of the freshly loaded and renamed training frame.
df_raw.shape

(401125, 53)

In [36]:
# Preview the first five rows, transposed so that each column of the wide
# frame is shown as a row; rendered through the project helper `display_all`.
display_all(df_raw.head().T)

Unnamed: 0,0,1,2,3,4
sales_id,1139246,1139248,1139249,1139251,1139253
sale_price,66000,57000,10000,38500,11000
machine_id,999089,117657,434808,1026470,1057373
model_id,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneer_id,3,3,3,3,3
year_made,2004,1996,2001,2001,2007
machine_hours_current_meter,68,4640,2838,3486,722
usage_band,Low,Low,High,High,Medium
saledate,2006-11-16 00:00:00,2004-03-26 00:00:00,2004-02-26 00:00:00,2011-05-19 00:00:00,2009-07-23 00:00:00


In [37]:
# Summary statistics for every column (include="all" covers non-numeric
# columns as well), transposed to one row per column.
display_all(df_raw.describe(include="all").T)

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
sales_id,401125,,,,,,1919710.0,909021.0,1139250.0,1418370.0,1639420.0,2242710.0,6333340.0
sale_price,401125,,,,,,31099.7,23036.9,4750.0,14500.0,24000.0,40000.0,142000.0
machine_id,401125,,,,,,1217900.0,440992.0,0.0,1088700.0,1279490.0,1468070.0,2486330.0
model_id,401125,,,,,,6889.7,6221.78,28.0,3259.0,4604.0,8724.0,37198.0
datasource,401125,,,,,,134.666,8.96224,121.0,132.0,132.0,136.0,172.0
auctioneer_id,380989,,,,,,6.55604,16.9768,0.0,1.0,2.0,4.0,99.0
year_made,401125,,,,,,1899.16,291.797,1000.0,1985.0,1995.0,2000.0,2013.0
machine_hours_current_meter,142765,,,,,,3457.96,27590.3,0.0,0.0,0.0,3025.0,2483300.0
usage_band,69639,3.0,Medium,33985.0,,,,,,,,,
saledate,401125,3919.0,2009-02-16 00:00:00,1932.0,1989-01-17 00:00:00,2011-12-30 00:00:00,,,,,,,


In [38]:
# Replace `sale_price` with its natural logarithm, overwriting the column.
# NOTE: this cell is not idempotent - running it a second time on the same
# frame takes the log of already-logged prices.
df_raw["sale_price"] = df_raw["sale_price"].pipe(np.log)

In [54]:
# Run the project helper `add_date_parts` over the frame.
# NOTE(review): presumably this derives date-part feature columns from
# `saledate` and, with drop=True, removes the original datetime column -
# confirm against helpers.add_date_parts.
df_raw = add_date_parts(df_raw, drop=True)

In [40]:
# Convert columns to pandas categoricals via the project helper. `usage_band`
# is given an explicit category order: High, Medium, Low.
# NOTE(review): which columns get converted is decided inside the helper
# (presumably the string-typed ones) - confirm in helpers.
df_raw = tranform_columns_to_categorical(df_raw, ordered={"usage_band": ["High", "Medium", "Low"]})

In [45]:
# Sanity check: the categories of `usage_band` should come back in the
# explicitly requested order (High, Medium, Low).
df_raw.usage_band.cat.categories

Index(['High', 'Medium', 'Low'], dtype='object')

In [55]:
# Checkpoint the preprocessed frame to disk in Feather format.
os.makedirs('tmp', exist_ok=True)  # exist_ok: no error if 'tmp' is already there
df_raw.to_feather('tmp/bulldozers-raw')

In [4]:
# Load the preprocessed frame back from the Feather checkpoint. The file
# 'tmp/bulldozers-raw' must already exist, i.e. the cell that calls
# `to_feather` has to have been run at least once.
df_raw = pd.read_feather('tmp/bulldozers-raw')

In [58]:
# List the dtype of every column to confirm which ones are now `category`.
display_all(df_raw.dtypes)

sales_id                          int64
sale_price                      float64
machine_id                        int64
model_id                          int64
datasource                        int64
auctioneer_id                   float64
year_made                         int64
machine_hours_current_meter     float64
usage_band                     category
fi_model_desc                  category
fi_base_model                  category
fi_secondary_desc              category
fi_model_series                category
fi_model_descriptor            category
product_size                   category
fi_product_class_desc          category
state                          category
product_group                  category
product_group_desc             category
drive_system                   category
enclosure                      category
forks                          category
pad_type                       category
ride_control                   category
stick                          category


In [12]:
# Group the frame's columns by dtype using the project helper. The result is
# looked up by dtype name later (the "category" entry is used).
# NOTE(review): presumably a mapping of dtype name -> list of column names;
# confirm in helpers.separate_features_by_dtype.
columns_dtype = separate_features_by_dtype(df_raw)

In [13]:
# Configure fklearn's one-hot encoder for the categorical columns. Only
# `columns_to_categorize` is supplied here - no DataFrame is passed.
# NOTE(review): fklearn learners are curried, so this presumably returns a
# function still waiting for the training frame - confirm against fklearn docs.
onehot_fn = onehot_categorizer(columns_to_categorize=columns_dtype["category"])