In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [3]:
# Directory holding the competition files. The trailing slash matters: file names
# are built by plain string concatenation, e.g. f'{PATH}Train.csv'.
PATH = 'data/'
# List the directory to check which files are available.
!ls {PATH}

Machine_Appendix.csv		  TrainAndValid.7z   Valid.7z
median_benchmark.csv		  TrainAndValid.csv  Valid.csv
random_forest_benchmark_test.csv  TrainAndValid.zip  ValidSolution.csv
Test.csv			  Train.csv	     Valid.zip
Train.7z			  Train.zip


In [4]:
# Load the training set. parse_dates converts 'saledate' to datetime values;
# low_memory=False makes pandas read the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
df_raw = pd.read_csv(f'{PATH}Train.csv',low_memory=False, parse_dates=['saledate'])

In [5]:
def display_all(df):
    """Display `df` with pandas' row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; the previous
    option values are restored when the context manager exits.
    """
    limits = ('display.max_rows', 1000, 'display.max_columns', 1000)
    with pd.option_context(*limits):
        display(df)

In [6]:
# Show the last rows transposed, so every column becomes a row and none is cut off.
display_all(df_raw.tail().transpose())

Unnamed: 0,401120,401121,401122,401123,401124
SalesID,6333336,6333337,6333338,6333341,6333342
SalePrice,10500,11000,11500,9000,7750
MachineID,1840702,1830472,1887659,1903570,1926965
ModelID,21439,21439,21439,21435,21435
datasource,149,149,149,149,149
auctioneerID,1,1,1,2,2
YearMade,2005,2005,2005,2005,2005
MachineHoursCurrentMeter,,,,,
UsageBand,,,,,
saledate,2011-11-02 00:00:00,2011-11-02 00:00:00,2011-11-02 00:00:00,2011-10-25 00:00:00,2011-10-25 00:00:00


In [14]:
# NOTE: SalePrice is overwritten in place, so re-running this cell takes the log a second time.
df_raw.SalePrice = np.log(df_raw.SalePrice) # convert sale price to log scale because the competition's evaluation metric is RMSLE

In [18]:
# Keep a reference to the sale-date column.
# NOTE(review): fld is not used in any of the cells visible here — confirm it is still needed.
fld=df_raw.saledate

In [19]:
# add_datepart (fastai.structured) modifies df_raw in place (its return value is
# not used): it adds date-derived columns such as saleYear, saleMonth and saleWeek.
add_datepart(df_raw,'saledate')
# Spot-check one of the new columns.
df_raw.saleYear.head()

0    2006
1    2004
2    2004
3    2011
4    2009
Name: saleYear, dtype: int64

In [20]:
# List the columns to confirm the date parts were added.
df_raw.columns

Index(['SalesID', 'SalePrice', 'MachineID', 'ModelID', 'datasource',
       'auctioneerID', 'YearMade', 'MachineHoursCurrentMeter', 'UsageBand',
       'fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries',
       'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc', 'state',
       'ProductGroup', 'ProductGroupDesc', 'Drive_System', 'Enclosure',
       'Forks', 'Pad_Type', 'Ride_Control', 'Stick', 'Transmission',
       'Turbocharged', 'Blade_Extension', 'Blade_Width', 'Enclosure_Type',
       'Engine_Horsepower', 'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier',
       'Tip_Control', 'Tire_Size', 'Coupler', 'Coupler_System',
       'Grouser_Tracks', 'Hydraulics_Flow', 'Track_Type',
       'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb', 'Pattern_Changer',
       'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls',
       'Differential_Type', 'Steering_Controls', 'saleYear', 'saleMonth',
       'saleWeek', 'saleDay', 'saleDayofweek', 'saleDayofyear',


In [21]:
# train_cats (fastai.structured) is applied to df_raw in place. The next cell reads
# UsageBand through the .cat accessor, so afterwards that column is a pandas
# categorical — presumably the same is done for every string column (TODO confirm).
train_cats(df_raw)

In [23]:
# Inspect the category labels and the order they are currently stored in.
df_raw.UsageBand.cat.categories

Index(['High', 'Low', 'Medium'], dtype='object')

In [25]:
# Store the UsageBand categories in the order High, Medium, Low (instead of the
# alphabetical High, Low, Medium) and mark the categorical as ordered.
# The result is assigned back rather than passing inplace=True: that argument was
# deprecated in pandas 1.3 and removed in pandas 2.0, while this form works on
# both old and new pandas versions.
df_raw['UsageBand'] = df_raw.UsageBand.cat.set_categories(['High','Medium','Low'], ordered=True)

In [26]:
# Fraction of missing values per column, listed alphabetically by column name.
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

Backhoe_Mounting            0.803872
Blade_Extension             0.937129
Blade_Type                  0.800977
Blade_Width                 0.937129
Coupler                     0.466620
Coupler_System              0.891660
Differential_Type           0.826959
Drive_System                0.739829
Enclosure                   0.000810
Enclosure_Type              0.937129
Engine_Horsepower           0.937129
Forks                       0.521154
Grouser_Tracks              0.891899
Grouser_Type                0.752813
Hydraulics                  0.200823
Hydraulics_Flow             0.891899
MachineHoursCurrentMeter    0.644089
MachineID                   0.000000
ModelID                     0.000000
Pad_Type                    0.802720
Pattern_Changer             0.752651
ProductGroup                0.000000
ProductGroupDesc            0.000000
ProductSize                 0.525460
Pushblock                   0.937129
Ride_Control                0.629527
Ripper                      0.740388
S

In [27]:
# Create the cache directory if needed; exist_ok=True makes this safe to re-run.
os.makedirs('tmp', exist_ok=True)
# Save the preprocessed frame in feather format so later sessions can reload it
# instead of repeating the steps above.
df_raw.to_feather('tmp/raw') #save data frame

In [7]:
# Reload the frame saved to 'tmp/raw'; requires that the save cell has been run at least once.
df_raw = pd.read_feather('tmp/raw') #start here

In [8]:
# proc_df (fastai.structured) returns the feature frame `df`, the target `y` taken
# from the 'SalePrice' column, and `nas`, which a later cell passes back in as na_dict.
# NOTE(review): presumably `nas` records how missing values were filled — confirm in fastai.
df, y, nas = proc_df(df_raw, 'SalePrice')

In [35]:
# Evaluating the bare name only displays the function object, i.e. where it is defined.
proc_df

<function fastai.structured.proc_df>

In [9]:
# Baseline: a forest with default settings fitted on all rows.
m = RandomForestRegressor(n_jobs=-1)  # n_jobs=-1 uses all available CPU cores
m.fit(df, y)
# Scored on the same rows it was trained on, so this value says nothing about
# how well the model generalizes.
m.score(df,y)

0.98300133375762211

In [12]:
def split_vals(a,n):
    """Split `a` at position `n` and return the two parts as a tuple.

    Both parts (`a[:n]` and `a[n:]`) are copied, so they can be modified
    without affecting `a`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

n_valid = 12000 #same as Kaggle's test set size
n_trn = len(df) - n_valid  # every row before the last n_valid rows is training data
# Split by position rather than at random: the last n_valid rows form the validation set.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)


# Sanity check: feature and target row counts must match within each split.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((389125, 66), (389125,), (12000, 66), (12000,))

# Random Forest

In [15]:
def rmse(x,y):
    """Return the root-mean-squared error between `x` and `y` as a Python float.

    `x - y` must support `**` and `.mean()` (e.g. NumPy arrays or pandas Series).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of fit metrics for model `m` on the notebook's current split.

    The list holds, in order: RMSE on the training set, RMSE on the validation
    set, m.score on the training set, m.score on the validation set and, only
    when the model has an `oob_score_` attribute, that value as a fifth entry.

    Reads the globals X_train, y_train, X_valid and y_valid at call time, so it
    always reports on whichever split was assigned most recently.
    """
    splits = [(X_train, y_train), (X_valid, y_valid)]
    res = [rmse(m.predict(features), target) for features, target in splits]
    res += [m.score(features, target) for features, target in splits]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [16]:
# Same default forest as before, but fitted on the training rows only so the
# held-out validation rows give an honest score.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train) # %time shows the time it takes to complete a task
# Output order: [train RMSE, valid RMSE, train score, valid score]
print_score(m)

CPU times: user 1min 29s, sys: 256 ms, total: 1min 29s
Wall time: 17.1 s
[0.09035634391679032, 0.2514534997355064, 0.98293712828376512, 0.88708195691921432]


In [19]:
# A training score of 0.98 against a validation score of 0.887 shows that the model is overfitting

# Speeding things up

In [28]:
# Use only a subset of the training data to speed up fitting. X_valid / y_valid are
# not reassigned here, so the validation set stays the same as before and the
# scores remain comparable.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset = 30000, na_dict = nas)
# Of the 30000 processed rows, only the first 20000 are kept for training.
X_train, _ = split_vals(df_trn, 20000) # _ indicates throw-away variable
y_train, _ = split_vals(y_trn, 20000)

In [31]:
# Refit the default forest on the 20000-row training subset and compare the
# fitting time and scores with the full-data run.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 3.26 s, sys: 0 ns, total: 3.26 s
Wall time: 803 ms
[0.11384967096739136, 0.376533675193562, 0.97146222710635455, 0.74680471116689162]


# Bagging

In [32]:
#Out-of-bag (OOB) score: lets us see whether the model generalizes even when we only have a small amount of data,
#because it avoids having to set some rows aside to create a validation set
#Use subsampling to reduce overfitting while also increasing speed

In [34]:
# Back to the full data set: run proc_df without `subset` and redo the
# positional train/validation split (this replaces the 20000-row X_train / y_train).
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)

In [36]:
# NOTE(review): set_rf_samples comes from the fastai imports; it presumably stays in
# effect for every forest fitted afterwards until reset_rf_samples() is called — confirm.
set_rf_samples(20000) #Instead of limiting the total amount of data accessible, use a different random subset of 20000 rows per tree

In [37]:
# oob_score=True makes the fitted model expose `oob_score_`, which print_score
# appends as a fifth value after the four train/validation numbers.
m = RandomForestRegressor(n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 9.29 s, sys: 360 ms, total: 9.65 s
Wall time: 4.22 s
[0.24027178794363396, 0.27513631724814736, 0.87934637176062702, 0.86481023480986985, 0.86674835040836429]


In [40]:
# Use more trees (estimators). The library default depends on the scikit-learn
# version: 10 in older releases, 100 from scikit-learn 0.22 onwards.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.22673819946105653, 0.26315775305467587, 0.89255549406033574, 0.87632545732173273, 0.88105012784383518]


# Tree building parameters

In [42]:
# NOTE(review): reset_rf_samples presumably undoes the earlier set_rf_samples(20000)
# call, so the forests from here on see the full training set again — confirm in fastai.
reset_rf_samples()
# Grow trees less deep: min_samples_leaf=3 only allows splits that leave at least
# 3 samples in each resulting leaf.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.114960874512264, 0.23146148568192043, 0.97237928327880163, 0.90432346319917967, 0.90865912960234241]


In [46]:
# Use a different set of features (columns) for each split in a tree:
# max_features=0.5 considers a random 50% of the features at every split.
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, n_jobs=-1, max_features=0.5, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.11749354539031517, 0.22667831183062767, 0.97114886891890762, 0.90823693455806365, 0.91534506328514453]


In [48]:
# Try max_features='sqrt': only sqrt(n_features) candidate features per split.
# In the recorded run this scored lower on the validation set than
# max_features=0.5 (0.880 vs 0.908).
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, n_jobs=-1, max_features='sqrt', oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.15592055738556318, 0.25919467354605014, 0.94919089050193062, 0.88002241412338833, 0.89966038239495949]
