In [2]:
%load_ext autoreload
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

import feather

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
df_raw = pd.read_csv('./Train.csv', low_memory=False, parse_dates=['saledate'])
add_datepart(df_raw, 'saledate')
train_cats(df_raw)
df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)
df_raw.UsageBand = df_raw.UsageBand.cat.codes
df, y, nas = proc_df(df_raw, 'SalePrice')

In [5]:
def split_vals(a, n): return a[:n].copy(), a[n:].copy()

n_valid = 12000
n_train = len(df) - n_valid
X_train, X_valid = split_vals(df, n_train)
y_train, y_valid = split_vals(y, n_train)
raw_train, raw_valid = split_vals(df_raw, n_train)

In [6]:
x_sub = X_train[['YearMade', 'MachineHoursCurrentMeter']]

#### Defining our Random forest:
- Input : X [training data], y [Value to be predicted], no_of_estimators[trees], min_no_leaf[Number of values in leaf node], max_features [Fraction of features to be considered at each split], oob_score [Out-of-bag error], bootstrap [On/Off]
- Properties: r^2 score w/wo OOB score, predict score

In [13]:
class RForest():
    def __init__(self, X, y, no_of_estimators, sample_sz, min_no_leaf=1):
        self.X = X
        self.y = y
        self.min_no_leaf = min_no_leaf
        self.sample_sz = sample_sz
        self.estimators = [ self.create_estimator() for i in range(no_of_estimators) ]
        
    def create_estimator(self):
        indexes = np.random.permutation(len(self.y))[:self.sample_sz]
        return Estimator(self.X.iloc[indexes], self.y[indexes],
                         indexes = np.array(range(self.sample_sz)), min_no_leaf = self.min_no_leaf)
    
    def predict(self, X):
        preds = np.mean([e.predict(X) for e in self.estimators], axis = 0)
        return preds
 
def std_agg(count, s1, s2) : return math.sqrt((s2/count) - (s1/count)**2)

In [50]:
class Estimator():
    def __init__(self, X, y, indexes, min_no_leaf=5):
        self.X = X
        self.y = y
        self.min_no_leaf = min_no_leaf
        self.indexes = indexes
        self.no_rows = len(indexes)
        self.no_cols = X.shape[1]
        self.val = np.mean(y[indexes])
        self.score = float('inf')
        self.find_split()
        
    def find_split(self):
        for i in range(self.no_cols) : self.find_better_split(i)
        if self.score == float('inf'): return
        X = self.split_col
        lhs = np.nonzero(X<=self.split)[0]
        rhs = np.nonzero(X>self.split)[0]
        self.lhs = Estimator(self.X, self.y, self.indexes[lhs])
        self.rhs = Estimator(self.X, self.y, self.indexes[rhs])
        
    def find_better_split(self, var_idx):
        X, y = self.X.values[self.indexes, var_idx], self.y[self.indexes]
        sort_idx = np.argsort(X)
        sort_y, sort_x = y[sort_idx], X[sort_idx]
        lhs_cnt, lhs_sum, lhs_sum2 = 0,0.0,0.0
        rhs_cnt, rhs_sum, rhs_sum2 = self.no_rows, sort_y.sum(), (sort_y**2).sum()
        
        for i in range(0, self.no_rows-self.min_no_leaf):
            xi, yi = sort_x[i], sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            if i<self.min_no_leaf-1 or xi==sort_x[i+1]:
                continue
                
            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score: 
                self.var_idx,self.score,self.split = var_idx,curr_score,xi
        
    @property
    def split_name(self): return self.X.columns[self.var_idx]
    
    @property
    def split_col(self): return self.X.values[self.indexes,self.var_idx]

    @property
    def is_leaf(self): return self.score == float('inf')
    
    def __repr__(self):
        s = f'n: {self.n}; val:{self.val}'
        if not self.is_leaf:
            s += f'; score:{self.score}; split:{self.split}; var:{self.split_name}'
        return s

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)

In [26]:
cols = ['MachineID', 'YearMade', 'MachineHoursCurrentMeter', 'ProductSize', 'Enclosure',
        'Coupler_System', 'saleYear']

In [55]:
ens = RForest(X_train[cols], y_train, 5, 1000)

In [56]:
preds = ens.predict(X_valid[cols].values)

In [57]:
metrics.r2_score(y_valid, preds)

0.5725551075540652