#### An attempt to solve the bulldozers dataset in Kaggle using Random Forests

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#Importing requisite libraries
import pandas as pd,numpy as np,re,datetime,os,math,csv
from IPython.display import display
from sklearn.ensemble import RandomForestRegressor 
from sklearn import metrics
from concurrent.futures import ProcessPoolExecutor
import matplotlib.pyplot as plt
% matplotlib inline

from utils import *

##### PARAMETERS EXPLANATION
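* `low_memory=False` makes pandas read the file in one pass before inferring each column's dtype, which avoids the mixed-type guesses that chunked reading can produce (at the cost of more memory while parsing).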

* `parse_dates=['saledate']` parses the `saledate` strings into datetime values, e.g. `11/16/2006 0:00` becomes `2006-11-16`.

In [None]:
# Load the training data, parsing 'saledate' into datetimes while reading
df_raw = pd.read_csv(
    'data/train.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

In [None]:
# (rows, columns) of the raw frame, then summary statistics with one row per column
print(df_raw.shape)
display(df_raw.describe(include='all').T)

In [None]:
# Evaluation metric is RMSLE (root mean squared log error), so the dependent
# variable is replaced by its natural log here.
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

In [None]:
# Expand the 'saledate' column using the add_datepart helper from utils.
# NOTE(review): presumably this creates the sale* columns (saleDay, saleWeek, ...) referenced later — confirm in utils.
add_datepart(df_raw,'saledate')

In [None]:
# First five UsageBand values before any conversion
print(df_raw['UsageBand'].head(5))

In [None]:
# Run the utils conversion helper on the frame, then look at UsageBand again.
# NOTE(review): presumably conv_to_cats turns string columns into pandas categoricals — confirm in utils.
conv_to_cats(df_raw)
print(df_raw['UsageBand'].head(5))

In [None]:
# Show the current UsageBand categories (their order is set explicitly afterwards)
display(df_raw['UsageBand'].cat.categories)


In [None]:
# Give UsageBand an explicit category order, then replace it with its integer codes.
# The result is assigned back because set_categories(..., inplace=True) was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df_raw['UsageBand'] = df_raw['UsageBand'].cat.set_categories(
    ['High', 'Medium', 'Low'], ordered=True)
# Codes follow the listed order: High=0, Medium=1, Low=2 (missing values become -1)
df_raw['UsageBand'] = df_raw['UsageBand'].cat.codes

In [None]:
# First five UsageBand values after replacing the categories with codes
print(df_raw['UsageBand'].head(5))

In [None]:
# Fraction (0-1) of missing values in every column, listed by column name
null_counts = df_raw.isnull().sum().sort_index()
display(null_counts / len(df_raw))

In [None]:
# Save all the preprocessing work to a feather file so it can be reloaded later
# without repeating the steps above; the 'tmp' directory is created if missing.
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

#### START FROM HERE - PREPROCESSED

In [None]:
# Reload the preprocessed frame written to 'tmp/bulldozers-raw' by the save cell
Working_df = pd.read_feather('tmp/bulldozers-raw')

In [None]:
# Partition the columns by dtype: numeric columns versus everything else
quantitative = [col for col in Working_df.columns if is_numeric_dtype(Working_df[col])]
qualitative = Working_df.columns.difference(quantitative)
# The ID column and the dependent variable are numeric but must not be modified,
# so they are taken out of the list that is processed next
for protected in ('SalesID', 'SalePrice'):
    quantitative.remove(protected)


##### REPLACE MISSING VALUES OF NUMERIC DTYPE

In [None]:
# Apply the utils fix_missing helper to every numeric feature column
for num_col in quantitative:
    fix_missing(Working_df, num_col)

##### REPLACE CATEGORICAL VALUES WITH THEIR CODES

In [None]:
for cat_col in qualitative:
    # NOTE(review): numericalize (utils) presumably adds a '<column>_num' codes column — confirm
    numericalize(Working_df, cat_col)
    # the original non-numeric column is dropped once it has been encoded
    Working_df.drop(cat_col, axis=1, inplace=True)

##### GETTING THE Y VALUE

In [None]:
# Dependent variable for the models trained below
y = Working_df['SalePrice']

In [None]:
# Baseline random forest on the full dataset.
# The target is removed from the features first: Working_df still contains
# SalePrice, and fitting on it leaks the answer and inflates the score.
features = Working_df.drop('SalePrice', axis=1)
m = RandomForestRegressor(n_jobs=-1)
m.fit(features, y)
# Score on the same rows the model was trained on (not a validation score)
m.score(features, y)


#### Getting a sample of the total data for a better runtime

In [None]:
def get_Sample(df, n):
    """Return ``n`` randomly chosen rows of ``df`` (delegates to ``df.sample``)."""
    return df.sample(n)
# Work on a 30,000-row random sample; keep its target aside and remove it from the features
Working_df1 = get_Sample(Working_df, 30000)
y1 = Working_df1.SalePrice
Working_df1.drop(['SalePrice'], axis=1, inplace=True)


In [None]:
#extracting subset for processing
def split_vals(a, n):
    """Split ``a`` at position ``n`` and return independent copies of both parts.

    Returns:
        (first ``n`` items, remaining items)
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [None]:
# Hold out the last n_valid rows of the sampled data as the validation set
n_valid = 12000  # same as Kaggle's test set size
n_trn = len(Working_df1) - n_valid

X_train, X_valid = split_vals(Working_df1, n_trn)
# raw_train / raw_valid are separate copies of the same two pieces
raw_train, raw_valid = X_train.copy(), X_valid.copy()
y_train, y_valid = split_vals(y1, n_trn)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

#### RMSE

In [None]:
def rmse(x, y):
    """Return the root mean squared difference between ``x`` and ``y``.

    The difference must support ``** 2`` and ``.mean()`` (e.g. numpy arrays
    or pandas Series).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())


In [None]:
def print_score(m):
    """Print a list of scores for the fitted model ``m``.

    The list holds, in order: RMSE on the training set, RMSE on the validation
    set, ``m.score`` on the training set, ``m.score`` on the validation set,
    and ``m.oob_score_`` when the model has that attribute. Reads the
    notebook-level X_train, y_train, X_valid and y_valid.
    """
    scores = [
        rmse(m.predict(X_train), y_train),
        rmse(m.predict(X_valid), y_valid),
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [None]:
# Fit a random forest with default settings on the training split and print
# its scores; the %time magic reports how long the fit takes.

m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train,y_train)
print_score(m)


### Trying Subsampling

In [None]:
# Split the full dataset, holding out the last `validation` rows.
# SalePrice is dropped from the features here: Working_df still contains the
# target, and leaving it in would leak the answer into every model fitted on
# X_train below.
validation = 12000
n_trn = len(Working_df) - validation
X_train, X_valid = split_vals(Working_df.drop('SalePrice', axis=1), n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [None]:
# Call the utils helper with a sample size of 20000.
# NOTE(review): presumably each tree is then trained on a random subset of 20000 rows — confirm in utils.
set_rf_samples(20000)
# reset_rf_samples()

In [None]:
# Base model with training and validation set: a 40-tree forest with
# max_features=0.5; %time reports the fit duration, then the scores are printed.
m = RandomForestRegressor(n_jobs=-1,n_estimators=40,max_features=0.5)
%time m.fit(X_train,y_train)
print_score(m)

In [None]:
# Per-tree predictions on the validation set: one row per tree in the forest,
# one column per validation record.
b = np.stack([tree.predict(X_valid) for tree in m.estimators_])
# The forest's output for a record is the average of its trees' predictions.
# For the last validation record, show every tree's prediction, their mean and
# the actual value. Index -1 is used instead of a hard-coded 11999 so the cell
# keeps working if the validation size changes.
b[:, -1], np.mean(b[:, -1]), list(y_valid)[-1]


In [None]:
# Validation R^2 as trees are added one at a time.
# The range covers every tree in b rather than a hard-coded 10, which showed
# only part of the curve for a forest with more than 10 trees.
plt.plot([metrics.r2_score(y_valid, np.mean(b[:i+1], axis=0)) for i in range(len(b))]);

#### Using multiple CPU cores to speed up the per-tree predictions (ONLY ON UBUNTU)

In [None]:
# Serial version: predict with each tree in turn, timed with %time.
# The second line shows the mean and standard deviation of the trees'
# predictions for the first validation record.
%time preds = np.stack([t.predict(X_valid) for t in m.estimators_])
np.mean(preds[:,0]), np.std(preds[:,0])

In [None]:
def get_preds(t):
    """Return the predictions of estimator ``t`` on the notebook-level ``X_valid``."""
    return t.predict(X_valid)
def parallel_trees(m, fn, n_jobs=2):
    """Apply ``fn`` to every estimator of ``m`` using a pool of worker processes.

    Args:
        m: object exposing an ``estimators_`` iterable.
        fn: callable taking one estimator; it is sent to the worker processes.
        n_jobs: number of worker processes in the pool.

    Returns:
        List of ``fn`` results, in the same order as ``m.estimators_``.
    """
    # The context manager shuts the pool down once the results are collected;
    # without it each call leaves an executor and its workers open.
    with ProcessPoolExecutor(n_jobs) as pool:
        return list(pool.map(fn, m.estimators_))
# Parallel version of the per-tree predictions, timed with %time for comparison.
# The second line shows the mean and standard deviation of the trees'
# predictions for the first validation record.
%time preds = np.stack(parallel_trees(m, get_preds))
np.mean(preds[:,0]), np.std(preds[:,0])

#### EVALUATING THE OUTPUT BY USING STD DEVIATION OF OUTPUT (confidence based on how it deviates)

In [None]:
preds.shape
# Copy the validation features that preds was computed on, so per-record
# prediction statistics can be attached row for row.
# raw_valid must not be used here: it still holds the split of the earlier
# 30,000-row sample, which has the same length but different rows.
x = X_valid.copy()
x.columns




In [None]:
# Spread and mean of the individual trees' predictions for every record
x['pred_std'] = preds.std(axis=0)
x['pred'] = preds.mean(axis=0)
# Number of records per ProductSize_num level
x['ProductSize_num'].value_counts().plot.barh();
# Author's observation: ProductSize has 6 categories, and 1, 2, 5 and 6 have
# very few records, so their standard deviation is higher and the predictions
# for them are less dependable.


In [None]:
# Average prediction and average per-tree spread for each ProductSize_num level
flds = ['ProductSize_num', 'pred', 'pred_std']
summ = x[flds].groupby('ProductSize_num').mean()
summ

In [None]:
#Finding the features which are better and retaining only them.
def rf_feat_importance(m, df):
    """Return the feature importances of ``m`` as a table, most important first.

    Args:
        m: fitted model exposing ``feature_importances_``.
        df: frame whose columns correspond, in order, to those importances.

    Returns:
        DataFrame with columns 'cols' and 'imp', sorted by 'imp' descending.
    """
    table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return table.sort_values('imp', ascending=False)
# Compute the importances of the current model's features and show the ten
# largest; plotting them helps to pick the importance value above which a
# feature is worth keeping.
importances = rf_feat_importance(m,X_train)
importances.head(10)

In [None]:
# Importance of each feature, in the table's (descending) order
importances.plot(x='cols', y='imp', figsize=(10, 6), legend=True);

In [None]:
# Keep only the features whose importance exceeds 0.005. importances is
# already sorted by descending importance, so no further sort is needed.
keeping = importances[importances['imp'] > 0.005]
# Never select the target itself as a feature, even if the model behind
# importances was fitted on a frame that still contained it.
keeping = keeping[keeping['cols'] != 'SalePrice']
trunc_df = Working_df[keeping.cols]
trunc_df.shape

In [None]:
# Hold out the last n_valid rows of the reduced feature set for validation
n_valid = 12000  # same as Kaggle's test set size
n_trn = len(trunc_df) - n_valid

X_train, X_valid = split_vals(trunc_df, n_trn)
# raw_train / raw_valid are separate copies of the same two pieces
raw_train, raw_valid = X_train.copy(), X_valid.copy()
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
# 40-tree model on the truncated feature set, with max_features=0.5
m = RandomForestRegressor(n_estimators=40, max_features=0.5, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# List the feature columns currently in the training set
X_train.columns

In [None]:
# Remove the fine-grained sale-date parts from both splits before refitting.
# errors='ignore' is needed because the surviving columns depend on the
# importance cut-off applied earlier: any of these names that was already
# filtered out would otherwise raise KeyError.
date_parts = ['saleDayofyear', 'saleWeek', 'saleDayofweek', 'saleDay']
X_train.drop(date_parts, axis=1, inplace=True, errors='ignore')
X_valid.drop(date_parts, axis=1, inplace=True, errors='ignore')

In [None]:
# Same hyper-parameters as the previous model, fitted on the current X_train
n = RandomForestRegressor(n_estimators=40, max_features=0.5, n_jobs=-1)
n.fit(X_train, y_train)
print_score(n)

In [None]:
# (rows, columns) of the validation features
X_valid.shape

In [None]:
# Shape of the validation target
y_valid.shape

In [None]:
# IPython introspection query on rmse (not plain Python syntax)
?? rmse