#### An attempt to solve the bulldozers dataset in Kaggle using Random Forests

In [119]:
#Importing requisite libraries
import pandas as pd,numpy as np,re,datetime,os,math
from IPython.display import display
from pandas.api.types import  is_string_dtype,is_numeric_dtype
from sklearn.ensemble import RandomForestRegressor 

##### PARAMETERS EXPLANATION
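* `low_memory=False` makes pandas read the whole file in one pass, so each column's datatype is guessed consistently instead of chunk by chunk (this uses more memory, not less)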

* `parse_dates` converts the listed string columns to datetimes, e.g. `11/16/2006 0:00` becomes `2006-11-16`

In [None]:
# Reading the training data from data/train.csv.
# low_memory=False and parse_dates=['saledate'] are forwarded to pd.read_csv.
df_raw = pd.read_csv('data/train.csv',low_memory=False ,parse_dates=['saledate'])

In [None]:
#Checking the size of the dataset: (number of rows, number of columns)
print (df_raw.shape)
# Summary statistics for every column (include='all'), transposed so each column is shown as one row
display(df_raw.describe(include='all').transpose())

In [None]:
#EVALUATION METRIC - RMSLE - root mean squared Log error
#Taking the log of the dependent variable so that later errors are measured on the log scale
#NOTE: SalePrice is overwritten in place, so re-running this cell applies the log a second time
df_raw.SalePrice  = np.log(df_raw.SalePrice)                

In [None]:
#Converting the date field to various fields <saledate >
def add_datepart(df, fldname, drop=True):
    """Expand the date column ``fldname`` of ``df`` into date-part columns, in place.

    The new columns are named ``<prefix><Part>``, where ``<prefix>`` is
    ``fldname`` with a trailing ``date``/``Date`` removed (``'saledate'`` ->
    ``'sale'``). The parts are Year, Month, Week (ISO week number), Day,
    Dayofweek, Dayofyear, the six ``Is_*`` boolean flags, and Elapsed (whole
    seconds since the Unix epoch).

    Args:
        df: DataFrame to modify in place.
        fldname: Name of the column holding the dates. If it is not already a
            datetime column it is parsed with ``pd.to_datetime`` and the
            parsed values are written back to ``df[fldname]``.
        drop: When true (the default), remove ``fldname`` from ``df`` once the
            derived columns have been added.

    Returns:
        None. ``df`` is mutated.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also accepts timezone-aware datetime columns,
    # which the previous np.issubdtype(..., np.datetime64) check did not.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # infer_datetime_format is deprecated in pandas 2.x, so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    accessor = fld.dt
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end',
              'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        if n == 'Week' and hasattr(accessor, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. It returns a nullable UInt32 column, so cast
            # back to a plain NumPy dtype (float when missing dates are present).
            week = accessor.isocalendar().week
            values = week.astype('float64' if week.isna().any() else 'int64')
        else:
            values = getattr(accessor, n.lower())
        df[targ_pre + n] = values
    # Convert through datetime64[s] so the result is in seconds whatever
    # resolution the column is stored in; dividing the raw integer view by
    # 10**9 is only correct for nanosecond storage.
    df[targ_pre + 'Elapsed'] = fld.values.astype('datetime64[s]').astype(np.int64)
    if drop:
        df.drop(fldname, axis=1, inplace=True)


In [None]:
# Replace 'saledate' with its derived date-part columns (df_raw is modified in place)
add_datepart(df_raw,'saledate')

In [None]:
#Converting categorical variables to numbers
def conv_to_cats(df):
    """Convert every text column of ``df`` to an ordered categorical, in place.

    A column is converted when it has object dtype or is otherwise reported
    as a string dtype. The explicit object-dtype check matters because newer
    pandas versions only report an object column as "string" when every
    element is a string, which would skip text columns containing missing
    values.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None. ``df`` is mutated.
    """
    for n, c in df.items():
        if pd.api.types.is_object_dtype(c) or is_string_dtype(c):
            df[n] = c.astype('category').cat.as_ordered()

In [None]:
# Convert the string columns of df_raw to ordered categoricals (df_raw is modified in place)
conv_to_cats(df_raw)

In [None]:
#setting the order for the categories
display(df_raw.UsageBand.cat.categories)
# set_categories returns a new categorical; assign it back instead of using
# inplace=True, which was deprecated and then removed in pandas 2.0.
df_raw.UsageBand = df_raw.UsageBand.cat.set_categories(['High','Medium','Low'],ordered = True)
# Replace the categorical by its integer codes, which follow the order set above
df_raw.UsageBand = df_raw.UsageBand.cat.codes

In [None]:
#Fraction (0 to 1, not a percentage) of null values in each column, listed in column-name order
display(df_raw.isnull().sum().sort_index()/len(df_raw))

In [None]:
#Saving all the preprocessing work in a feather file for later access

# Create the tmp directory if it does not exist yet
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

#### START FROM HERE - PREPROCESSED

In [104]:
# Reload the preprocessed frame from the feather file 'tmp/bulldozers-raw'
Working_df = pd.read_feather('tmp/bulldozers-raw')

In [105]:
#Splitting into numeric and non numeric data types
quantitative = [f for f in Working_df.columns if is_numeric_dtype(Working_df[f])]
# qualitative is computed before the removals below, so SalesID and SalePrice end up in neither list
qualitative = Working_df.columns.difference(quantitative)
#Do not fill missing values for the Id column and the dependent variable
quantitative.remove('SalesID')
quantitative.remove('SalePrice')


##### REPLACE MISSING VALUES OF NUMERIC DTYPE

In [106]:
#Replace missing values
def fix_missing(df,col):
    """Median-fill the numeric column ``col`` of ``df`` in place.

    Non-numeric columns are left untouched. For a numeric column, a boolean
    ``<col>_na`` column flagging the rows that were null is always added,
    and the nulls in ``col`` are then replaced by the column median.
    """
    column = df[col]
    if not is_numeric_dtype(column):
        return
    # Record which rows were missing before they get filled.
    df[col + '_na'] = column.isnull()
    median_value = column.median()
    df[col] = column.fillna(median_value)

In [107]:
# Fill missing values (and add a <col>_na indicator) for every column left in quantitative
for i in quantitative:
    fix_missing(Working_df,i)

##### REPLACE CATEGORICAL VALUES WITH THEIR CODES

In [108]:
#Replace Categorical variable with their codes
def numericalize(df,col):
    """Add ``<col>_num`` holding the category codes of ``col`` shifted up by one.

    Numeric columns are skipped. The source column is expected to be
    categorical, since its ``.cat`` accessor is used; it is left in place.
    """
    column = df[col]
    if is_numeric_dtype(column):
        return
    category_codes = column.cat.codes
    df[col + '_num'] = category_codes + 1

In [109]:
# For each non-numeric column: add its <col>_num integer codes, then remove the original column
for i in qualitative:
    numericalize(Working_df,i)
    #dropping the categorical column after replacing
    Working_df.drop(i,axis =1,inplace=True)

##### GETTING THE Y VALUE

In [110]:
# Pull the dependent variable out into y, then remove it from the feature frame
y = Working_df['SalePrice']
Working_df.drop('SalePrice',axis=1,inplace=True)


In [114]:
#calling the Random forest regressor
# Only n_jobs=-1 is passed; every other hyper-parameter is left at its default
m = RandomForestRegressor(n_jobs=-1)
m.fit(Working_df, y)
# NOTE: scored on the same rows the model was fitted on, so this is not a held-out estimate
m.score(Working_df, y)


0.982999172622537

In [117]:
#extracting subset for processing
def split_vals(a,n):
    """Split ``a`` at position ``n`` and return copies of both pieces.

    Returns a tuple ``(a[:n], a[n:])`` where each piece has been copied with
    its own ``.copy()`` method.
    """
    leading = a[:n].copy()
    trailing = a[n:].copy()
    return leading, trailing

n_valid = 12000  # same as Kaggle's test set size
n_trn = len(Working_df)-n_valid
# The first n_trn rows form the training set; the last n_valid rows form the validation set
# NOTE(review): raw_train/raw_valid and X_train/X_valid are built from the same frame and split point - confirm both copies are needed
raw_train, raw_valid = split_vals(Working_df, n_trn)
X_train, X_valid = split_vals(Working_df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Shapes of the resulting pieces (displayed as the cell output)
X_train.shape, y_train.shape, X_valid.shape

((389125, 84), (389125,), (12000, 84))

#### RMSE

In [124]:
def rmse(x,y):
    """Return the root mean squared difference between ``x`` and ``y``.

    ``x - y`` must support ``** 2`` and ``.mean()`` (e.g. NumPy arrays or
    pandas Series); the result is a Python float from ``math.sqrt``.
    """
    residuals = x - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)


In [None]:
def print_score(m):
    """Print a list of train/validation metrics for the fitted model ``m``.

    The list holds, in order: RMSE on the training split, RMSE on the
    validation split, ``m.score`` on the training split, ``m.score`` on the
    validation split and, when the model exposes it, ``m.oob_score_``.
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    res = [rmse(m.predict(features), target) for features, target in splits]
    res.extend(m.score(features, target) for features, target in splits)
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [122]:
#Base model with training and validation set
m = RandomForestRegressor(n_jobs=-1)
# Fit on the training split only, so the validation rows stay unseen
m.fit(X_train,y_train)
# Report the train/validation metrics through print_score
print_score(m)

NameError: name 'score' is not defined