#### An attempt to solve the bulldozers dataset in Kaggle using Random Forests

In [26]:
#Importing requisite libraries
import pandas as pd,numpy as np,re,datetime,os,math,csv
from IPython.display import display
from pandas.api.types import  is_string_dtype,is_numeric_dtype
from sklearn.ensemble import RandomForestRegressor 
import matplotlib.pyplot as plt

##### PARAMETERS EXPLANATION
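*`low_memory=False` makes pandas read the whole file in one pass, so each column's datatype is inferred consistently (avoiding mixed-type columns), at the cost of higher memory use while loading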

*`parse_dates` converts the listed string columns to datetime values, e.g. `11/16/2006 0:00` becomes `2006-11-16`

In [None]:
# Load the training data; `saledate` is parsed into datetime values on read
df_raw = pd.read_csv(
    'data/train.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

In [None]:
# Dataset dimensions (rows, columns), then summary statistics with one row per column
print(df_raw.shape)
display(df_raw.describe(include='all').T)

In [None]:
# Evaluation metric is RMSLE (root mean squared log error), so the dependent
# variable is replaced by its natural log before modelling.
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

In [16]:
#Converting the date field to various fields <saledate >
def add_datepart(df, fldname, drop=True):
    """Expand the date column `fldname` of `df` into separate feature columns, in place.

    New columns are named after `fldname` with a trailing 'date'/'Date'
    removed (so 'saledate' yields 'saleYear', 'saleMonth', ...). An additional
    '<prefix>Elapsed' column holds whole seconds since 1970-01-01.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    fldname : name of the column to expand; it is converted with
        `pd.to_datetime` first when it is not already a datetime column.
    drop : when True (the default) the original column is removed afterwards.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also copes with pandas extension dtypes, on which
    # np.issubdtype raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub(r'[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar() is its
            # replacement. It returns a nullable UInt32 column, so cast back to
            # a plain NumPy dtype (float only when missing dates force NaN).
            week = fld.dt.isocalendar().week
            df[targ_pre+n] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            df[targ_pre+n] = getattr(fld.dt, n.lower())
    # Normalise to nanosecond resolution first so the division by 10**9 yields
    # seconds whatever resolution the parsed column carries.
    df[targ_pre+'Elapsed'] = fld.astype('datetime64[ns]').astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)


In [None]:
# Expand `saledate` into its date-part columns and drop the original column
add_datepart(df_raw, fldname='saledate', drop=True)

In [19]:
#Converting categorical variables to numbers
def conv_to_cats(df):
    """Convert every text column of `df` to an ordered categorical, in place.

    A column is treated as text when `is_string_dtype` says so or when it has
    plain `object` dtype. The explicit object check matters on pandas >= 2.0,
    where `is_string_dtype` returns False for object columns that contain
    missing values, which would otherwise leave those columns unconverted.
    Columns of any other dtype are left untouched.
    """
    # Snapshot the columns first so assigning back does not interfere with iteration.
    for n, c in list(df.items()):
        if is_string_dtype(c) or c.dtype == object:
            df[n] = c.astype('category').cat.as_ordered()

In [None]:
# Turn the text columns of the raw frame into ordered categoricals (in place)
conv_to_cats(df=df_raw)

In [None]:
#setting the order for the categories
display(df_raw.UsageBand.cat.categories)
# `set_categories(..., inplace=True)` was removed in pandas 2.0, so the
# re-ordered categorical is assigned back explicitly.
df_raw['UsageBand'] = df_raw['UsageBand'].cat.set_categories(['High', 'Medium', 'Low'], ordered=True)
# Replace the labels by their integer codes (High=0, Medium=1, Low=2, missing=-1)
df_raw['UsageBand'] = df_raw['UsageBand'].cat.codes

In [None]:
# Share of missing values per column, as a fraction between 0 and 1,
# listed in alphabetical column order
display(df_raw.isna().sum().sort_index() / len(df_raw))

In [None]:
# Saving all the preprocessing work in a feather file for later access

# Create the output directory if needed; no error when it already exists
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

#### START FROM HERE - PREPROCESSED

In [3]:
# Reload the preprocessed frame from the feather file at 'tmp/bulldozers-raw'
Working_df = pd.read_feather('tmp/bulldozers-raw')

In [4]:
#Splitting into numeric and non numeric data types
quantitative = [
    name for name in Working_df.columns
    if is_numeric_dtype(Working_df[name])
]
# Computed before the removals below, so the ID column and the dependent
# variable are not counted as qualitative either
qualitative = Working_df.columns.difference(quantitative)
#Do not impute the ID column or the dependent variable
for excluded in ('SalesID', 'SalePrice'):
    quantitative.remove(excluded)


##### REPLACE MISSING VALUES OF NUMERIC DTYPE

In [5]:
#Replace missing values
def fix_missing(df, col):
    """Impute the numeric column `col` of `df` in place with its median.

    A boolean column named `col + '_na'` is added that records which rows were
    missing before imputation. Non-numeric columns are left unchanged and get
    no indicator column.
    """
    if not is_numeric_dtype(df[col]):
        return
    values = df[col]
    df[col + '_na'] = values.isnull()
    df[col] = values.fillna(values.median())

In [6]:
# Impute every numeric feature column of the working frame in place
for num_col in quantitative:
    fix_missing(Working_df, num_col)

##### REPLACE CATEGORICAL VALUES WITH THEIR CODES

In [7]:
#Replace Categorical variable with their codes
def numericalize(df, col):
    """Add `col + '_num'` to `df` holding the category codes of `col`, shifted by one.

    The shift turns the code -1 that pandas uses for missing values into 0.
    Numeric columns are skipped; the original column is kept either way.
    """
    column = df[col]
    if is_numeric_dtype(column):
        return
    df[col + '_num'] = column.cat.codes + 1

In [8]:
for cat_col in qualitative:
    numericalize(Working_df, cat_col)
    # the original column is removed once its codes have been stored
    Working_df.drop(columns=cat_col, inplace=True)

##### GETTING THE Y VALUE

In [9]:
# Take the dependent variable out of the feature frame in one step
y = Working_df.pop('SalePrice')


In [10]:
# Fit a random forest with default settings on all rows, using every core.
# The final expression is the score on that same training data.
m = RandomForestRegressor(n_jobs=-1).fit(Working_df, y)
m.score(Working_df, y)


0.9830621148142004

In [11]:
#extracting subset for processing
def split_vals(a, n):
    """Split `a` at position `n` and return copies of `a[:n]` and `a[n:]`."""
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()

# Hold out the last `n_valid` rows for validation; everything before is training data
n_valid = 12000  # same as Kaggle's test set size
n_trn = Working_df.shape[0] - n_valid

raw_train, raw_valid = split_vals(Working_df, n_trn)
X_train, X_valid = split_vals(Working_df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

(X_train.shape, y_train.shape, X_valid.shape)

((389125, 84), (389125,), (12000, 84))

#### RMSE

In [12]:
def rmse(x, y):
    """Return the root mean squared error between `x` and `y`.

    The difference `x - y` must support `** 2` and `.mean()`, as NumPy arrays
    and pandas Series do.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())


In [13]:
def print_score(m):
    """Print a list of training and validation metrics for the fitted model `m`.

    The list holds, in order: training RMSE, validation RMSE, `m.score` on the
    training set and `m.score` on the validation set. `m.oob_score_` is
    appended when the model has that attribute. The data come from the
    notebook-level `X_train`, `y_train`, `X_valid` and `y_valid`.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    res = [
        train_rmse,
        valid_rmse,
        m.score(X_train, y_train),
        m.score(X_valid, y_valid),
    ]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [14]:
#Base model: fitted on the training rows only, then scored on both splits
m = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
print_score(m)

[0.09057441008648796, 0.2476381719628013, 0.9828546697677856, 0.8904825929404404]


In [34]:
#Finding the features whose importance is above the average importance
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(m.feature_importances_, 3),
})
# The rounded values are compared against the mean of the unrounded ones
above_average = importances['importance'] > m.feature_importances_.mean()
print(importances.loc[above_average].sort_values('importance', ascending=False))


                   feature  importance
46      Coupler_System_num       0.198
5                 YearMade       0.175
61         ProductSize_num       0.108
81  fiProductClassDesc_num       0.102
2                  ModelID       0.094
20             saleElapsed       0.078
49           Enclosure_num       0.039
78         fiModelDesc_num       0.029
82     fiSecondaryDesc_num       0.027
0                  SalesID       0.026
77         fiBaseModel_num       0.018
79   fiModelDescriptor_num       0.017
1                MachineID       0.013


In [29]:
# Show the column labels of the `importances` frame
importances.columns

Index(['importance'], dtype='object')

In [14]:
#Loading the test set
Test_df = pd.read_csv('data/Test.csv',low_memory=False ,parse_dates=['saledate'])
#Processing of Test Data
add_datepart(Test_df,'saledate')
#Ordinal type handling: same fixed order as for the training data.
# (`set_categories(..., inplace=True)` was removed in pandas 2.0.)
Test_df['UsageBand'] = pd.Categorical(
    Test_df['UsageBand'], categories=['High', 'Medium', 'Low'], ordered=True
).codes

# The preprocessed training frame is the reference for imputation values and
# category codes, so that the test features mean the same thing as the
# features the model was fitted on.
train_ref = pd.read_feather('tmp/bulldozers-raw')

for i in quantitative:
    missing = pd.isnull(Test_df[i])
    # The indicator column is always added, as it is for the training data
    Test_df[i + '_na'] = missing
    if missing.any():
        # Fill with the training median, not the test-set median
        Test_df[i] = Test_df[i].fillna(train_ref[i].median())
for i in qualitative:
    # Re-use the training categories so each label maps to the same integer
    # code; labels unseen in training become missing and are encoded as 0.
    Test_df[i] = pd.Categorical(
        Test_df[i], categories=train_ref[i].cat.categories, ordered=True
    )
    numericalize(Test_df,i)
    #dropping the categorical column after replacing
    Test_df.drop(i,axis =1,inplace=True)

In [34]:
#Scoring the model on the test set
# The target was log-transformed before fitting, so exponentiate the predictions
log_pred = m.predict(Test_df)
y_pred = np.exp(log_pred)

In [35]:
#Writing the output
# Fail early rather than silently writing a truncated file
if len(y_pred) != len(Test_df):
    raise ValueError("number of predictions does not match number of test rows")
# `with` closes the file even if writing a row fails
with open("data/output.csv", "w", newline='') as predictions_file:
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["SalesID", "SalePrice"])
    # Pair the values by position; the original `Series[i]` lookup was
    # label-based and depended on the frame having a default 0..n-1 index.
    predictions_file_object.writerows(
        zip(Test_df['SalesID'].tolist(), y_pred.tolist())
    )