In [1]:
# Imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.tree import export_graphviz

In [2]:
# train file read in
df_randomForest = pd.read_csv('CleanedTrain.csv')

In [3]:
df_randomForest.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [4]:
df_randomForest.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,0.0,0.0,1474.0


In [5]:
categorical_features=['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition', 'MSSubClass', 'MoSold', 'OverallQual', 'OverallCond']

In [6]:
# Code from:https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

Impute nan value with mean, categorical features have string which can not be impute properly. Here's some hack from stackoverflow, works perfectly.

In [18]:
df_randomForest = DataFrameImputer().fit_transform(df_randomForest)
#imp.fit(df_randomForest[categorical_features])
#df_randomForest[categorical_features].isnull().sum()
isnull = [ i for i in df_randomForest.columns if df_randomForest[i].isnull().sum() != 0]
print(isnull)

[]


Change all the features to dummies, drop one of the dummies of every category

In [8]:
df_randomForest_dummies = pd.get_dummies(df_randomForest,drop_first=True)
df_randomForest_dummies.shape

(1460, 233)

The best part of random forest is we don't need to select features manually.

In [9]:
target = df_randomForest_dummies['SalePrice']
features = df_randomForest_dummies.drop('SalePrice',axis=1)
features.shape

(1460, 232)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.7, random_state=47)



In case our data being totally dominated by some columns with large number or too centralized problem, let's standardize.<br>
While it's working, it's being changed to numpy array. If want to view like a dataframe, need to change back.

In [12]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)
X_test_scaled.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
903,-0.8713,-0.977096,0.590626,0.647422,-0.540186,1.144613,1.018747,-0.430597,-1.011928,-0.285256,...,-0.062715,-0.070152,3.256113,-0.044302,-2.467849,-0.054286,-0.062715,-0.11791,-2.147065,3.236033
1454,-0.8713,-0.382772,-0.344544,0.647422,-0.540186,1.077824,0.970713,-0.577139,-0.054866,-0.285256,...,-0.062715,-0.070152,-0.307115,-0.044302,0.405211,-0.054286,-0.062715,-0.11791,0.465752,-0.30902
629,0.561416,0.607769,-0.151385,-0.085414,-0.540186,-0.257967,-0.998705,0.828584,0.444673,1.8422,...,-0.062715,-0.070152,-0.307115,-0.044302,0.405211,-0.054286,-0.062715,-0.11791,0.465752,-0.30902
496,-0.8713,0.015919,0.315247,1.380259,-0.540186,0.677086,0.394298,-0.577139,1.861591,-0.285256,...,-0.062715,-0.070152,-0.307115,-0.044302,0.405211,-0.054286,-0.062715,-0.11791,0.465752,-0.30902
1367,2.471705,-1.42284,-0.958969,-0.81825,0.361007,0.176165,-0.374256,-0.577139,0.267267,0.81013,...,-0.062715,-0.070152,-0.307115,-0.044302,0.405211,-0.054286,-0.062715,-0.11791,0.465752,-0.30902


In [13]:
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

Let's see how our forest doing.

In [15]:
pd.DataFrame({'feature': X_train.columns, 'importance': rf.feature_importances_}).sort_values(by=['importance'],ascending=False)

Unnamed: 0,feature,importance
3,OverallQual,5.591513e-01
15,GrLivArea,1.503952e-01
11,TotalBsmtSF,3.240751e-02
8,BsmtFinSF1,3.015992e-02
18,FullBath,2.783137e-02
13,2ndFlrSF,2.164364e-02
26,GarageArea,1.726735e-02
12,1stFlrSF,1.550458e-02
2,LotArea,1.434359e-02
5,YearBuilt,1.133509e-02


As can be seen above, the most important feature is **<font color='green'>OverallQual</font>**.

In [16]:
predicted_train = rf.predict(X_train)
train_score = metrics.r2_score(y_train, predicted_train)
print("Traning set R2 score:",train_score)

Traning set R2 score: 0.9847298314883987


Astonishing result from training set... What about test set...

In [17]:
predicted_test = rf.predict(X_test)
test_score = metrics.r2_score(y_test, predicted_test)
print("Test set R2 score:",test_score)

Test set R2 score: 0.7544043756599392


In [14]:
trainMean = y_train.mean()
testMean = y_test.mean()
print("Training set dummy result:",trainMean,"\nTest set dummy result:",testMean)

Training set dummy result: 181420.80215475024 
Test set dummy result: 179759.24145785876


In [24]:
print("Training set:")
baseline_errors = abs(trainMean - y_train)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

rf_errors = abs(predicted_train - y_train)
print('Mean Absolute Error:', round(np.mean(rf_errors), 2))

Training set:
Average baseline error:  57133.01
Mean Absolute Error: 6214.2


In [25]:
print("Test set:")
# Baseline errors, and display average baseline error
baseline_errors = abs(testMean - y_test)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))

rf_errors = abs(predicted_test - y_test)
print('Mean Absolute Error:', round(np.mean(rf_errors), 2))

Test set:
Average baseline error:  58124.24
Mean Absolute Error: 19596.46


As can be seen above, my random forest is better than guessing with the average price.

Other evaluations(on test set):

References:<br>
<a>http://www.blopig.com/blog/2017/07/using-random-forests-in-python-with-scikit-learn/</a>