In [1]:
# Imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.feature_selection import RFE

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [2]:
# train file read in
df_randomForest = pd.read_csv('CleanedTrain.csv')

In [3]:
df_randomForest.shape

(1460, 75)

<b> Select Features:</b><br>
I read <a href='http://blog.datadive.net/selecting-good-features-part-iii-random-forests/'>something about selecting features</a> online gives me these ideas:<br>
- feature selection based on impurity reduction is biased towards preferring variables with more categories.
- when the dataset has two (or more) correlated features, then from the point of view of the model, any of these correlated features can be used as the predictor, with no concrete preference of one over the others.
<br>The second one inspired me to check to correlation of dataset and drop those really correlated columns, which potentially provides duplicate informations.

In [4]:
def dropCorColumns(dataframe,threshold=0.8):
    '''Take a dataframe, get dummies, calculate correlation between all the combination, drop one of the column in 
    high correlation pairs.'''
    dataframe_dummies = pd.get_dummies(dataframe,drop_first=True)
    correlations = []
    columns = dataframe_dummies.columns.tolist()

    for col_a, col_b in itertools.combinations(columns, 2):
        correlations.append([col_a, col_b, dataframe_dummies[col_a].corr(dataframe_dummies[col_b])])

    result = [i for i in correlations if i[2]>=threshold]
        #result = DataFrame.from_dict(correlations, orient='index')
    #result.columns = ['PCC', 'p-value']

    #print(result)
    dropList = [i[0] for i in result]
    df = dataframe_dummies.drop(dataframe_dummies[dropList],axis=1)
    return df

<b>Missing Values:</b>

In [5]:
hasNullValues = []
for f in df_randomForest.columns:
    if df_randomForest[f].isnull().sum() > 0:
        hasNullValues.append(f)
        #print(f,"Missing:",df_randomForest[f].isnull().sum())
        print("{0:15} Missing: {1:3}".format(f, df_randomForest[f].isnull().sum()))

LotFrontage     Missing: 259
MasVnrType      Missing:   8
MasVnrArea      Missing:   8
BsmtQual        Missing:  37
BsmtCond        Missing:  37
BsmtExposure    Missing:  38
BsmtFinType1    Missing:  37
BsmtFinType2    Missing:  38
Electrical      Missing:   1
GarageType      Missing:  81
GarageYrBlt     Missing:  81
GarageFinish    Missing:  81
GarageQual      Missing:  81
GarageCond      Missing:  81


These missing values is quite annoying...my basic idea is to either impute or drop rows.

# Experiment 1: Imputation with most frequently values

In [6]:
#df_randomForest[df_randomForest.isnull().any(axis=1)]

In [7]:
#df_randomForest.describe().T

In [8]:
#categorical_features=['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition', 'MSSubClass', 'MoSold', 'OverallQual', 'OverallCond']

In [9]:
# Code from:https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

<b>Impute nan value with mean, categorical features have string which can not be impute properly. Here's some hack from stackoverflow, works perfectly.

In [10]:
df_randomForest_imp = DataFrameImputer().fit_transform(df_randomForest)
#imp.fit(df_randomForest[categorical_features])
#df_randomForest[categorical_features].isnull().sum()
isnull = [ i for i in df_randomForest_imp.columns if df_randomForest_imp[i].isnull().sum() != 0]
print(isnull)
df_randomForest_imp.shape

[]


(1460, 75)

<b>Based on steps I need to take on training models and prediction, I wrote a function to play around. It will do things below:<br>
- Change all the features to dummies, drop one of the dummies of every category
- Take a column name and save that as a target label, drop it and save the rest as features.
- Split training and test set, by default, it's 0.7.
- Standardize the dataset.
- Train the dataset with random forest model, by default oob=True, treeNumber=500.
- Print evaluations. Including: feature importance, oob score, mse, mae, r2, explained variance score and accuracy.

In [11]:
def randomForesty(dataframe,targetColumn,trainSize=0.7,treeNumber=500,oob=True,randomState=47):
    '''Take a raw dataframe, produce dummies, separate target and feature sets, do model training, testing '''
    
    #Get dummies
    dataframe_dummies = pd.get_dummies(dataframe,drop_first=True)
    
    #Separate target and feature set.
    target = dataframe_dummies[targetColumn]
    features = dataframe_dummies.drop(targetColumn,axis=1)
    
    #Train, test set separation
    X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=trainSize, random_state=randomState)
    
    #Standardization
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)
    
    #Foresty
    rf = RandomForestRegressor(n_estimators=treeNumber, oob_score=oob, random_state=randomState)
    rf.fit(X_train, y_train)
    
    #Prediction
    #print('\n','-'*20,'Prediction','-'*20)
    predicted_train = rf.predict(X_train)
    predicted_test = rf.predict(X_test)
    #print("Prediction of Train set:",predicted_train,"\nPrediction of Test set:",predicted_test)
    
#    return rf,predicted_train,predicted_test

#def evaluation(rf, predicted_train, predicted_test):
    '''Take a random forest model and evaluation(mean_squared_error, mean_absolute_error, explained_variance_score, 
    accuracy and r2_score) in one go. '''
    #Feature importance
    print('\n','-'*20,'Feature Importances','-'*20,'\n')
    print(pd.DataFrame({'feature': X_train.columns, 'importance': rf.feature_importances_}).sort_values(by=['importance'],ascending=False).head(10))
    
    #OOB
    print('\n','-'*20,'OOB','-'*20,'\n')
    print("OOB Score:                       ",rf.oob_score_)
    
    #MSE
    #print("{0:15} Missing: {1:3}".format(f, df_randomForest[f].isnull().sum()))
    print('\n','-'*20,'MSE','-'*20,'\n')
    #print("Training set: {0:10}".format(metrics.mean_squared_error(y_train, predicted_train)))
    print("Training set:                    ",metrics.mean_squared_error(y_train, predicted_train))
    print("Test set:                        ",metrics.mean_squared_error(y_test, predicted_test))
    
    #MAE
    print('\n','-'*20,'MAE','-'*20,'\n')
    trainMean = y_train.mean()
    testMean = y_test.mean()
    print("Training set dummy result (mean):",trainMean,"\nTest set dummy result (mean):    ",testMean)
    
    print("Training set:")
    baseline_errors = abs(trainMean - y_train)
    print('Average baseline error:          ', round(np.mean(baseline_errors), 2))

    rf_errors = abs(predicted_train - y_train)
    print('Mean Absolute Error:             ', round(np.mean(rf_errors), 2))
    
    print("Test set:")
    # Baseline errors, and display average baseline error
    baseline_errors = abs(testMean - y_test)
    print('Average baseline error:          ', round(np.mean(baseline_errors), 2))

    rf_errors = abs(predicted_test - y_test)
    print('Mean Absolute Error:             ', round(np.mean(rf_errors), 2))
    
    #R2
    print('\n','-'*20,'R-Squared','-'*20,'\n')
    train_score = metrics.r2_score(y_train, predicted_train)
    print("Traning set R2 score:            ",train_score)
    test_score = metrics.r2_score(y_test, predicted_test)
    print("Test set R2 score:               ",test_score)
    
    #Explained variance score
    print('\n','-'*20,'Explained Variance Score','-'*20,'\n')
    print("EVS of training set:             ",metrics.explained_variance_score(y_train, predicted_train))
    print("EVS of test set:                 ",metrics.explained_variance_score(y_test, predicted_test))
    
    #Accuracy
    print('\n','-'*20,'Accuracy','-'*20,'\n')
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (rf_errors / y_test)

    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print('Accuracy:                         ', round(accuracy, 2), '%.')
    #[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

In [12]:
randomForesty(df_randomForest_imp,'SalePrice')




 -------------------- Feature Importances -------------------- 

        feature  importance
3   OverallQual    0.560367
15    GrLivArea    0.154650
11  TotalBsmtSF    0.033275
8    BsmtFinSF1    0.028228
18     FullBath    0.025425
13     2ndFlrSF    0.019507
26   GarageArea    0.016544
12     1stFlrSF    0.015930
2       LotArea    0.014303
5     YearBuilt    0.011752

 -------------------- OOB -------------------- 

OOB Score:                        0.8848747999018675

 -------------------- MSE -------------------- 

Training set:                     96671783.68770394
Test set:                         1589199139.9886506

 -------------------- MAE -------------------- 

Training set dummy result (mean): 181420.80215475024 
Test set dummy result (mean):     179759.24145785876
Training set:
Average baseline error:           57133.01
Mean Absolute Error:              6239.5
Test set:
Average baseline error:           58124.24
Mean Absolute Error:              19843.56

 ---------------

<b>[Discussion]<br>
By the result shown above, the most outstanding feature is <font color='green'>OverallQual</font>.<br>Quite decent OOB score and accuracy.<br>
    The only thing I'm not really happy about is the R-squared dropped from 0.98(training set) to 0.75 (test set).

# Experiment 2: Drop NaN rows

In [13]:
df_randomForest_drop = df_randomForest.dropna()
df_randomForest_drop.shape

(1094, 75)

In [14]:
randomForesty(df_randomForest_drop,'SalePrice')




 -------------------- Feature Importances -------------------- 

          feature  importance
3     OverallQual    0.597635
15      GrLivArea    0.100093
12       1stFlrSF    0.027518
18       FullBath    0.023509
11    TotalBsmtSF    0.022115
8      BsmtFinSF1    0.020722
13       2ndFlrSF    0.020390
26     GarageArea    0.016389
2         LotArea    0.014822
143  ExterQual_Gd    0.013922

 -------------------- OOB -------------------- 

OOB Score:                        0.8519306449969073

 -------------------- MSE -------------------- 

Training set:                     134870742.502086
Test set:                         1561218854.430377

 -------------------- MAE -------------------- 

Training set dummy result (mean): 186805.0522875817 
Test set dummy result (mean):     187563.90577507598
Training set:
Average baseline error:           59886.9
Mean Absolute Error:              6683.26
Test set:
Average baseline error:           61845.68
Mean Absolute Error:              21032.4

<b>[Discussion]<br>
    By comparing the evaluation above, I tend to pick the drop version. The main reason is like mentioned above, I don't like the quick drop of r-squared from training set to test set, which maked me doubt that my imputation skewed the dataset too much.
    <br>Also, the accuracy raised a little bit...<br>
    Thirdly, the feature importance seems more distinguishing...

# Experiment 3: Imputation + drop corr columns

As mentioned above, correlated columns could influence random forest to make decision. If two columns contains similar information, there is no need to maintain it. Try to drop one.

In [15]:
df_randomForest_noCorr = dropCorColumns(df_randomForest_imp)

In [16]:
df_randomForest_noCorr.shape

(1460, 221)

In [17]:
randomForesty(df_randomForest_noCorr,'SalePrice')




 -------------------- Feature Importances -------------------- 

         feature  importance
3    OverallQual    0.600763
15      FullBath    0.070393
11      1stFlrSF    0.042736
8     BsmtFinSF1    0.035932
22    GarageArea    0.034294
2        LotArea    0.032007
7     MasVnrArea    0.016526
19  TotRmsAbvGrd    0.014203
1    LotFrontage    0.011811
20    Fireplaces    0.011500

 -------------------- OOB -------------------- 

OOB Score:                        0.8538957199662457

 -------------------- MSE -------------------- 

Training set:                     121463736.11743513
Test set:                         1447086000.2753668

 -------------------- MAE -------------------- 

Training set dummy result (mean): 181420.80215475024 
Test set dummy result (mean):     179759.24145785876
Training set:
Average baseline error:           57133.01
Mean Absolute Error:              6758.41
Test set:
Average baseline error:           58124.24
Mean Absolute Error:              21051.1

 ---

<b>[Discussion]<br>
    The OverallQual feature is more outstanding now, but I'm still not very happy about the drop of R-squared.

# Experiment 4: drop NaN + drop corr columns

In [18]:
df_randomForest_noCorrNaN = dropCorColumns(df_randomForest_drop)

In [19]:
randomForesty(df_randomForest_noCorrNaN,'SalePrice')




 -------------------- Feature Importances -------------------- 

          feature  importance
3     OverallQual    0.618238
10       1stFlrSF    0.038444
14       FullBath    0.037561
21     GarageArea    0.035667
2         LotArea    0.030045
7      BsmtFinSF1    0.029106
6      MasVnrArea    0.023866
18   TotRmsAbvGrd    0.021657
131  ExterQual_Gd    0.021259
1     LotFrontage    0.011790

 -------------------- OOB -------------------- 

OOB Score:                        0.8321706661757279

 -------------------- MSE -------------------- 

Training set:                     152599352.7596116
Test set:                         1880055359.3620617

 -------------------- MAE -------------------- 

Training set dummy result (mean): 186805.0522875817 
Test set dummy result (mean):     187563.90577507598
Training set:
Average baseline error:           59886.9
Mean Absolute Error:              7261.61
Test set:
Average baseline error:           61845.68
Mean Absolute Error:              22435

<b>[Discussion]<br>
    Very interesting, all the other feature's importance dropped and OverallQual becomes very outstanding. I'm really curious to try how the performance will be alone.

# Experiment 5: OverallQual on its own (just for fun)

In [20]:
randomForesty(df_randomForest[['OverallQual','SalePrice']],'SalePrice')




 -------------------- Feature Importances -------------------- 

       feature  importance
0  OverallQual         1.0

 -------------------- OOB -------------------- 

OOB Score:                        0.6870210341455807

 -------------------- MSE -------------------- 

Training set:                     1873300307.7083793
Test set:                         2334506927.814881

 -------------------- MAE -------------------- 

Training set dummy result (mean): 181420.80215475024 
Test set dummy result (mean):     179759.24145785876
Training set:
Average baseline error:           57133.01
Mean Absolute Error:              30306.13
Test set:
Average baseline error:           58124.24
Mean Absolute Error:              32056.47

 -------------------- R-Squared -------------------- 

Traning set R2 score:             0.7001491582729162
Test set R2 score:                0.6376613040333359

 -------------------- Explained Variance Score -------------------- 

EVS of training set:              0.

# For kaggle submission

## can't drop NaN in test set...

In [21]:
df_randomForest_test = pd.read_csv('CleanedTest.csv')
#print(df_randomForest_test.shape)
df_randomForest_test = DataFrameImputer().fit_transform(df_randomForest_test)
df_randomForest_test = pd.get_dummies(df_randomForest_test,drop_first=True)
#df_randomForest_test[df_randomForest_test.isnull().any(axis=1)].shape
#df_randomForest_test.shape
df_randomForest_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_TA,BsmtCond_Gd,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasW,Heating_Grav,Heating_Wall,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_P,PavedDrive_Y,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,882.0,896,0,0,896,0.0,0.0,1,0,2,1,5,0,1961.0,1.0,730.0,140,0,0,0,120,0,0,6,2010,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1958.0,1.0,312.0,393,36,0,0,0,0,12500,6,2010,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,1997.0,2.0,482.0,212,34,0,0,0,0,0,3,2010,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,324.0,926.0,926,678,0,1604,0.0,0.0,2,1,3,1,7,1,1998.0,2.0,470.0,360,36,0,0,0,0,0,6,2010,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,0,0,1280,0.0,0.0,2,0,2,1,5,0,1992.0,2.0,506.0,0,82,0,0,144,0,0,1,2010,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0


In [22]:
# The test set lack of some columns in training set...Try to impute columns with 0...
impList = pd.get_dummies(df_randomForest_imp,drop_first=True).columns.tolist()
testDontHave = [x for x in impList if x not in df_randomForest_test.columns.tolist()]
testDontHave.remove('SalePrice')
#print(testDontHave)
for x in testDontHave:
    df_randomForest_test[x] = pd.Series(np.zeros(df_randomForest_test.shape[0]))#, index=df1.index
#df_randomForest_test.head()
df_randomForest_id = df_randomForest_test['Id']
df_randomForest_test = df_randomForest_test.drop('Id',axis=1)

In [23]:
# Train with df_randomForest_drop 'SalePrice'
def prediction(df_train,label,df_test,treeNumber=500,oob=True,randomState=47):
    '''Use a df to predict the other set. Return a df of predict result.'''
    
    df_dummies = pd.get_dummies(df_train,drop_first=True)
    #print(df_dummies.shape)
    #Separate target and feature set.
    target = df_dummies[label]
    features = df_dummies.drop(label,axis=1)

    #Standardization
    scaler = StandardScaler().fit(features)
    X_scaled = pd.DataFrame(scaler.transform(features), index=features.index.values, columns=features.columns.values)
    #X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)

    #Foresty
    rf = RandomForestRegressor(n_estimators=treeNumber, oob_score=oob, random_state=randomState)
    rf.fit(X_scaled, target)

    #Prediction
    #print('\n','-'*20,'Prediction','-'*20)
    predicted = rf.predict(df_test)
    return predicted

In [24]:
result = pd.DataFrame(prediction(df_randomForest_imp,'SalePrice',df_randomForest_test),columns=['SalePrice'])

In [25]:
result['Id'] = df_randomForest_id

In [26]:
result.head()

Unnamed: 0,SalePrice,Id
0,407353.72,1461
1,396212.524,1462
2,586967.138,1463
3,599032.496,1464
4,400547.856,1465


In [27]:
result.to_csv('impute.csv',index=False)

![My score](Screenshot from 2018-04-20 10-26-44.png)

References:<br>
<a>http://www.blopig.com/blog/2017/07/using-random-forests-in-python-with-scikit-learn/</a><br>
<a>https://towardsdatascience.com/random-forest-in-python-24d0893d51c0</a><br>
<a>http://scikit-learn.org/stable/modules/model_evaluation.html</a>
<a>http://blog.datadive.net/selecting-good-features-part-iii-random-forests/</a>
<a>https://stackoverflow.com/questions/33997753/calculating-pairwise-correlation-among-all-columns</a>