In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, StandardScaler, MaxAbsScaler, QuantileTransformer
from sklearn import preprocessing

# PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# k-fold
from sklearn.model_selection import KFold

# models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV

# RMSE
from math import sqrt
from sklearn.metrics import mean_squared_error

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df=pd.read_csv("train.csv")
train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000



# Feature Engineering

### Pearson Correlation

In [1637]:
temp = train_df.drop(['SalePrice'],1)
c1 = temp.corr().abs().unstack()
corlist = c1.sort_values(ascending = False)

corlist[39:46]

YearBuilt     GarageYrBlt     0.825667
GarageYrBlt   YearBuilt       0.825667
GrLivArea     TotRmsAbvGrd    0.825489
TotRmsAbvGrd  GrLivArea       0.825489
1stFlrSF      TotalBsmtSF     0.819530
TotalBsmtSF   1stFlrSF        0.819530
GrLivArea     2ndFlrSF        0.687501
dtype: float64

In [1638]:
#Correlation with output variable
subset = train_df[['SalePrice','YearBuilt','GarageYrBlt','GrLivArea','TotRmsAbvGrd','1stFlrSF','TotalBsmtSF']]
cor = subset.corr()
cor['SalePrice']

SalePrice       1.000000
YearBuilt       0.522897
GarageYrBlt     0.486362
GrLivArea       0.708624
TotRmsAbvGrd    0.533723
1stFlrSF        0.605852
TotalBsmtSF     0.613581
Name: SalePrice, dtype: float64

In [1639]:
train_df = train_df.drop(['GarageYrBlt','TotRmsAbvGrd','1stFlrSF'],1)

### Check for NaN 

In [1640]:
# NaN数据的比例
(train_df.isnull().sum()/len(train_df)).sort_values(ascending=False)[:6]

PoolQC         0.995205
MiscFeature    0.963014
Alley          0.937671
Fence          0.807534
FireplaceQu    0.472603
LotFrontage    0.177397
dtype: float64

In [1641]:
train_df['HasPool'] = train_df['PoolQC'].apply(lambda x: 0 if type(x) == float else 1)
train_df['HasMisc'] = train_df['MiscFeature'].apply(lambda x: 0 if type(x) == float else 1)
train_df['HasAlley'] = train_df['Alley'].apply(lambda x: 0 if type(x) == float else 1)
train_df['HasFence'] = train_df['Fence'].apply(lambda x: 0 if type(x) == float else 1)
train_df['HasFireplace'] = train_df['FireplaceQu'].apply(lambda x: 0 if type(x) == float else 1)
cor = abs(train_df[['HasPool', 'HasMisc', 'HasAlley', 'HasFence', 'HasFireplace', 'SalePrice']].corr())
cor['SalePrice']

HasPool         0.093708
HasMisc         0.072298
HasAlley        0.123611
HasFence        0.172184
HasFireplace    0.471908
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [1642]:
# delete first 4 columns
train_df.drop(['PoolQC','MiscFeature','Alley','Fence'],inplace=True, axis=1, errors='ignore')
train_df.drop(['HasPool','HasMisc','HasAlley','HasFence','HasFireplace'],inplace=True, axis=1, errors='ignore')
train_df.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500


In [1643]:
# train_df.fillna(0, inplace=True)

### Combine and Check Correlation

In [1644]:
train_df['LivAreaTotal'] = train_df['GrLivArea'] + train_df['GarageArea'] + train_df['PoolArea'] + train_df['TotalBsmtSF']

# check for corr
subset = train_df[['SalePrice','LivAreaTotal','GrLivArea','GarageArea','PoolArea','TotalBsmtSF']]
cor = abs(subset.corr())
cor["SalePrice"]

SalePrice       1.000000
LivAreaTotal    0.805162
GrLivArea       0.708624
GarageArea      0.623431
PoolArea        0.092404
TotalBsmtSF     0.613581
Name: SalePrice, dtype: float64

In [1645]:
# drop
train_df = train_df.drop(['GrLivArea','GarageArea','PoolArea','TotalBsmtSF'],1)

In [1646]:
train_df.shape

(1460, 71)

In [1647]:
# Correlation with output variable
cor = train_df.corr()
cor_target = abs(cor["SalePrice"])

# delete low correlated features
relevant_features = cor_target[cor_target<=0.1]
train_df = train_df.drop(relevant_features.index,1)
train_df.shape

(1460, 61)

### PCA on All Numerical Variables

In [1648]:
# Separate explanatory variables
train = train_df.drop(['SalePrice'],1)
train.head(1)
train.fillna(0, inplace=True)

In [1649]:
# Standardizing the features
features = train.select_dtypes(exclude=['object']).columns
train = train[features].values
train_std = StandardScaler().fit_transform(train)

In [1650]:
train_std.shape

(1460, 21)

In [1651]:
np.mean(train_std),np.std(train_std)

(3.198137558215715e-16, 1.0)

In [1652]:
# let PCA captures 90% variance
pca = PCA(0.9)
pca.fit(train_std)

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [1653]:
# find out how many n_components PCA used to capture 0.9 variance
pca.n_components_

14

In [1654]:
train_pca = pca.transform(train_std)
train_pca.shape

(1460, 14)

In [1655]:
train = pd.DataFrame(data = train_pca, columns = [list(range(1,15))])
train.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.563459,0.466699,0.078027,-1.625155,-1.075509,-0.885111,-1.2039,-0.028279,0.438726,-0.083208,0.819161,-0.469644,0.542025,0.007236
1,0.23049,0.881249,-0.013087,0.747355,-0.553565,0.670508,0.894619,-0.252252,0.61357,-0.826107,-0.874016,-0.31836,-1.010691,0.505463
2,1.907549,0.095293,0.277855,-1.152475,-0.366943,-0.383556,-0.715507,-0.128734,0.266656,-0.298812,1.109789,0.077123,-0.407693,0.09108
3,-0.779254,-0.524841,1.974474,1.393821,0.748371,1.850984,-2.946842,-0.552892,-1.727853,-0.872539,0.184551,0.603346,0.419091,-1.23157
4,3.944063,-0.112396,1.0383,-0.655653,-0.675685,0.051958,-0.422083,-0.402577,0.741357,0.16302,-0.068847,-0.186086,0.098405,-0.503282


In [1656]:
# Append
# find categorical columns
cat = train_df[train_df.select_dtypes(exclude=['int', 'float']).columns]
train_df = pd.concat([train, cat, train_df['SalePrice']], axis=1)
train_df.head(1)

Unnamed: 0,"(1,)","(2,)","(3,)","(4,)","(5,)","(6,)","(7,)","(8,)","(9,)","(10,)",...,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition,SalePrice
0,1.563459,0.466699,0.078027,-1.625155,-1.075509,-0.885111,-1.2039,-0.028279,0.438726,-0.083208,...,Typ,,Attchd,RFn,TA,TA,Y,WD,Normal,208500


In [1657]:
# element wise function to transform
train_df['above_200k'] = train_df['SalePrice'].map(lambda x : 1 if x > 200000 else 0) 
train_df['above_200k'] = train_df['above_200k'].astype('category')

### Dealing with Categorical Variables

In [1658]:
# # factorize
# catcol = train_df.select_dtypes(exclude=['int', 'float']).columns
# train_df[catcol] = train_df[catcol].apply(lambda x: pd.factorize(x)[0])
# train_df.head(1)

In [1659]:
# dummy
train_df = pd.get_dummies(train_df)
train_df.head(1)

Unnamed: 0,"(1,)","(2,)","(3,)","(4,)","(5,)","(6,)","(7,)","(8,)","(9,)","(10,)",...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,above_200k_0,above_200k_1
0,1.563459,0.466699,0.078027,-1.625155,-1.075509,-0.885111,-1.2039,-0.028279,0.438726,-0.083208,...,0,1,0,0,0,0,1,0,0,1


### Take log on Response Variable

In [1662]:
train_df['LogSalePrice'] = np.log(train_df['SalePrice'])
train_df.head(1)

Unnamed: 0,"(1,)","(2,)","(3,)","(4,)","(5,)","(6,)","(7,)","(8,)","(9,)","(10,)",...,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,above_200k_0,above_200k_1,LogSalePrice
0,1.563459,0.466699,0.078027,-1.625155,-1.075509,-0.885111,-1.2039,-0.028279,0.438726,-0.083208,...,1,0,0,0,0,1,0,0,1,12.247694


# Find Best Random Forest Parameter

In [1693]:
# Split training and test set
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

In [1694]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 30, num = 3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [1695]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=True 
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=True 
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=True 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=20, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=True, total=   0.4s
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=20, bootstrap=True, total=   0.4s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=20, bootstrap=False 
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=20, bootstrap

[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False, total=   0.7s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.3s


[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False, total=   0.7s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=None, bootstrap=True, total=   1.9s
[CV] n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=None, bootstrap=True, total=   1.9s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.4s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_

[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False, total=   4.8s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False, total=   4.9s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.6s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.6s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_sam

[CV]  n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   0.4s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True 
[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True, total=   3.8s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True 
[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True, total=   4.0s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=False, total=   8.1s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=20, bootstrap=False 
[CV]  n_estimators=300, m

[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=20, bootstrap=True, total=   4.4s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   1.2s
[CV] n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   1.2s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=True, total=   3.1s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=300, m

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min


[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True, total=   4.4s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True, total=   4.4s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=30, bootstrap=True, total=   1.4s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=True, total=   4.7s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=30, bootstrap=True 
[CV]  n_estimators=300, min_sa

[CV]  n_estimators=300, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=30, bootstrap=False, total=   7.7s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False, total=   4.4s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False, total=   4.5s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False, total=   4.5s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=300, min_s

[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=10, bootstrap=True, total=   0.5s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=10, bootstrap=True, total=   0.5s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=10, bootstrap=True, total=   0.5s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False, total=   0.4s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=1, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=

[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False, total=   6.6s
[CV] n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=True, total=   2.7s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=20, bootstrap=True, total=   4.7s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=300, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=20, bootstrap=True, total=   4.7s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=False 
[CV]  n_estimators=300, min_

[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=False, total=   4.8s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.6s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.5s
[CV] n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True 
[CV]  n_estimators=200, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=30, bootstrap=True, total=   0.5s
[CV] n_estimators=200, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False 
[CV]  n_estimators=200,

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.8min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [100, 200, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, None], 'min_samples_split': [5, 10], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [1696]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200}

# 5-fold CV

In [1663]:
features = [col for col in train_df.columns if 'Price' not in col]
target = ['LogSalePrice']

In [1690]:
rf = rf_random.best_estimator_
gbm = GradientBoostingRegressor()
ada = AdaBoostClassifier()
ridgeReg = Ridge(alpha=0.05, normalize=True)
lassoReg = Lasso(alpha=0.03, normalize=True)

rf_rmse = []
gbm_rmse = []
ada_rmse = []
ridgeReg_rmse = []
lassoReg_rmse = []

In [1691]:
cv = KFold(n_splits=5, shuffle=False)

for train_index, test_index in cv.split(train_df):
    
    X_train = train_df.iloc[train_index][features]
    X_test = train_df.iloc[test_index][features]
    y_train = train_df.iloc[train_index][target]
    y_test = train_df.iloc[test_index][target]
    
    # random forest
    rf.fit(X_train, y_train.values.ravel())
    pred = rf.predict(X_test)
    rf_rmse.append(sqrt(mean_squared_error(pred, y_test)))
    
    # gradient boosting
    gbm.fit(X_train, y_train.values.ravel())
    pred = gbm.predict(X_test)
    gbm_rmse.append(sqrt(mean_squared_error(pred, y_test)))
    
    # adaboost
    le = preprocessing.LabelEncoder()                       # label encoding
    y_train_ada = le.fit_transform(y_train.values.ravel())  # label encoding
    ada.fit(X_train, y_train_ada)
    pred = ada.predict(X_test)
    pred = le.inverse_transform(pred)                       # encoding back to original label
    ada_rmse = sqrt(mean_squared_error(pred, y_test))
    
    # ridge regression
    ridgeReg.fit(X_train, y_train)
    pred = ridgeReg.predict(X_test)
    ridgeReg_rmse.append(sqrt(mean_squared_error(pred, y_test)))

    # LASSO
    lassoReg.fit(X_train,y_train)
    pred = lassoReg.predict(X_test)
    lassoReg_rmse.append(sqrt(mean_squared_error(pred, y_test)))

In [1692]:
np.mean(rf_rmse), np.mean(gbm_rmse), np.mean(ada_rmse), np.mean(ridgeReg_rmse), np.mean(lassoReg_rmse)

(0.12918349014563577,
 0.12504900142873704,
 0.26688554535575393,
 0.13934301476412786,
 0.3992282792085989)

# Random Forest

In [1683]:
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

In [1414]:
# Instantiate model with 100 decision trees
rf = RandomForestRegressor(n_estimators = 100)

# Train the model on training data
rf.fit(X_train, y_train);

In [1415]:
# Make prediction
pred = model.predict(X_test)

In [1416]:
# Calculate RMSE
rmse = sqrt(mean_squared_error(pred, y_test))
rmse

0.1426827123928004

# Gradient Boosting

In [1286]:
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

In [1287]:
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [1288]:
pred = model.predict(X_test)

In [1289]:
rmse = sqrt(mean_squared_error(pred, y_test))
rmse

0.12181982773299455

# Adaboost

In [1503]:
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

649     11.344507
750     11.477298
620     11.112448
963     12.384219
1249    11.686879
Name: LogSalePrice, dtype: float64

In [1504]:
# label encoding
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)

In [1505]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [1697]:
pred = ada.predict(X_test)
pred = le.inverse_transform(pred)

In [1507]:
rmse = sqrt(mean_squared_error(pred, y_test))
rmse

0.266418123525164


# Ridge Regression

In [1155]:
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

In [1156]:
## training the model
ridgeReg = Ridge(alpha=0.05, normalize=True)
ridgeReg.fit(X_train,y_train)

Ridge(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [1699]:
pred = ridgeReg.predict(X_test)

In [1158]:
# calculating rmse
rmse = sqrt(mean_squared_error(pred, y_test))
rmse

0.13627124703222407

# LASSO (??????咋这么大)

In [1146]:
features = [col for col in train_df.columns if 'Price' not in col]
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['LogSalePrice'], test_size = 0.3)

In [1159]:
# train the model
lassoReg = Lasso(alpha=0.03, normalize=True)
lassoReg.fit(X_train,y_train)

Lasso(alpha=0.03, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [1160]:
pred = lassoReg.predict(X_test)

In [1161]:
# calculating mse
rmse = sqrt(mean_squared_error(pred, y_test))
rmse

0.40681161971153335

# Correlation

In [1163]:
#Correlation with output variable
cor = train_df.corr()
cor_target = abs(cor["SalePrice"])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>=0.1]