In [30]:
import pandas as pd
from sklearn.externals import joblib



In [26]:
# Read the cleaned training table, then display its (rows, columns) shape.
train = pd.read_csv(filepath_or_buffer='cleaned/train_cleaned.csv')
train.shape

(1460, 289)

In [38]:
# Read the cleaned test table, then list the number of missing values per
# column with the largest counts first.
test = pd.read_csv(filepath_or_buffer='cleaned/test_cleaned.csv')
(
    test
    .isna()
    .sum()
    .sort_values(ascending=False)
)

SalePrice            1459
Utilities_NoSeWa        0
ExterCond_Fa            0
Electrical_FuseA        0
Electrical_FuseF        0
                     ... 
HouseStyle_1Story       0
HouseStyle_2.5Fin       0
HouseStyle_2.5Unf       0
HouseStyle_2Story       0
Unnamed: 0              0
Length: 289, dtype: int64

In [41]:
# Separate the feature columns from the SalePrice target. The test table also
# carries a SalePrice column, so it is removed there too to keep both feature
# matrices aligned.
# NOTE(review): the recorded null-count listing shows an 'Unnamed: 0' column,
# which looks like a row index saved into the CSV; it is kept as a feature
# here -- confirm whether it should be dropped.
X_train = train.drop(columns=['SalePrice'])
X_test = test.drop(columns=['SalePrice'])
y_train = train['SalePrice']

In [46]:
# Sanity check: shapes of the train features, test features and train target.
tuple(part.shape for part in (X_train, X_test, y_train))

((1460, 288), (1459, 288), (1460,))

## Reduce dimension

In [47]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance (here 0.999).
pca = PCA(n_components=0.999)
X_train_reduced = pca.fit_transform(X_train)

# Persist the fitted PCA so the identical projection can be applied later.
joblib.dump(value=pca, filename='pca.m')

['pca.m']

In [48]:
# Preview the first five rows of the PCA-projected training features.
pd.DataFrame(data=X_train_reduced).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-2063.484568,-11.850026,-959.335404,-413.720529,777.011075,-184.270056,-170.188046,-65.515757
1,-909.986376,-41.030561,-1109.539086,603.697498,265.612501,-63.869386,-61.133917,56.613023
2,736.208148,56.746432,-940.855834,-600.340489,453.713194,-143.792195,-159.092521,-85.878619
3,-968.79144,-84.329569,-928.155742,-709.004391,221.702616,-113.472384,25.777169,-111.949616
4,3760.010238,608.337491,-936.418193,-665.664945,581.050123,-154.051555,-168.839906,-247.756239


In [49]:
# Reload the persisted PCA from disk and project the test features with it,
# so the test set goes through the projection that was fitted on the training set.
pca2 = joblib.load(filename='pca.m')
X_test_reduced = pca2.transform(X_test)

In [50]:
# Preview the first five rows of the PCA-projected test features.
pd.DataFrame(data=X_test_reduced).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1089.553429,-675.989026,996.186452,530.533355,-28.97945,-44.61511,-65.785559,-231.507261
1,3776.970129,-449.397832,895.299709,295.913817,1984.432094,12308.177344,-66.494861,-31.330123
2,3311.646357,-96.266522,1076.506361,38.341695,616.047273,-144.196432,-174.670564,57.336005
3,-541.209689,-21.504902,1070.143579,-57.196199,404.199484,-105.290585,-159.324536,40.791017
4,-5514.62748,164.593803,957.349968,367.951069,-662.878202,97.465284,-67.903864,22.465504


## Model selection

In [51]:
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR

In [52]:
# Baseline: ordinary least squares, scored by mean negated MSE over 5 folds.
lr = LR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): presumably the full feature matrix was intended; swap in
# X_train_reduced to evaluate the PCA features instead.
lr_result = cross_val_score(
    lr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [53]:
# Ridge regression with default settings, scored by mean negated MSE over 5 folds.
ridge = Ridge()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
ridge_result = cross_val_score(
    ridge, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [54]:
# Lasso regression with default settings, scored by mean negated MSE over 5 folds.
lasso = Lasso()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
lasso_result = cross_val_score(
    lasso, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [55]:
# Elastic net with default settings, scored by mean negated MSE over 5 folds.
en = ElasticNet()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
en_result = cross_val_score(
    en, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [56]:
# Support vector regressor with default settings, scored by mean negated MSE over 5 folds.
svr = SVR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
svr_result = cross_val_score(
    svr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [57]:
# k-nearest-neighbours regressor with default settings, scored by mean negated MSE over 5 folds.
knr = KNR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
knr_result = cross_val_score(
    knr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [58]:
# Random forest regressor with default settings, scored by mean negated MSE over 5 folds.
rfr = RFR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
rfr_result = cross_val_score(
    rfr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [59]:
# Gradient boosting regressor with default settings, scored by mean negated MSE over 5 folds.
gbr = GBR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
gbr_result = cross_val_score(
    gbr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [60]:
# XGBoost regressor with default settings, scored by mean negated MSE over 5 folds.
xgbr = XGBR()
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so score on the training split built above.
# NOTE(review): swap in X_train_reduced to evaluate the PCA features instead.
xgbr_result = cross_val_score(
    xgbr, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error'
).mean()

In [61]:
# Report every model's mean cross-validated MSE. The stored results come from
# scoring='neg_mean_squared_error', so the sign is flipped before printing.
cv_scores = [
    ("LR", lr_result),
    ("Ridge", ridge_result),
    ("Lasso", lasso_result),
    ("EN", en_result),
    ("SVR", svr_result),
    ("KNR", knr_result),
    ("RFR", rfr_result),
    ("GBR", gbr_result),
    ("XBR", xgbr_result),
]
for model_label, neg_mse in cv_scores:
    print("MSE of " + model_label + " is: " + str(-neg_mse))

MSE of LR is: 5.33041267207878e+18
MSE of Ridge is: 1081140399.4743972
MSE of Lasso is: 1187235383.1210105
MSE of EN is: 1257138303.7288868
MSE of SVR is: 6624638773.308963
MSE of KNR is: 2470967567.3733153
MSE of RFR is: 1001461145.0936712
MSE of GBR is: 731665598.4530817
MSE of XBR is: 731514496.735886


## Train the best GBR model

In [23]:
# Randomised hyper-parameter search for the gradient-boosting regressor:
# 10 combinations are sampled from the grid below and each one is evaluated
# with 3-fold cross-validation. `scoring` is left at its default, so
# candidates are ranked by the estimator's own score method.
params_GBR = {
    'learning_rate': [0.01, 0.02, 0.03],
    'subsample': [0.9, 0.5, 0.2],
    'n_estimators': [100, 500, 1000],
    'max_depth': [4, 6, 8],
}
# Seed both the estimator (subsample < 1 makes boosting stochastic) and the
# candidate sampling so a re-run reproduces the same search.
gbr = GBR(random_state=42)
clf = RandomizedSearchCV(
    gbr,
    param_distributions=params_GBR,
    cv=3,
    n_iter=10,
    random_state=42,
)
# `X` / `y` are never defined in this notebook (NameError on a fresh kernel),
# so fit on the training split built above.
# NOTE(review): swap in X_train_reduced to tune on the PCA features instead.
clf.fit(X_train, y_train)
clf.best_params_

{'subsample': 0.5, 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.01}

In [None]:
# Show the search results as a table, best-ranked candidate first, instead of
# dumping the raw dict of arrays.
pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

In [None]:
# Build the final model from the hyper-parameters the search selected rather
# than hand-copied values (the previously hard-coded ones did not match the
# recorded best_params_ output).
gbr_best = GBR(**clf.best_params_)
# `X` / `y` are never defined in this notebook; fit on the training split.
gbr_best.fit(X_train, y_train)  # use all training set to fit the model
# test_cleaned has no SalePrice labels (the recorded null count is 1459 of
# 1459 rows), so it cannot be scored -- it can only be used for prediction,
# e.g. gbr_best.predict(X_test).