In [None]:
import pandas as pd
import numpy as np
import warnings

# deal with potential warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the feature tables and the training target from CSV files in the working directory.
x_train = pd.read_csv('final_train.csv')
y = pd.read_csv('final_price.csv')
x_test = pd.read_csv('final_test.csv')

# Only the Id column of test.csv is kept; it is used when writing the submission file.
test_id = pd.read_csv('test.csv')['Id']

# Empty frames that receive one prediction column per model (inputs for stacking).
price_train, price_test = pd.DataFrame(), pd.DataFrame()

In [None]:
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

### Lasso

In [None]:
# 5-fold grid search over the Lasso regularisation strength `alpha`.
# NOTE(review): the grid starts at alpha=0, which scikit-learn advises against
# for Lasso (numerical reasons) -- confirm that including it is intended.
model_Lasso = Lasso()
param_grid = {'alpha': np.arange(0, 0.0006, 0.0001)}
Lasso_clf = GridSearchCV(
    estimator=model_Lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_log_error',
)
Lasso_clf.fit(x_train, y)

In [None]:
# Report the selected alpha and its cross-validated score.
# best_score_ comes from a 'neg_' scorer, so the sign is flipped before the square root.
lasso_best_alpha = Lasso_clf.best_params_['alpha']
lasso_cv_score = np.sqrt(-Lasso_clf.best_score_)
print('Best alpha for Lasso regression is:', lasso_best_alpha)
print('Log score for best Lasso model is:', lasso_cv_score)

In [None]:
# Store the tuned Lasso model's predictions as the 'Lasso' stacking column.
# Predicting through the fitted search object delegates to its refit best estimator.
y_pre1 = Lasso_clf.predict(x_train).reshape(len(x_train))
y_pre2 = Lasso_clf.predict(x_test).reshape(len(x_test))
price_train['Lasso'], price_test['Lasso'] = y_pre1, y_pre2

### ElasticNet

In [None]:
# 5-fold grid search over the ElasticNet regularisation strength `alpha`.
# The estimator must stay an ElasticNet: the previous version re-assigned
# `model_ENet = Lasso()` right before the search, so this section silently
# tuned a second Lasso model instead of an elastic net.
# NOTE(review): the grid starts at alpha=0 (no regularisation) -- confirm intended.
param_grid = {'alpha': np.arange(0, 0.001, 0.0001)}
model_ENet = ElasticNet()
Enet_clf = GridSearchCV(model_ENet, param_grid, cv=5, scoring='neg_mean_squared_log_error')
Enet_clf.fit(x_train, y)

In [None]:
# Report the selected alpha and its cross-validated score.
# best_score_ comes from a 'neg_' scorer, so the sign is flipped before the square root.
enet_best_alpha = Enet_clf.best_params_['alpha']
enet_cv_score = np.sqrt(-Enet_clf.best_score_)
print('Best alpha for Enet regression is:', enet_best_alpha)
print('Log score for best Enet model is:', enet_cv_score)

In [None]:
# Store the tuned Enet model's predictions as the 'Enet' stacking column.
# Predicting through the fitted search object delegates to its refit best estimator.
y_pre1 = Enet_clf.predict(x_train).reshape(len(x_train))
y_pre2 = Enet_clf.predict(x_test).reshape(len(x_test))
price_train['Enet'], price_test['Enet'] = y_pre1, y_pre2

### xgb

First, pick `n_estimators` (with the learning rate fixed at 0.1).

In [None]:
# Import the estimator class by name. The previous `xgb = xgb.XGBRegressor(...)`
# rebound the module alias `xgb` to an estimator instance, so running this cell
# a second time failed with AttributeError. The estimator is still stored under
# the name `xgb` because the later tuning cells pass `xgb` to GridSearchCV.
from xgboost import XGBRegressor

xgb = XGBRegressor(seed=1885)

# Step 0: search the number of trees with the learning rate held at 0.1.
param_test0 = {'n_estimators':np.arange(200, 400, 10),
               'learning_rate':[0.1]}

xgb0_clf = GridSearchCV(xgb, param_test0, cv=5, scoring='neg_mean_squared_log_error')
xgb0_clf.fit(x_train, y)

print('Best hyperparameter for this step:', xgb0_clf.best_params_)
# best_score_ is a negated error, so flip the sign before taking the root.
print('log score for this step is:', np.sqrt(-xgb0_clf.best_score_))

Tune max_depth and min_child_weight

In [None]:
# Step 1: tune tree depth and minimum child weight.
# n_estimators is pinned to 330 -- presumably the winner of the previous step; confirm.
param_test1 = {'n_estimators':[330],
               'max_depth':range(1, 10, 1),
               'min_child_weight':range(1,10),
               'learning_rate':[0.1]}

xgb1_clf = GridSearchCV(xgb, param_test1, cv=5, scoring='neg_mean_squared_log_error')
xgb1_clf.fit(x_train, y)

print('Best hyperparameter for this step:', xgb1_clf.best_params_)
# best_score_ is the *negated* mean squared log error (<= 0). It has to be
# negated before the square root; without the sign flip the result is NaN.
print('log score for this step is:', np.sqrt(-xgb1_clf.best_score_))

Tune gamma

In [None]:
# Step 2: tune gamma over [0, 1) in steps of 0.2.
# The other entries are pinned to single values -- presumably the winners of
# the earlier steps; confirm.
param_test2 = {'n_estimators':[330],
               'gamma':np.arange(0, 1, 0.2),
               'max_depth':[5],
               'min_child_weight':[4],
               'learning_rate':[0.1]}
xgb2_clf = GridSearchCV(xgb, param_test2, cv=5, scoring='neg_mean_squared_log_error')
xgb2_clf.fit(x_train, y)

print('Best hyperparameter for this step:', xgb2_clf.best_params_)
# best_score_ is the *negated* mean squared log error (<= 0). It has to be
# negated before the square root; without the sign flip the result is NaN.
print('log score for this step is:', np.sqrt(-xgb2_clf.best_score_))

Tune subsample and colsample_bytree

In [None]:
# Step 3: tune row subsampling and per-tree column subsampling over 0.1 ... 0.9.
# The other entries are pinned to single values -- presumably the winners of
# the earlier steps; confirm.
param_test3 = {'subsample':[i/10.0 for i in range(1,10)],
               'colsample_bytree':[i/10.0 for i in range(1,10)],
               'gamma':[0],
               'max_depth':[5],
               'min_child_weight':[4],
               'n_estimators':[330],
               'learning_rate':[0.1]}
xgb3_clf = GridSearchCV(xgb, param_test3, cv=5, scoring='neg_mean_squared_log_error')
xgb3_clf.fit(x_train, y)

print('Best hyperparameter for this step:', xgb3_clf.best_params_)
# best_score_ is the *negated* mean squared log error (<= 0). It has to be
# negated before the square root; without the sign flip the result is NaN.
# The label also said "R2 score", but this value is the root of the mean
# squared log error, as in the other tuning steps.
print('log score for this step is:', np.sqrt(-xgb3_clf.best_score_))

Tuning Regularization Parameters

In [None]:
# Step 4: tune the L1 regularisation term `reg_alpha`.
# The other entries are pinned to single values -- presumably the winners of
# the earlier steps; confirm.
param_test4 = {'reg_alpha':[0.1, 0.01, 0, 1, 100],
               'subsample':[0.8],
               'colsample_bytree':[0.2],
               'gamma':[0],
               'max_depth':[5],
               'min_child_weight':[4],
               'n_estimators':[330],
               'learning_rate':[0.1]}
xgb4_clf = GridSearchCV(xgb, param_test4, cv=5, scoring='neg_mean_squared_log_error')
xgb4_clf.fit(x_train, y)

print('Best hyperparameter for this step:', xgb4_clf.best_params_)
# best_score_ is the *negated* mean squared log error (<= 0). It has to be
# negated before the square root; without the sign flip the result is NaN.
# The label also said "R2 score", but this value is the root of the mean
# squared log error, as in the other tuning steps.
print('log score for this step is:', np.sqrt(-xgb4_clf.best_score_))

Reduce Learning Rate and add more trees

In [None]:
# Final xgboost configuration: learning rate 0.05 with 700 trees.
# Every entry holds a single value, so the "grid" contains exactly one
# candidate and the search serves as a 5-fold cross-validated evaluation of it.
param_test5 = dict(
    subsample=[0.8],
    reg_alpha=[0],
    colsample_bytree=[0.2],
    gamma=[0],
    max_depth=[5],
    min_child_weight=[4],
    n_estimators=[700],
    learning_rate=[0.05],
)
xgb5_clf = GridSearchCV(
    estimator=xgb,
    param_grid=param_test5,
    cv=5,
    scoring='neg_mean_squared_log_error',
)
xgb5_clf.fit(x_train, y)

# best_score_ comes from a 'neg_' scorer, so the sign is flipped before the square root.
xgb_cv_score = np.sqrt(-xgb5_clf.best_score_)
print('Best hyperparameter for xgboost:', xgb5_clf.best_estimator_)
print('Log score for best xgboost model:', xgb_cv_score)

In [None]:
# Store the final xgboost model's predictions as the 'xgb' stacking column.
# Predicting through the fitted search object delegates to its refit best estimator.
y_pre1 = xgb5_clf.predict(x_train).reshape(len(x_train))
y_pre2 = xgb5_clf.predict(x_test).reshape(len(x_test))
price_train['xgb'], price_test['xgb'] = y_pre1, y_pre2

### stack by Linear combination

In [None]:
# Fixed-weight blend of the test-set predictions: Lasso 0.7, Enet 0 (excluded), xgb 0.3.
blend = price_test['Lasso'] * 0.7
blend = blend + price_test['Enet'] * 0.
blend = blend + price_test['xgb'] * 0.3

# Apply exp(x) - 1 to the blend.
# NOTE(review): presumably the target was log1p-transformed upstream -- confirm.
preds = np.expm1(blend)

### submission file

In [None]:
# Assemble the submission table (test ids next to the blended predictions)
# and write it to disk without the DataFrame index.
# NOTE(review): the id column is written as "id", while test.csv exposes it as
# `Id` -- confirm which header the consumer of this file expects.
submission_columns = {"id": test_id, "SalePrice": preds}
solution = pd.DataFrame(submission_columns)
solution.to_csv("final_submission.csv", index=False)