In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

#Initial script

In [149]:
def readdata():
    """Load the competition CSV files from the current working directory.

    Reads "train.csv" and then "test.csv", printing the shape of each frame
    right after it has been loaded.

    Returns:
        tuple: ``(train, test)`` as pandas DataFrames.
    """
    # (file to read, message template printed once it is loaded)
    sources = (
        ("train.csv", 'Shape of train: {}'),
        ("test.csv", 'Shape of test: {}'),
    )
    loaded = []
    for path, message in sources:
        frame = pd.read_csv(path)
        print(message.format(frame.shape))
        loaded.append(frame)
    return tuple(loaded)

In [150]:
def preparedata():
    """Load the raw CSVs and turn them into model-ready matrices.

    Steps, in order:
      1. read train/test via ``readdata()``;
      2. replace ``SalePrice`` with ``log1p(SalePrice)`` and keep it as the target;
      3. stack train on top of test so imputation and encoding see the same
         category levels in both;
      4. impute missing values (constant 'Unknown', constant 0, neighbourhood
         median for ``LotFrontage``, column mode / fixed value for the rest);
      5. drop ``Utilities``;
      6. label-encode the columns in ``label_cols`` and one-hot encode every
         remaining object column;
      7. split the stacked frame back into its train and test rows.

    Returns:
        tuple: ``(train, test, target, test_ID)`` where ``train``/``test`` are
        the encoded feature frames, ``target`` is a numpy array holding the
        log1p-transformed sale prices and ``test_ID`` is the ``Id`` column of
        the test file.
    """
    train, test = readdata()
    print("Preparing data....")
    print("Log-transforming target....")
    train["SalePrice"] = np.log1p(train["SalePrice"])

    print("Combining datasets...")
    # Number of training rows; used at the end to split the stacked frame again.
    trainrow = train.shape[0]

    test_ID = test['Id']
    train = train.drop('Id', axis=1)
    test = test.drop('Id', axis=1)

    print("Saving target...")
    target = train.SalePrice.values

    all_data = pd.concat((train, test)).reset_index(drop=True)
    all_data = all_data.drop(['SalePrice'], axis=1)

    print("Combined datasize is : {}".format(all_data.shape))

    print("Filling Categorical NA's...")
    unknown_cols = ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'SaleType', 'MiscFeature',
                    'Alley', 'BsmtExposure', 'BsmtCond', 'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 'MSZoning',
                    'PoolQC', 'Fence', 'FireplaceQu')
    for col in unknown_cols:
        all_data[col] = all_data[col].fillna('Unknown')

    print("Filling Numerical NA's...")
    zero_cols = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath',
                 'BsmtFinSF1', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2')
    for col in zero_cols:
        all_data[col] = all_data[col].fillna(0)

    print("Imputing with median...")
    all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median()))

    print("Imputing with mode...")
    all_data["Functional"] = all_data["Functional"].fillna("Typ")
    # NOTE(review): 'SaleType' used to be mode-imputed here as well, but it is
    # already filled with 'Unknown' above, so that statement could never change
    # anything and has been removed. Confirm which imputation is intended.
    for col in ('Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

    print("Dropping features...")
    all_data = all_data.drop(['Utilities'], axis=1)

    print("Labelencoding Categorical Features...")
    label_cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                  'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                  'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                  'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
                  'YrSold', 'MoSold')
    for col in label_cols:
        # A fresh encoder per column: codes are only meaningful within a column.
        all_data[col] = LabelEncoder().fit_transform(list(all_data[col].values))

    print("One-hot Encoding Categorical Variables...")
    all_data = pd.get_dummies(all_data)

    print('Final shape of dataset: {}'.format(all_data.shape))
    print("Splitting dataset and returning train, test and target...")
    # The index was reset after stacking, so the first `trainrow` rows are train.
    train = all_data.iloc[:trainrow]
    test = all_data.iloc[trainrow:]

    return train, test, target, test_ID
    


In [161]:
def BayesXGB():
    """Search XGBoost hyper-parameters with Bayesian optimisation.

    Maximises ``xgb_evaluate`` over the bounds in ``search_space`` using
    10 initial points followed by 50 optimisation steps with the 'ei'
    acquisition function, printing the best score and parameters found.

    Returns:
        dict: the best parameter set, with ``max_depth`` and
        ``min_child_weight`` truncated to int and ``silent`` / ``eta`` added.
    """
    print("Performing Bayesian Optimization on XGB...")
    # (lower, upper) bound for every tuned parameter.
    search_space = {
        'max_depth': (3, 15),
        'gamma': (0, 5),
        'colsample_bytree': (0.3, 0.9),
        'min_child_weight': (0, 25),
        'subsample': (0.5, 1),
        'alpha': (0, 5),
    }
    xgb_bo = BayesianOptimization(xgb_evaluate, search_space)
    xgb_bo.maximize(init_points=10, n_iter=50, acq='ei')

    print("Identified optimal hyperparameters...")
    best = xgb_bo.res['max']
    print('Maximum value obtained: {}'.format(best['max_val']))
    params = best['max_params']
    print(params)

    # The optimiser samples floats; these two are cast the same way
    # xgb_evaluate casts them.
    for key in ('max_depth', 'min_child_weight'):
        params[key] = int(params[key])
    params.update(silent=1, eta=0.01)
    return params

In [162]:
def xgb_evaluate(max_depth, gamma, colsample_bytree, min_child_weight, subsample, alpha):
    """Objective for the XGBoost hyper-parameter search.

    Runs 5-fold ``xgb.cv`` on the notebook-level ``train`` / ``target`` with the
    given parameters (eta fixed at 0.01, up to 2000 rounds, early stopping
    after 100 rounds).

    Args:
        max_depth, min_child_weight: truncated to int before use.
        subsample, colsample_bytree: clipped into [0, 1].
        gamma, alpha: floored at 0.

    Returns:
        float: minus the last recorded 'test-rmse-mean' value.
    """
    def unit_clip(value):
        # Keep a fraction-type parameter inside [0, 1].
        return max(min(value, 1), 0)

    cv_params = {
        'eval_metric': 'rmse',
        'max_depth': int(max_depth),
        'subsample': unit_clip(subsample),
        'eta': 0.01,
        'gamma': max(gamma, 0),
        'alpha': max(alpha, 0),
        'colsample_bytree': unit_clip(colsample_bytree),
        'min_child_weight': int(min_child_weight),
        'silent': 1,
    }
    full_matrix = xgb.DMatrix(train, label=target)
    history = xgb.cv(cv_params, full_matrix, num_boost_round=2000, early_stopping_rounds=100, nfold=5)
    # BayesianOptimization can only maximise, not minimise, so the RMSE is negated.
    final_rmse = history['test-rmse-mean'].iloc[-1]
    return -1 * final_rmse
    

In [166]:
def trainxgb(params):
    """Train XGBoost on several random splits and average the test predictions.

    Uses the notebook-level ``train``, ``target`` and ``test``. For each of
    ``n_iters`` seeds an 80/20 train/validation split is drawn, a model is
    trained for up to 5000 rounds with early stopping (150 rounds) on the
    validation part, and the test set is predicted.

    Args:
        params (dict): XGBoost parameters, e.g. the result of ``BayesXGB()``.

    Returns:
        numpy.ndarray: per-row mean of the predictions of all models, mapped
        back with ``expm1`` (the inverse of the ``log1p`` that ``preparedata``
        applies to the target).
    """
    print("Training XGBoost with found parameters...")
    n_iters = 5
    xgb_preds = []

    # The test matrix does not depend on the split, so build it once.
    testxgb = xgb.DMatrix(test)

    for seed in range(n_iters):
        X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.20, random_state=seed)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        xgb_model = xgb.train(params, dtrain, 5000, watchlist, early_stopping_rounds=150, verbose_eval=500)

        # xgb.train returns the booster from the LAST round, which is
        # `early_stopping_rounds` rounds past the best one. Releases that expose
        # `best_ntree_limit` need it passed explicitly to predict at the best
        # round; releases without that attribute are left on their default.
        best_limit = getattr(xgb_model, 'best_ntree_limit', None)
        if best_limit:
            log_preds = xgb_model.predict(testxgb, ntree_limit=best_limit)
        else:
            log_preds = xgb_model.predict(testxgb)

        # expm1 is the exact inverse of the log1p applied to the target.
        xgb_preds.append(np.expm1(log_preds))

    print("Finished training and predicting...")
    return np.mean(xgb_preds, axis=0)

In [154]:
def submission(pred):
    """Write an XGBoost-only submission file to 'submission.csv'.

    NOTE: a later cell redefines ``submission`` with two arguments (blended
    LightGBM/XGBoost predictions); after that cell has run, this
    single-argument version is shadowed.

    Args:
        pred: either a DataFrame with an 'xgbpreds' column, or a 1-D array-like
            of predicted sale prices such as the array returned by
            ``trainxgb`` (the original code indexed ``pred['xgbpreds']``
            unconditionally, which fails on a plain numpy array).

    The ``Id`` column comes from the notebook-level ``test_ID``.
    """
    if isinstance(pred, pd.DataFrame):
        sale_price = pred['xgbpreds']
    else:
        sale_price = np.ravel(np.asarray(pred))

    submissions = pd.DataFrame()
    submissions['Id'] = test_ID
    submissions['SalePrice'] = sale_price
    submissions.to_csv('submission.csv', index=False)

In [230]:
def lgb_evaluate(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective for the LightGBM hyper-parameter search.

    Runs 5-fold, non-stratified ``lgb.cv`` on the notebook-level ``train`` /
    ``target`` with the given parameters (learning rate fixed at 0.01, 2000
    iterations, early stopping round 100).

    Args:
        num_leaves, max_depth: truncated to int before use.
        feature_fraction, bagging_fraction: clipped into [0, 1].
        lambda_l1, lambda_l2: floored at 0.
        min_split_gain, min_child_weight: passed through unchanged.

    Returns:
        float: minus the last 'rmse-mean' value of the CV history.
    """
    def unit_clip(value):
        # Keep a fraction-type parameter inside [0, 1].
        return max(min(value, 1), 0)

    # NOTE(review): no 'bagging_freq' is set here; LightGBM documents that
    # bagging_fraction only takes effect with a non-zero bagging_freq, so the
    # search may be tuning an inactive parameter -- confirm intent.
    cv_params = {
        'application': 'regression_l2',
        'num_iterations': 2000,
        'learning_rate': 0.01,
        'early_stopping_round': 100,
        'metric': 'rmse',
        'silent': 1,
        "num_leaves": int(num_leaves),
        'feature_fraction': unit_clip(feature_fraction),
        'bagging_fraction': unit_clip(bagging_fraction),
        'max_depth': int(max_depth),
        'lambda_l1': max(lambda_l1, 0),
        'lambda_l2': max(lambda_l2, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
    }
    full_set = lgb.Dataset(train, label=target)
    history = lgb.cv(cv_params, full_set, nfold=5, stratified=False, metrics=['rmse'])
    # The optimiser maximises, so return the negated RMSE.
    return -1 * history['rmse-mean'][-1]

In [184]:
def lgbBayes():
    """Search LightGBM hyper-parameters with Bayesian optimisation.

    Maximises ``lgb_evaluate`` over the bounds in ``search_space`` using
    10 initial points followed by 50 optimisation steps with the 'ei'
    acquisition function, printing the best score and parameters found.

    Returns:
        dict: a copy of the best parameter set with ``num_leaves`` and
        ``max_depth`` truncated to int (as ``lgb_evaluate`` does) and
        ``metric`` / ``learning_rate`` added. ``min_child_weight`` is returned
        exactly as tuned: ``lgb_evaluate`` passes it to LightGBM as a float, so
        truncating it here would train with a different value than the one
        that was scored.
    """
    print("Performing Bayesian Optimization on LGB...")
    # (lower, upper) bound for every tuned parameter.
    search_space = {
        'num_leaves': (3, 25),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.1, 0.9),
        'max_depth': (3, 15),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 3),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (1, 25),
    }
    lgb_bo = BayesianOptimization(lgb_evaluate, search_space)
    lgb_bo.maximize(init_points=10, n_iter=50, acq='ei')

    print("Identified optimal hyperparameters...")
    best = lgb_bo.res['max']
    print("Maximum value obtained: {}".format(best['max_val']))
    print(best['max_params'])

    # Work on a copy so the optimiser's stored result is left untouched.
    params = dict(best['max_params'])
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
    params['metric'] = 'rmse'
    params['learning_rate'] = 0.01
    return params

In [203]:
def trainlgb(params):
    """Train LightGBM on several random splits and average the test predictions.

    Uses the notebook-level ``train``, ``target`` and ``test``. For each of
    ``n_iters`` seeds an 80/20 train/validation split is drawn, a model is
    trained for up to 5000 rounds with early stopping (150 rounds) on the
    validation part, and the test set is predicted.

    Args:
        params (dict): LightGBM parameters, e.g. the result of ``lgbBayes()``.

    Returns:
        numpy.ndarray: per-row mean of the predictions of all models, mapped
        back with ``expm1`` (the inverse of the ``log1p`` that ``preparedata``
        applies to the target).
    """
    print("Training LightGBM with found parameters...")
    n_iters = 5
    lgb_preds = []

    for seed in range(n_iters):
        X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.20, random_state=seed)

        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

        lgb_model = lgb.train(params, dtrain, 5000, valid_sets=dvalid, early_stopping_rounds=150, verbose_eval=500)

        # Predict at the early-stopped best iteration explicitly rather than
        # relying on the library default, which differs between releases.
        # A non-positive best_iteration means "use all iterations".
        log_preds = lgb_model.predict(test, num_iteration=lgb_model.best_iteration)

        # expm1 is the exact inverse of the log1p applied to the target.
        lgb_preds.append(np.expm1(log_preds))

    print("Finished training and predicting...")
    return np.mean(lgb_preds, axis=0)

In [221]:
def combinepreds(lgb_preds, xgb_preds):
    """Put both models' predictions side by side and add their row-wise mean.

    Args:
        lgb_preds: 1-D array-like of LightGBM predictions.
        xgb_preds: 1-D array-like of XGBoost predictions (same length).

    Returns:
        pandas.DataFrame: columns 'LGB Preds', 'XGB Preds' and 'mean'.
    """
    model_columns = ['LGB Preds', 'XGB Preds']
    stacked = np.column_stack([lgb_preds, xgb_preds])
    blended = pd.DataFrame(stacked, columns=model_columns)
    # Computed before the 'mean' column exists, so only the two model columns
    # take part in the average.
    row_mean = blended.mean(axis=1)
    blended['mean'] = row_mean
    return blended

In [226]:
def submission(lgb_preds, xgb_preds):
    """Write the blended submission file to 'submission.csv'.

    The two prediction vectors are combined with ``combinepreds`` and the
    resulting 'mean' column is written as ``SalePrice`` next to the
    notebook-level ``test_ID`` (written as ``Id``). The row index is not
    written to the file.

    Args:
        lgb_preds: LightGBM predictions for the test rows.
        xgb_preds: XGBoost predictions for the test rows.
    """
    blended = combinepreds(lgb_preds, xgb_preds)
    output = pd.DataFrame()
    for column, values in (('Id', test_ID), ('SalePrice', blended['mean'])):
        output[column] = values
    output.to_csv('submission.csv', index=False)

In [None]:
# Build the notebook-level globals read by the functions above: `train`/`test`
# feature frames, the `target` array and the `test_ID` column.
train, test, target, test_ID = preparedata()

In [None]:
# Tune XGBoost, then train with the tuned parameters and predict the test set.
# NOTE(review): the global `params` is reassigned by the LightGBM cell; rerun
# this cell before calling trainxgb(params) again.
params = BayesXGB()
xgbpreds = trainxgb(params)

In [None]:
# Tune LightGBM, then train with the tuned parameters and predict the test set.
# NOTE(review): this overwrites the global `params` that held the XGBoost settings.
params = lgbBayes()
lgbpreds = trainlgb(params)

In [None]:
# Blend both models' predictions and write 'submission.csv'.
submission(lgbpreds, xgbpreds)

In [199]:
# Executed run of the data-preparation step; it (re)binds the globals
# `train`, `test`, `target` and `test_ID`.
train, test, target, test_ID = preparedata()

Shape of train: (1460, 81)
Shape of test: (1459, 80)
Preparing data....
Log-transforming target....
Combining datasets...
Saving target...
Combined datasize is : (2919, 79)
Filling Categorical NA's...
Filling Numerical NA's...
Imputing with median...
Imputing with mode...
Dropping features...
Labelencoding Categorical Features...
One-hot Encoding Categorical Variables...
Final shape of dataset: (2919, 223)
Splitting dataset and returning train, test and target...


In [164]:
# Executed run of the XGBoost search; binds the global `params` to the best
# XGBoost parameter set.
params = BayesXGB()

Performing Bayesian Optimization on XGB...
[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
    1 | 00m58s | [35m  -0.16914[0m | [32m   1.8731[0m | [32m            0.4734[0m | [32m   1.6143[0m | [32m     6.9588[0m | [32m           19.8711[0m | [32m     0.6138[0m | 
    2 | 00m45s |   -0.19394 |    4.1035 |             0.4574 |    3.6991 |      7.1551 |             6.7040 |      0.9706 | 
    3 | 01m14s | [35m  -0.16686[0m | [32m   2.3092[0m | [32m            0.5242[0m | [32m   1.6011[0m | [32m     7.3190[0m | [32m           13.9814[0m | [32m     0.8360[0m | 
    4 | 01m39s |   -0.18881 |    2.4176 |             0.8304 |    3.4421 |     11.8612 |             5.1439 |      0.9308 | 
    5 | 00m51s |   -0.18837 |    2.7026 |        

   58 | 00m49s |   -0.12544 |    0.2156 |             0.8553 |    0.0329 |      3.0088 |            17.6853 |      0.9953 | 
   59 | 01m31s |   -0.12214 |    0.0113 |             0.3753 |    0.0160 |      8.8144 |            16.9244 |      0.8954 | 
   60 | 00m54s |   -0.12018 |    0.4105 |             0.3337 |    0.0045 |      6.5792 |             2.2545 |      0.5691 | 
Identified optimal hyperparameters...
Maximum value obtained: -0.1195046
{'max_depth': 8.27734097391035, 'gamma': 0.026024100201859635, 'colsample_bytree': 0.4500518255799414, 'min_child_weight': 0.39972572149608865, 'subsample': 0.5002013845312789, 'alpha': 0.006207801936168877}


In [168]:
# Train XGBoost with the parameters currently bound to `params`.
# NOTE(review): `params` must still hold the BayesXGB() result when this runs.
xgbpreds = trainxgb(params)

Training XGBoost with found parameters...
[0]	train-rmse:11.4152	valid-rmse:11.4182
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 150 rounds.
[500]	train-rmse:0.122318	valid-rmse:0.159717
[1000]	train-rmse:0.058639	valid-rmse:0.123772
[1500]	train-rmse:0.053118	valid-rmse:0.122553
Stopping. Best iteration:
[1842]	train-rmse:0.051636	valid-rmse:0.122243

[0]	train-rmse:11.4254	valid-rmse:11.3782
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 150 rounds.
[500]	train-rmse:0.123251	valid-rmse:0.158942
[1000]	train-rmse:0.058032	valid-rmse:0.121575
[1500]	train-rmse:0.052576	valid-rmse:0.12016
[2000]	train-rmse:0.050404	valid-rmse:0.119933
Stopping. Best iteration:
[2283]	train-rmse:0.04954	valid-rmse:0.119712

[0]	train-rmse:11.4118	valid-rmse:11.4333
Multiple eval metrics have been passed: 'valid-rmse' will be used f

In [185]:
# Executed run of the LightGBM search; rebinds the global `params`, replacing
# the XGBoost parameter set.
params = lgbBayes()

Performing Bayesian Optimization on LGB...
[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   bagging_fraction |   feature_fraction |   lambda_l1 |   lambda_l2 |   max_depth |   min_child_weight |   min_split_gain |   num_leaves | 
    1 | 00m02s | [35m  -0.14820[0m | [32m            0.8176[0m | [32m            0.6552[0m | [32m     4.3148[0m | [32m     2.8594[0m | [32m     7.1991[0m | [32m           16.5535[0m | [32m          0.0886[0m | [32m      8.7493[0m | 
    2 | 00m02s | [35m  -0.14557[0m | [32m            0.2087[0m | [32m            0.1333[0m | [32m     4.2770[0m | [32m     0.2219[0m | [32m     7.9322[0m | [32m           13.1032[0m | [32m          0.0395[0m | [32m     17.1749[0m | 
    3 | 00m03s | [35m  -0.14112[0m | [32m            0.3054[0m | [32m         

   42 | 00m19s |   -0.12572 |             0.5248 |             0.2173 |      0.1155 |      1.4602 |     14.9314 |             9.7043 |           0.0273 |      24.4330 | 
   43 | 00m20s |   -0.12630 |             0.4075 |             0.1046 |      0.1544 |      2.3286 |      4.0143 |             1.0172 |           0.0041 |      11.7804 | 
   44 | 00m30s |   -0.12375 |             0.1490 |             0.1667 |      0.0269 |      0.3290 |     14.5353 |             6.6296 |           0.0052 |      22.5266 | 
   45 | 00m18s |   -0.12852 |             0.3073 |             0.1044 |      0.0761 |      0.0111 |      3.3783 |             3.9546 |           0.0501 |      17.5399 | 
   46 | 00m29s |   -0.12668 |             0.2499 |             0.8034 |      0.0054 |      1.7909 |      8.6878 |            10.4835 |           0.0031 |      18.4304 | 
   47 | 00m30s |   -0.12366 |             0.5710 |             0.2162 |      0.0216 |      0.2683 |     10.1743 |             1.9812 |           0.010

In [204]:
# Train LightGBM with the parameters currently bound to `params`.
# NOTE(review): `params` must hold the lgbBayes() result when this runs.
lgbpreds = trainlgb(params)

Training LightGBM with found parameters...
Training until validation scores don't improve for 150 rounds.
[500]	valid_0's rmse: 0.134223
[1000]	valid_0's rmse: 0.130256
Early stopping, best iteration is:
[978]	valid_0's rmse: 0.130229
Training until validation scores don't improve for 150 rounds.
[500]	valid_0's rmse: 0.130828
[1000]	valid_0's rmse: 0.12613
Early stopping, best iteration is:
[1039]	valid_0's rmse: 0.126091
Training until validation scores don't improve for 150 rounds.
[500]	valid_0's rmse: 0.134153
[1000]	valid_0's rmse: 0.129058
Early stopping, best iteration is:
[1098]	valid_0's rmse: 0.129017
Training until validation scores don't improve for 150 rounds.
[500]	valid_0's rmse: 0.121448
[1000]	valid_0's rmse: 0.118543
Early stopping, best iteration is:
[909]	valid_0's rmse: 0.118413
Training until validation scores don't improve for 150 rounds.
[500]	valid_0's rmse: 0.113089
[1000]	valid_0's rmse: 0.107368
Early stopping, best iteration is:
[1147]	valid_0's rmse: 0.10