In [168]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline



In [169]:
def readdata(train_path="train.csv", test_path="test.csv"):
    """Load the training and test tables from CSV and print their shapes.

    Parameters
    ----------
    train_path : str or path-like, default "train.csv"
        CSV file holding the training rows.
    test_path : str or path-like, default "test.csv"
        CSV file holding the test rows.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame, as parsed by
        ``pd.read_csv`` with default options.
    """
    train = pd.read_csv(train_path)
    print('Shape of train: {}'.format(train.shape))
    test = pd.read_csv(test_path)
    print('Shape of test: {}'.format(test.shape))
    return train, test

In [170]:
# Load the raw train/test frames into module-level names (readdata also prints their shapes).
train, test = readdata()

Shape of train: (1460, 81)
Shape of test: (1459, 80)


In [171]:
def preparedata():
    """Read the raw CSVs and build model-ready feature matrices.

    Steps, in order:

    1. Log-transform ``SalePrice`` with ``np.log1p`` and keep it as the target.
    2. Drop ``Id`` from both frames (the test ids are kept for the submission)
       and stack train on top of test so both receive identical encoding.
    3. Fill missing values: ``'Unknown'`` for the listed categorical columns,
       ``0`` for the listed numerical columns, the per-``Neighborhood`` median
       for ``LotFrontage``, ``'Typ'`` for ``Functional`` and the column mode
       for the remaining listed categoricals.
    4. Drop ``Utilities``.
    5. Label-encode the columns in ``catcols`` and one-hot encode whatever
       object columns remain via ``pd.get_dummies``.
    6. Split the stacked frame back into its train and test parts.

    Returns
    -------
    train : pandas.DataFrame
        Encoded features for the training rows.
    test : pandas.DataFrame
        Encoded features for the test rows (index continues after train).
    target : numpy.ndarray
        ``log1p(SalePrice)`` for the training rows.
    test_ID : pandas.Series
        The ``Id`` column of the raw test frame.
    """
    train, test = readdata()
    print("Preparing data....")
    print("Log-transforming target....")
    train["SalePrice"] = np.log1p(train["SalePrice"])

    print("Combining datasets...")
    trainrow = train.shape[0]

    # Only the test ids are needed later (for the submission file).
    test_ID = test['Id']
    train = train.drop('Id', axis=1)
    test = test.drop('Id', axis=1)

    print("Saving target...")
    target = train.SalePrice.values

    all_data = pd.concat((train, test)).reset_index(drop=True)
    all_data = all_data.drop(['SalePrice'], axis=1)

    print("Combined datasize is : {}".format(all_data.shape))

    print("Filling Categorical NA's...")
    unknown_fill_cols = (
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual',
        'SaleType', 'MiscFeature', 'Alley', 'BsmtExposure', 'BsmtCond',
        'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 'MSZoning', 'PoolQC',
        'Fence', 'FireplaceQu',
    )
    for col in unknown_fill_cols:
        all_data[col] = all_data[col].fillna('Unknown')

    print("Filling Numerical NA's...")
    # 'BsmtFinSF1' used to appear twice in this list; filling once is enough.
    zero_fill_cols = (
        'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
        'BsmtHalfBath', 'BsmtFullBath', 'BsmtFinSF1', 'TotalBsmtSF',
        'BsmtUnfSF', 'BsmtFinSF2',
    )
    for col in zero_fill_cols:
        all_data[col] = all_data[col].fillna(0)

    print("Imputing with median...")
    all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )

    print("Imputing with mode...")
    all_data["Functional"] = all_data["Functional"].fillna("Typ")
    # 'SaleType' is intentionally absent here: it has already been filled with
    # 'Unknown' above, so the mode imputation that used to follow never
    # changed anything.
    for col in ('Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd'):
        all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

    print("Dropping features...")
    all_data = all_data.drop(['Utilities'], axis=1)

    print("Labelencoding Categorical Features...")
    catcols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')
    for c in catcols:
        # A fresh encoder per column, fitted on train+test together so both
        # parts share the same integer codes.
        all_data[c] = LabelEncoder().fit_transform(list(all_data[c].values))

    print("One-hot Encoding Categorical Variables...")
    all_data = pd.get_dummies(all_data)

    print('Final shape of dataset: {}'.format(all_data.shape))
    print("Splitting dataset and returning train, test and target...")
    train = all_data[:trainrow]
    test = all_data[trainrow:]

    return train, test, target, test_ID
    


In [172]:
# Rebinds `train`/`test` to the encoded feature matrices returned by preparedata();
# `target` is log1p(SalePrice) and `test_ID` holds the raw test ids.
train, test, target, test_ID = preparedata()

Shape of train: (1460, 81)
Shape of test: (1459, 80)
Preparing data....
Log-transforming target....
Combining datasets...
Saving target...
Combined datasize is : (2919, 79)
Filling Categorical NA's...
Filling Numerical NA's...
Imputing with median...
Imputing with mode...
Dropping features...
Labelencoding Categorical Features...
One-hot Encoding Categorical Variables...
Final shape of dataset: (2919, 223)
Splitting dataset and returning train, test and target...


In [173]:
# Sanity check: dimensions of the encoded training features.
train.shape

(1460, 223)

In [174]:
# Sanity check: dimensions of the encoded test features.
test.shape

(1459, 223)

In [175]:
# Sanity check: one target value per training row.
target.shape

(1460,)

In [188]:
def BayesXGB(init_points=2, n_iter=5, random_state=None):
    """Tune XGBoost hyperparameters with Bayesian optimization.

    An 80% split of the module-level ``train``/``target`` is turned into a
    DMatrix, and the optimizer maximizes the negated 5-fold CV RMSE measured
    on that matrix. The objective is defined locally so that it is bound to
    the data prepared here rather than to whichever ``dtrain`` happens to be
    defined at module level.

    Parameters
    ----------
    init_points : int, default 2
        Forwarded to ``BayesianOptimization.maximize`` as ``init_points``.
    n_iter : int, default 5
        Forwarded to ``BayesianOptimization.maximize`` as ``n_iter``.
    random_state : int or None, default None
        Forwarded to ``train_test_split`` only; it seeds the 80/20 split, not
        the optimizer itself.

    Returns
    -------
    dict
        The best parameters found, with ``max_depth`` and
        ``min_child_weight`` truncated to ``int``.
    """
    print("Preparing dataset for Bayesian optimization of XGBoost hyperparameters...")
    # Only the 80% part is used; the held-out 20% is discarded.
    X_train, _, y_train, _ = train_test_split(
        train, target, test_size=0.20, random_state=random_state
    )
    search_dtrain = xgb.DMatrix(X_train, label=y_train)

    def _negative_cv_rmse(max_depth, gamma, colsample_bytree, min_child_weight, subsample, alpha):
        # Same parameter handling as xgb_evaluate, but cross-validating on the
        # matrix built above instead of a module-level one.
        cv_params = {
            'eval_metric': 'rmse',
            'max_depth': int(max_depth),
            'subsample': max(min(subsample, 1), 0),
            'eta': 0.1,
            'gamma': max(gamma, 0),
            'alpha': max(alpha, 0),
            'colsample_bytree': max(min(colsample_bytree, 1), 0),
            'min_child_weight': int(min_child_weight),
        }
        cv_result = xgb.cv(cv_params, search_dtrain, num_boost_round=1000,
                           early_stopping_rounds=100, nfold=5)
        # The optimizer maximizes, so the RMSE is negated.
        return -1 * cv_result['test-rmse-mean'].iloc[-1]

    print("Performing Bayesian Optimization...")
    xgb_bo = BayesianOptimization(_negative_cv_rmse, {'max_depth': (3, 15),
                                                      'gamma': (0, 5),
                                                      'colsample_bytree': (0.3, 0.9),
                                                      'min_child_weight': (0, 25),
                                                      'subsample': (0.5, 1),
                                                      'alpha': (0, 5)
                                                      })
    # NOTE(review): `maximize(acq=...)` and `res['max']` are the calls this
    # notebook was executed with; confirm they match the installed bayes_opt.
    xgb_bo.maximize(init_points=init_points, n_iter=n_iter, acq='ei')

    best = xgb_bo.res['max']
    print(best)
    # Copy so the integer casts below do not modify the optimizer's own record.
    params = dict(best['max_params'])
    params['max_depth'] = int(params['max_depth'])
    params['min_child_weight'] = int(params['min_child_weight'])
    return params

In [176]:
# Hold out 20% of the training rows. No random_state is passed, so the split changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.20)
# `dtrain` is the module-level matrix that xgb_evaluate cross-validates on.
dtrain =xgb.DMatrix(X_train, label=y_train)
# NOTE(review): `dtest` is not referenced by any other visible cell.
dtest = xgb.DMatrix(X_test)

In [177]:
def xgb_evaluate(max_depth, gamma, colsample_bytree, min_child_weight, subsample, alpha):
    """Objective for the Bayesian search: negated 5-fold CV RMSE.

    The sampled values are coerced into valid XGBoost settings
    (``max_depth`` and ``min_child_weight`` truncated to int, the two
    sampling ratios clamped to [0, 1], ``gamma`` and ``alpha`` floored at 0),
    then ``xgb.cv`` is run on the module-level ``dtrain`` with ``eta=0.1``.

    Returns
    -------
    float
        ``-1 *`` the value in the last row of the CV table's
        ``test-rmse-mean`` column.
    """
    def _clamp_to_unit(value):
        # Keep a ratio inside the closed interval [0, 1].
        return max(min(value, 1), 0)

    booster_params = dict(
        eval_metric='rmse',
        max_depth=int(max_depth),
        subsample=_clamp_to_unit(subsample),
        eta=0.1,
        gamma=max(gamma, 0),
        alpha=max(alpha, 0),
        colsample_bytree=_clamp_to_unit(colsample_bytree),
        min_child_weight=int(min_child_weight),
    )
    cv_table = xgb.cv(
        booster_params,
        dtrain,
        num_boost_round=1000,
        early_stopping_rounds=100,
        nfold=5,
    )
    # BayesianOptimization can only maximize, not minimize, so the RMSE is negated.
    last_rmse = cv_table['test-rmse-mean'].iloc[-1]
    return -1 * last_rmse
    

In [178]:
# Optimizer over xgb_evaluate; each entry is the (lower, upper) bound searched for that parameter.
xgb_bo = BayesianOptimization(
    xgb_evaluate,
    {
        'max_depth': (3, 15),
        'gamma': (0, 5),
        'colsample_bytree': (0.3, 0.9),
        'min_child_weight': (0, 25),
        'subsample': (0.5, 1),
        'alpha': (0, 5),
    },
)

In [None]:
# Run the packaged search; this rebinds the module-level `params` to its result.
params = BayesXGB()

Preparing dataset for Bayesian optimization of XGBoost hyperparameters...
Performing Bayesian Optimization...
[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
    1 | 00m12s | [35m  -0.18731[0m | [32m   0.3266[0m | [32m            0.6059[0m | [32m   2.5303[0m | [32m     6.5262[0m | [32m           16.4276[0m | [32m     0.9954[0m | 
    2 | 00m17s |   -0.20663 |    4.3918 |             0.7762 |    3.3778 |     14.8957 |            12.1164 |      0.6624 | 


In [179]:
# Full search: 10 initialization points followed by 50 optimization iterations using the 'ei' acquisition.
xgb_bo.maximize(init_points=10, n_iter=50, acq = 'ei')

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
    1 | 00m58s | [35m  -0.18050[0m | [32m   1.9113[0m | [32m            0.4778[0m | [32m   1.9718[0m | [32m    14.4089[0m | [32m            1.7665[0m | [32m     0.7602[0m | 
    2 | 00m14s |   -0.21009 |    1.6834 |             0.3393 |    4.6990 |      6.6876 |             2.1434 |      0.8696 | 
    3 | 00m27s | [35m  -0.17921[0m | [32m   3.0821[0m | [32m            0.3889[0m | [32m   1.7865[0m | [32m    14.8938[0m | [32m           18.6927[0m | [32m     0.8969[0m | 
    4 | 00m13s |   -0.18707 |    2.8926 |             0.5000 |    2.1073 |      7.0058 |            20.1832 |      0.6816 | 
    5 | 00m17s |   -0.20311 |    0.2802 |             0.6106 |    4.4314 |     14.8441 |    

   59 | 00m12s |   -0.12802 |    0.3346 |             0.3094 |    0.0041 |      3.0853 |             1.9297 |      0.7528 | 
   60 | 00m08s |   -0.12874 |    0.1203 |             0.3495 |    0.0217 |      3.0514 |            11.2338 |      0.8327 | 


In [180]:
# Best objective value found so far and the parameters that produced it.
xgb_bo.res['max']

{'max_val': -0.12594660000000002,
 'max_params': {'max_depth': 3.0453557484746963,
  'gamma': 0.0,
  'colsample_bytree': 0.3009097674964732,
  'min_child_weight': 15.0997079744375,
  'subsample': 0.5000000001922376,
  'alpha': 1.3950466145869655e-10}}

In [181]:
# Take the best parameter dict from the optimizer (no copy is made here).
params = (xgb_bo.res['max']['max_params'])

In [182]:
# Truncate the two tree-structure parameters to integers, as xgb_evaluate does during the search.
# NOTE(review): `params` was assigned without a copy, so this may also modify the dict held by `xgb_bo` — confirm.
params['max_depth'] = int(params['max_depth'])
params['min_child_weight'] = int(params['min_child_weight'])

In [183]:
# Display the final parameter dict that is passed to training.
params

{'max_depth': 3,
 'gamma': 0.0,
 'colsample_bytree': 0.3009097674964732,
 'min_child_weight': 15,
 'subsample': 0.5000000001922376,
 'alpha': 1.3950466145869655e-10}

In [None]:
def trainxgb(params, n_iters=5):
    """Train XGBoost on several seeded 80/20 splits and average the test predictions.

    For each seed ``0 .. n_iters - 1`` the module-level ``train``/``target``
    are split 80/20, a booster is trained on the 80% part with early stopping
    monitored on the 20% part, and the module-level ``test`` features are
    scored. Predictions are mapped back from the ``log1p`` scale with
    ``np.expm1`` and averaged across seeds.

    Parameters
    ----------
    params : dict
        XGBoost parameters. The dict is not modified. If it has no ``'eta'``
        entry, ``0.1`` is used so that training runs at the same learning
        rate as the hyperparameter search in ``xgb_evaluate``.
    n_iters : int, default 5
        Number of seeded splits to train on; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        A single column ``'xgbpreds'`` holding the averaged predictions, one
        row per row of ``test``.

    Raises
    ------
    ValueError
        If ``n_iters`` is smaller than 1.
    """
    if n_iters < 1:
        raise ValueError("n_iters must be at least 1, got {}".format(n_iters))

    # Work on a copy so the caller's dict stays untouched.
    train_params = dict(params)
    train_params.setdefault('eta', 0.1)

    # The test matrix does not depend on the split, so build it once.
    testxgb = xgb.DMatrix(test)

    xgb_preds = []
    for seed in range(n_iters):
        X_train, X_test, y_train, y_test = train_test_split(
            train, target, test_size=0.20, random_state=seed
        )

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

        xgb_model = xgb.train(train_params, dtrain, 2000, watchlist,
                              early_stopping_rounds=150, verbose_eval=200)
        # NOTE(review): predict is called without an explicit tree limit;
        # whether it uses the best or the last iteration depends on the
        # installed XGBoost version — confirm.
        preds = xgb_model.predict(testxgb)
        # Exact inverse of the log1p applied to the target.
        xgb_preds.append(np.expm1(preds))

    # Cast so the column is 64-bit float regardless of the booster's output dtype.
    mean_preds = np.mean(xgb_preds, axis=0).astype(np.float64)
    predictions = pd.DataFrame({'xgbpreds': mean_preds})

    return predictions
    

In [184]:
# Train on five seeded 80/20 splits and collect the test predictions of each model.
# NOTE(review): `params` carries no 'eta', so these models do not train at the
# eta=0.1 that xgb_evaluate used during the search — confirm this is intended.
n_iters = 5
xgb_preds = []
# The test matrix does not depend on the split, so build it once.
testxgb = xgb.DMatrix(test)
for i in range(n_iters):
    X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.20, random_state = i)

    # Named differently from the module-level `dtrain` so that the matrix
    # xgb_evaluate cross-validates on is not overwritten by this loop.
    dtrain_fold = xgb.DMatrix(X_train, label = y_train)
    dvalid = xgb.DMatrix(X_test, label = y_test)
    watchlist = [(dtrain_fold, 'train'), (dvalid, 'valid')]

    xgb_model = xgb.train(params, dtrain_fold, 2000, watchlist, early_stopping_rounds = 150, verbose_eval = 200)
    preds = xgb_model.predict(testxgb)
    # Exact inverse of the log1p applied to the target.
    preds = np.expm1(preds)
    xgb_preds.append(preds)

[0]	train-rmse:8.07528	valid-rmse:8.07813
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 150 rounds.
[200]	train-rmse:0.067258	valid-rmse:0.149589
[400]	train-rmse:0.043806	valid-rmse:0.146474
[600]	train-rmse:0.030539	valid-rmse:0.146779
Stopping. Best iteration:
[520]	train-rmse:0.035225	valid-rmse:0.145243

[0]	train-rmse:8.09094	valid-rmse:8.04411
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 150 rounds.
[200]	train-rmse:0.06344	valid-rmse:0.145283
Stopping. Best iteration:
[53]	train-rmse:0.105892	valid-rmse:0.138679

[0]	train-rmse:8.08045	valid-rmse:8.10217
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 150 rounds.
[200]	train-rmse:0.067126	valid-rmse:0.13787
Stopping. Best iteration:
[137]	train-rmse:0.079739	valid-rmse:

In [185]:
# Average the per-split predictions element-wise into a single column.
# The cast keeps the column as 64-bit floats regardless of the booster's output dtype.
predictions = pd.DataFrame({'xgbpreds': np.mean(xgb_preds, axis=0).astype(np.float64)})

In [186]:
# Preview the first averaged predictions.
predictions.head()

Unnamed: 0,xgbpreds
0,118745.140625
1,156308.4375
2,186033.015625
3,194516.6875
4,187035.828125


In [187]:
# Build the submission table (test ids plus averaged predictions) and write it to disk.
submission = pd.DataFrame()
submission['Id'] = test_ID
submission["SalePrice"] = predictions['xgbpreds']
submission.to_csv("submission.csv", index=False)
# Last expression of the cell so the preview is actually displayed.
submission.head()

In [None]:
def submission(pred, ids=None, path='submission.csv'):
    """Write a two-column submission CSV (``Id``, ``SalePrice``).

    Parameters
    ----------
    pred : pandas.DataFrame
        Must contain an ``'xgbpreds'`` column with the predicted prices.
    ids : pandas.Series or sequence, optional
        Values for the ``Id`` column. Defaults to the module-level
        ``test_ID``.
    path : str or path-like, default 'submission.csv'
        Destination of the CSV file.

    Returns
    -------
    None
        The function only writes the file.
    """
    if ids is None:
        # Fall back to the ids captured during data preparation.
        ids = test_ID
    frame = pd.DataFrame()
    frame['Id'] = ids
    frame['SalePrice'] = pred['xgbpreds']
    frame.to_csv(path, index=False)