# 8. XGBoost Regression

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
%matplotlib inline
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Process Normalized Data Without PCA

### Load train and test data

In [2]:
# Locations of the normalized (non-PCA) train and test tables.
train_file = 'house-prices-advanced-regression-techniques/train_normalized.csv'
test_file = 'house-prices-advanced-regression-techniques/test_normalized.csv'

# Read both CSV files into DataFrames (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

### Train model on the train data and predict the test data

In [3]:
# Separate the SalePrice target from the remaining feature columns.
Y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

# Baseline regressor with hand-picked settings (no hyper-parameter search).
xg_reg = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg.fit(X, Y)

# Predict on the test table, then show the predictions followed by their shape.
Y_pred = xg_reg.predict(test_df)
print(Y_pred, Y_pred.shape, sep='\n')

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[ 88451.52   98591.34  118883.445 ...  99614.445  90337.02  159245.05 ]
(1459,)


### Save results

In [4]:
# The Id column is taken from the original (unprocessed) test file.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Place Id and the predicted SalePrice side by side, then write the CSV
# without the DataFrame index.
predicted_prices = pd.DataFrame(Y_pred, columns=['SalePrice'])
result_df = pd.concat([id_col, predicted_prices], axis='columns')
result_df.to_csv('XGBoost_reg_WO_PCA.csv', index=False)

## Improving the results with a randomized hyper-parameter search

### Hyper-parameters and their candidate values, used as input for the search

In [5]:
# Search space for the XGBoost hyper-parameter search: each key is an
# XGBRegressor parameter name, each value the list of candidates to try.
params = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],              # 0.3 .. 0.5
    'subsample': [i / 10.0 for i in range(6, 11)],         # 0.6 .. 1.0
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],  # 0.6 .. 1.0
    'max_depth': [2, 3, 4, 5, 6],
    # The last candidate is 0.2 (decimal point). Written as `0,2` it would add
    # the two separate candidates 0 and 2 to the search space.
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'n_estimators': [1, 10, 50, 100],
}

### Initialize XGBoost and run RandomizedSearchCV

In [6]:
# Features and target for the hyper-parameter search.
X = train_df.drop(['SalePrice'], axis=1)
Y = train_df['SalePrice']

# Use a dedicated name for the estimator: rebinding `xgb` here would shadow the
# `import xgboost as xgb` module alias for the rest of the notebook.
xgb_estimator = XGBRegressor(objective='reg:squarederror')

# RandomizedSearchCV samples candidate settings from `params` rather than
# trying every combination. A fixed random_state makes the sampled candidates,
# and therefore best_params_, repeatable across runs.
grid = RandomizedSearchCV(xgb_estimator, params, random_state=42)
grid.fit(X, Y)
print(grid.best_params_)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

{'subsample': 1.0, 'n_estimators': 50, 'min_child_weight': 4, 'max_depth': 2, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 0.9}


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


### Train the XGBoost again, this time with the best hyper-parameters

In [8]:
# Fit a fresh regressor on the full training data using the best settings
# found by the search. A dedicated name avoids shadowing the `xgb` module alias.
xgb_tuned = XGBRegressor(objective='reg:squarederror', **grid.best_params_)
xgb_tuned.fit(X, Y)

# Predict with the tuned model (not the baseline `xg_reg`), so the saved
# results actually reflect the hyper-parameter search.
Y_pred = xgb_tuned.predict(test_df)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


### Save results

In [9]:
# Re-read the original test file to obtain the Id of every test row.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Combine Id with the current predictions column-wise and save without the index.
sale_price_frame = pd.DataFrame(Y_pred, columns=['SalePrice'])
result_df = pd.concat((id_col, sale_price_frame), axis=1)
result_df.to_csv('XGBoost_rand_search_WO_PCA.csv', index=False)

## Process Normalized Data With PCA

### Load train and test data

In [None]:
# Locations of the dimension-reduced train and test tables.
train_file = 'house-prices-advanced-regression-techniques/train_dim_reduced.csv'
test_file = 'house-prices-advanced-regression-techniques/test_dim_reduced.csv'

# Load them in the same order as before: train first, then test.
train_df, test_df = map(pd.read_csv, (train_file, test_file))

### Train model on the train data and predict the test data

In [None]:
# Target is SalePrice; every other column of the reduced table is a feature.
Y = train_df['SalePrice']
X = train_df.drop('SalePrice', axis='columns')

# Same hand-picked baseline settings, now fitted on the dimension-reduced data.
xg_reg = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg.fit(X, Y)

# Predict on the reduced test table and display the predictions and their shape.
Y_pred = xg_reg.predict(test_df)
print(Y_pred, Y_pred.shape, sep='\n')

### Save results

In [None]:
# Ids come from the original test file, not from the dimension-reduced one.
original_test_file = 'house-prices-advanced-regression-techniques/test.csv'
original_test_df = pd.read_csv(original_test_file)
id_col = original_test_df['Id']

# Build the two-column result (Id, SalePrice) and write it without the index.
price_predictions = pd.DataFrame(Y_pred, columns=['SalePrice'])
result_df = pd.concat([id_col, price_predictions], axis='columns')
result_df.to_csv('XGBoost_reg_W_PCA.csv', index=False)