# XGBoost

In [7]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split

X = pd.read_csv('Data/train.csv', index_col='Id')
X_test_full = pd.read_csv('Data/test.csv', index_col ='Id')

X.dropna(axis = 0, subset=['SalePrice'], inplace = True)
y = X.SalePrice
X.drop(['SalePrice'], axis = 1, inplace = True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size=0.2, random_state = 0)

#categorical columns with low cardinality
cat_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype == 'object' and X_train_full[cname].nunique() < 10]

#numerical columns
num_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['float64', 'int64']]

train_cols = num_cols + cat_cols
X_train = X_train_full[train_cols].copy()
X_valid = X_valid_full[train_cols].copy()
X_test = X_test_full[train_cols].copy()

#One-hot encoding
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join = 'left', axis = 1)
X_train, X_test = X_train.align(X_test, join = 'left', axis = 1)

In [8]:
from xgboost import XGBRegressor

xgbmodel = XGBRegressor(random_state=0)

xgbmodel.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
from sklearn.metrics import mean_absolute_error

predictions = xgbmodel.predict(X_valid)

print('MAE: ', mean_absolute_error(predictions, y_valid))

MAE:  17662.736729452055


## Improving the model

In [None]:
improved_model = XGBRegressor(n_estimators = 200, learning_rate = 0.1)