# XGBoost Tutorial

This notebook is an introductory XGBoost tutorial. It uses urban heat island (UHI) data from a previous paper.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
from xgboost import XGBRegressor
import shap
# import hyperparameter tuning library, if necessary

# Lift pandas' display limits so every column and every row is shown
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [2]:
# Predictor columns, listed one per line
features = [
    'Black / African American',
    'Hispanic / Latino',
    'White',
    'Below Federal Poverty Line',
    'Population Density',
    'Impervious Surface',
    'DelNDVI_summer',
    'Spatial Lag',
]

# Name of the target column
label = 'UHI_summer_day'

In [3]:
# Read only the predictor columns and the target column from the CSV
data = pd.read_csv('data/uhi.csv', usecols=[*features, label])

In [4]:
# Per-column summary of missing values: absolute count and share of all rows (in percent)
missing_counts = data.isna().sum()
missing_share = missing_counts / len(data) * 100
pd.DataFrame({'Total Missing': missing_counts, 'Percent Missing': missing_share})

Unnamed: 0,Total Missing,Percent Missing
UHI_summer_day,222,0.397344
DelNDVI_summer,18,0.032217
Black / African American,335,0.599595
White,335,0.599595
Hispanic / Latino,335,0.599595
Below Federal Poverty Line,441,0.789318
Impervious Surface,1187,2.124537
Population Density,9,0.016109
Spatial Lag,0,0.0


In [5]:
# Fill the missing values rather than dropping rows.
# Nearest-neighbour interpolation runs along the row index and does not extrapolate,
# so NaNs at the very start or end of a column can be left in place. The trailing
# forward/backward fills close those edge gaps; interior values are unaffected
# because the interpolation has already filled them.
data = data.interpolate(method='nearest').ffill().bfill()

In [7]:
# Separate the features and target.
# The target is selected through `label` (not a repeated string literal) so that the
# column list and the target stay in sync if `label` is ever changed.
# Double brackets keep y as a single-column DataFrame.
X, y = data[features], data[[label]]

# Split the data into training/test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Model Development

In [8]:
# Fit a regressor on the training split through XGBoost's scikit-learn wrapper
xgb_params = dict(
    n_estimators=100,
    objective='reg:squarederror',
    tree_method='hist',
    enable_categorical=True,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [9]:
# Make predictions with the test set; these are compared against y_test
# when the model is evaluated
y_pred = model.predict(X_test)

In [10]:
# Score the test-set predictions: root-mean-squared error and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [11]:
# Display the test-set metrics as a (RMSE, R^2) tuple
rmse, r2

(1.1518746453559172, 0.82674408553826)