# XGBoost

In [2]:
pip install --user xgboost

Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Load seaborn's "diamonds" example dataset and preview its first five rows.
dataset = sns.load_dataset(name="diamonds")
dataset.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [17]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [19]:
# Summarize only the non-numeric columns by excluding every NumPy numeric dtype.
dataset.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


This code uses the describe() method on the diamonds DataFrame to generate a summary of its non-numeric columns.

• The exclude parameter is set to np.number, which is a NumPy data type that includes all numeric types.

• By setting exclude to np.number, the describe() method will only generate a summary for the non-numeric columns in the DataFrame.

• The output will include the count, unique values, top value, and frequency of the top value for each non-numeric column.

In [20]:
from sklearn.model_selection import train_test_split

# Features: every column except the target.
x = dataset.drop(columns="price")
# Target: the price column, kept as a one-column DataFrame.
y = dataset[["price"]]

The code in the cells below extracts the text (non-numeric) features from the feature DataFrame x and converts them to the pandas category data type.

• First, the select_dtypes() method is used on the feature DataFrame x to select columns that do not contain numerical data.

• The exclude parameter is set to np.number to exclude columns with numerical data.

• The resulting columns are stored in the cats variable as a list.

• Next, a loop is used to iterate through each column in the cats list.

• For each column, the astype() method is used to convert the data type to a Pandas category.

• This is done to reduce memory usage and improve performance when working with categorical data.

In [21]:
# Inspect the feature dtypes before the category conversion.
x.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [22]:
# Collect the names of all non-numeric (text) feature columns.
cats = list(x.select_dtypes(exclude=np.number).columns)

# Cast each of those columns to the pandas category dtype.
for col in cats:
    x[col] = x[col].astype("category")

In [23]:
# Confirm the feature dtypes after the category conversion.
x.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [24]:
# Split features and target into train and test partitions.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
)

Now, the important part: XGBoost comes with its own class for storing datasets called DMatrix. It is a highly optimized class for memory and speed. That's why converting datasets into this format is a requirement for the native XGBoost API:

In [25]:
import xgboost as xgb

# Wrap the train/test splits in XGBoost's DMatrix container.
# enable_categorical=True lets the category-dtype columns be passed through.
dtrain_reg = xgb.DMatrix(
    X_train,
    label=y_train,
    enable_categorical=True,
)
dtest_reg = xgb.DMatrix(
    X_test,
    label=y_test,
    enable_categorical=True,
)

In [29]:
# Hyperparameters: squared-error regression objective, histogram tree method.
params = dict(
    objective="reg:squarederror",
    tree_method="hist",
)

This code defines a dictionary called params that contains two hyperparameters for an XGBoost model.

• The first hyperparameter is "objective": "reg:squarederror", which specifies that the model will use mean squared error as the loss function for regression.

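• The second hyperparameter is "tree_method": "hist", which tells XGBoost to build trees with its histogram-based algorithm: feature values are bucketed into bins, so far fewer candidate split points have to be evaluated. (The GPU-accelerated variant, "gpu_hist", is not used here.)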

• This can significantly speed up the training process for large datasets.

In [30]:
# Number of boosting rounds for the baseline model.
n = 100

# Train the baseline booster on the training matrix.
model = xgb.train(params, dtrain_reg, num_boost_round=n)

Now, we set another parameter called num_boost_round, which stands for number of boosting rounds. Internally, XGBoost minimizes the loss function RMSE in small incremental rounds (more on this later). This parameter specifies the amount of those rounds.

The ideal number of rounds is found through hyperparameter tuning. For now, we will just set it to 100:

During the boosting rounds, the model object has learned all the patterns of the training set it possibly can. Now, we must measure its performance by testing it on unseen data. That's where our dtest_reg DMatrix comes into play.

In [31]:
from sklearn.metrics import mean_squared_error

# Score the held-out test matrix with the trained booster.
preds = model.predict(data=dtest_reg)

In [32]:
# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 545.388


We’ve got a base RMSE of about $545, which is the performance of a base model with default parameters. There are two ways we can improve it — by performing cross-validation and hyperparameter tuning. But before that, let’s see a quicker way of evaluating XGBoost models.

The problem with our current training process is that we can’t even watch where the model is going. To solve this, we will use evaluation arrays that allow us to see model performance as it gets improved incrementally across boosting rounds.

First, let’s set up the parameters again:

In [33]:
# Same hyperparameters and round count as the baseline run.
params = dict(
    objective="reg:squarederror",
    tree_method="hist",
)
n = 100

Next, we build the evaluation list: pairs of a DMatrix and a display name. When we pass this list to the evals parameter of xgb.train, we will see the model performance after each boosting round:

In [35]:
# Datasets to score after every boosting round, each paired with a display name.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

# Retrain, this time reporting the metric on both datasets each round.
model = xgb.train(params, dtrain_reg, num_boost_round=n, evals=evals)

[0]	train-rmse:3985.31595	validation-rmse:3930.87087
[1]	train-rmse:2849.92126	validation-rmse:2812.52945
[2]	train-rmse:2061.76472	validation-rmse:2034.91266
[3]	train-rmse:1521.58802	validation-rmse:1509.03801
[4]	train-rmse:1158.20689	validation-rmse:1155.77477
[5]	train-rmse:918.95666	validation-rmse:922.81058
[6]	train-rmse:765.71970	validation-rmse:778.96367
[7]	train-rmse:671.73734	validation-rmse:692.56259
[8]	train-rmse:612.92636	validation-rmse:638.83852
[9]	train-rmse:578.33182	validation-rmse:608.53984
[10]	train-rmse:557.19710	validation-rmse:591.03042
[11]	train-rmse:542.58916	validation-rmse:578.99646
[12]	train-rmse:534.88302	validation-rmse:573.25964
[13]	train-rmse:527.20423	validation-rmse:566.28647
[14]	train-rmse:520.90582	validation-rmse:561.97563
[15]	train-rmse:515.69808	validation-rmse:558.92935
[16]	train-rmse:512.45290	validation-rmse:557.84167
[17]	train-rmse:507.50759	validation-rmse:556.68519
[18]	train-rmse:504.04144	validation-rmse:553.56230
[19]	train-r

We will use a technique called early stopping. Early stopping forces XGBoost to watch the validation loss, and if it stops improving for a specified number of rounds, it automatically stops training.

This means we can set the number of boosting rounds as high as we like, as long as we set a sensible number of early stopping rounds.

In [36]:
# Generous upper bound on boosting rounds; early stopping ends training sooner.
n = 10000

model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=50,  # Activate early stopping
    verbose_eval=50,  # log the metrics every 50 rounds
)

[0]	train-rmse:3985.31595	validation-rmse:3930.87087
[50]	train-rmse:432.51681	validation-rmse:543.97371
[87]	train-rmse:386.38896	validation-rmse:545.45681


As you can see, the training stopped after round 87 because the validation loss had not improved during the preceding 50 rounds.

XGBoost Cross-Validation

In [38]:
# Hyperparameters and the maximum number of boosting rounds for cross-validation.
params = dict(
    objective="reg:squarederror",
    tree_method="hist",
)
n = 1000

# 5-fold cross-validation on the training matrix, with early stopping.
results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

In [39]:
# Preview the per-round mean and std of the train and test RMSE across the folds.
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3985.648654,10.343596,3986.913623,41.642778
1,2848.365726,8.014086,2851.020437,28.028733
2,2063.401458,4.637773,2068.629977,19.969459
3,1521.493751,3.874078,1530.496272,13.59233
4,1156.827103,2.991735,1170.413316,11.695597


The results DataFrame has one row per boosting round that was kept (because early stopping is enabled, this can be fewer than the n rounds requested). Each row is the average across all folds for that round. So, to find the best score, we take the minimum of the test-rmse-mean column:

In [40]:
# Lowest mean test-fold RMSE among the recorded boosting rounds.
best_rmse = results.loc[:, "test-rmse-mean"].min()

best_rmse

550.7196748119261