In [4]:
!pip install --user xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/63/ca/37b83f59b0efd919c03c52ad7e2473dced674f2f6eb07b9d6f7d80e4c54c/xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl.metadata
  Using cached xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.2


In [1]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings


# Silence all warnings for the remainder of the session.
warnings.simplefilter("ignore")

# Tutorial data: the "diamonds" dataset bundled with seaborn.
diamonds = sns.load_dataset(name="diamonds")

# Preview the first five rows.
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
# `shape` is a (number of rows, number of columns) tuple; use it to check the size of the loaded dataset.
diamonds.shape

(53940, 10)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [4]:
# Summarise the non-numeric columns: count, number of unique values, most frequent value and its frequency.
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [5]:
# First task: predict diamond prices from the remaining columns, so `price` is the target.
from sklearn.model_selection import train_test_split

# Target: the price column, kept as a one-column DataFrame.
y = diamonds[["price"]]
# Features: every column except the target.
X = diamonds.drop(columns="price")

In [6]:
# The dataset has three categorical columns. They would normally be ordinal- or one-hot-encoded,
# but XGBoost can handle categoricals internally once they use the pandas `category` dtype.

# Names of all non-numeric feature columns.
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast every non-numeric column to the pandas category dtype in a single call.
X = X.astype({name: "category" for name in cats})

In [7]:
# Inspect the column dtypes to confirm that the three non-numeric features are now `category`.
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [8]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [9]:
# The native XGBoost API works on DMatrix, the library's own dataset container,
# so the train and test splits are converted to that format before training.

import xgboost as xgb

# Regression matrices; enable_categorical=True lets XGBoost consume the `category` columns directly.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

## Python XGBoost Regression

In [10]:
import numpy as np


def compute_rmse(actual, predicted):
    """Return the root mean squared error (RMSE) between two sets of values.

    RMSE is the square root of the mean squared difference:
    ``sqrt(mean((actual - predicted) ** 2))``.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predicted : array-like
        Predicted values; must contain as many values as ``actual``.

    Returns
    -------
    float
        The RMSE of ``predicted`` with respect to ``actual``.

    Raises
    ------
    ValueError
        If the inputs are empty or contain a different number of values.
    """
    # Flatten both inputs so that an (n, 1) column and an (n,) vector are
    # subtracted element-wise instead of broadcasting to an (n, n) matrix.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            f"actual and predicted must have the same number of values "
            f"({actual.size} != {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("actual and predicted must not be empty")

    mse = np.mean((actual - predicted) ** 2)
    return float(np.sqrt(mse))

In [12]:
# Hyperparameters for the XGBoost regression model.
# "objective": "reg:squarederror" trains the model to minimise the squared error.
# "tree_method": "hist" together with "device": "cuda" selects GPU-accelerated histogram-based
# tree building; this pair replaces the "gpu_hist" tree method, which is deprecated in XGBoost 2.0.
params = {"objective": "reg:squarederror", "tree_method": "hist", "device": "cuda"}

# Number of boosting rounds (iterations) for the model.
n = 100

# Train the regression model on the training matrix with the parameters above;
# xgb.train returns the trained model.
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

### Evaluation

In [13]:
from sklearn.metrics import mean_squared_error

# Generate predictions for the held-out test matrix with the trained model.
preds = model.predict(dtest_reg)

In [16]:
# Compare the predictions against y_test. mean_squared_error returns the MSE, so take its
# square root to get the RMSE. The former `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, so it is not used here.

rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 555.607


### Using Validation Sets During Training

In [18]:
# First, set up the parameters again. "tree_method": "hist" with "device": "cuda" requests
# GPU-accelerated histogram tree building (the replacement for the deprecated "gpu_hist").

params = {"objective": "reg:squarederror", "tree_method": "hist", "device": "cuda"}
# Number of boosting rounds.
n = 100

In [20]:
# Build the evaluation list: each entry is a (DMatrix, name) tuple, where the name labels
# that dataset's metric in the training log.
# NOTE(review): the "validation" entry reuses the test matrix (dtest_reg); confirm this is intended.

evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

In [21]:
# Passing the list through the `evals` parameter of xgb.train reports the metric for every
# entry after each boosting round (one train-rmse / validation-rmse line per round).

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:2874.29379	validation-rmse:2817.38773
[1]	train-rmse:2092.07711	validation-rmse:2054.73630
[2]	train-rmse:1549.52687	validation-rmse:1526.30592
[3]	train-rmse:1184.46798	validation-rmse:1174.90119
[4]	train-rmse:941.09127	validation-rmse:943.28272
[5]	train-rmse:784.58014	validation-rmse:796.09651
[6]	train-rmse:685.75110	validation-rmse:705.22245
[7]	train-rmse:624.67281	validation-rmse:653.32563
[8]	train-rmse:584.19599	validation-rmse:620.30404
[9]	train-rmse:558.77667	validation-rmse:599.24504
[10]	train-rmse:543.85303	validation-rmse:586.99790
[11]	train-rmse:531.92694	validation-rmse:578.68120
[12]	train-rmse:523.08456	validation-rmse:571.73527
[13]	train-rmse:515.67753	validation-rmse:567.19913
[14]	train-rmse:510.77594	validation-rmse:564.66402
[15]	train-rmse:506.68519	validation-rmse:563.21547
[16]	train-rmse:502.96796	validation-rmse:561.80880
[17]	train-rmse:498.90184	validation-rmse:560.36561
[18]	train-rmse:492.74859	validation-rmse:558.46274
[19]	train-rms

In [22]:
# You can also pass the "verbose_eval" and "early_stopping_rounds" arguments to xgb.train.

### Cross-Validation

In [24]:
# GPU-accelerated histogram tree building: "hist" plus "device": "cuda" is the XGBoost 2.x
# replacement for the deprecated "gpu_hist" tree method.
params = {"objective": "reg:squarederror", "tree_method": "hist", "device": "cuda"}

# Upper bound on the number of boosting rounds; early stopping may end training sooner.
n = 1000

# 5-fold cross-validation on the training matrix, with early stopping set to 20 rounds.
results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20
)

In [25]:
# Preview the first rows of the cross-validation results (per-round mean and std of train/test RMSE).
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.530912,9.57651,2877.437274,37.09354
1,2089.327469,8.31729,2094.021636,24.828795
2,1550.617973,5.223297,1558.386252,18.540267
3,1183.812759,5.19342,1195.032441,13.47158
4,941.203113,4.539805,958.728828,9.479449


In [28]:
# Lowest mean test-fold RMSE over the recorded boosting rounds.
best_rmse = results['test-rmse-mean'].min()
print("Best RMSE: ", best_rmse)

Best RMSE:  549.311480649509


## Classification

In [31]:
# Turning the regressor into a classifier only requires a different objective function;
# the rest of the workflow stays the same.

# Goal: predict the cut quality of a diamond from its price and physical measurements.

from sklearn.preprocessing import OrdinalEncoder

# Target: the cut column, kept as a one-column DataFrame.
y = diamonds[["cut"]]
# Features: every column except the target.
X = diamonds.drop(columns="cut")

# Encode the target labels as numbers.
y_encoded = OrdinalEncoder().fit_transform(y)

# Names of all non-numeric feature columns.
cats = list(X.select_dtypes(exclude=np.number).columns)

# Cast every non-numeric column to the pandas category dtype in a single call.
X = X.astype({name: "category" for name in cats})

# Split the data, stratifying on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

In [32]:
# Classification matrices; enable_categorical=True lets XGBoost consume the `category` columns directly.
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [33]:
# Multi-class setup: the "multi:softprob" objective with num_class=5, one class per distinct cut value.
# "tree_method": "hist" with "device": "cuda" requests GPU-accelerated histogram tree building
# and replaces the "gpu_hist" tree method, which is deprecated in XGBoost 2.0.
params = {"objective": "multi:softprob", "tree_method": "hist", "device": "cuda", "num_class": 5}

# Number of boosting rounds.
n = 1000

# 5-fold cross-validation, tracking multi-class log loss, AUC and classification error.
results = xgb.cv(
   params, dtrain_clf,
   num_boost_round=n,
   nfold=5,
   metrics=["mlogloss", "auc", "merror"],
)

In [35]:
# List the columns of the cross-validation results: a mean and std per metric for the train and test folds.
results.keys()

Index(['train-mlogloss-mean', 'train-mlogloss-std', 'train-auc-mean',
       'train-auc-std', 'train-merror-mean', 'train-merror-std',
       'test-mlogloss-mean', 'test-mlogloss-std', 'test-auc-mean',
       'test-auc-std', 'test-merror-mean', 'test-merror-std'],
      dtype='object')

In [36]:
# Highest mean test-fold AUC over the boosting rounds.
results['test-auc-mean'].max()

0.9402233623451636