In [57]:
import xgboost as xgb
import seaborn as sns 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [10]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices from the
# libraries imported above; consider restricting it to a specific category.
warnings.filterwarnings("ignore")

In [11]:
# Load seaborn's "diamonds" example dataset as a DataFrame.
# NOTE: load_dataset pulls the file from seaborn's online data repository
# unless a locally cached copy exists, so a fresh run needs network access.
diamonds = sns.load_dataset("diamonds")

In [12]:
# Preview the first five rows to check column names and value formats.
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [13]:
# (rows, columns) of the loaded frame.
diamonds.shape

(53940, 10)

In [17]:
# How many diamonds fall into each level of the "cut" column.
diamonds["cut"].value_counts()

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64

In [18]:
# How many diamonds fall into each level of the "color" column.
diamonds["color"].value_counts()

color
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: count, dtype: int64

In [23]:
# How many diamonds fall into each level of the "clarity" column.
diamonds["clarity"].value_counts()

clarity
SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: count, dtype: int64

In [24]:
# Column dtypes, non-null counts and memory footprint of the frame.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [30]:
# Number of missing values in each column.
diamonds.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [32]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for x, y and z and
# maxima of 58.9 (y) and 31.8 (z). Zero dimensions are likely placeholder or
# erroneous entries; nothing later in this notebook filters them out.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [34]:
# Summary (count / unique / top / freq) of the non-numeric columns.
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [41]:
# Separate the regression target from the predictors.
# `y` is selected with a list so it stays a one-column DataFrame.
y = diamonds[["price"]]
X = diamonds.drop(columns=["price"])

In [45]:
# Names of every feature column whose dtype is not numeric.
cats = list(X.select_dtypes(exclude=[np.number]).columns)

In [47]:
# Declare each non-numeric feature as a pandas "category" column in a single
# astype call (one target dtype per column name in `cats`).
X = X.astype({name: "category" for name in cats})

In [48]:
# Confirm the feature dtypes after the categorical cast.
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

In [49]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=61,
)

In [50]:
# Wrap each partition in an XGBoost DMatrix. enable_categorical=True is passed
# so the pandas "category" columns are accepted as categorical features.
dtrain_reg, dtest_reg = (
    xgb.DMatrix(data=features, label=target, enable_categorical=True)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [56]:
# Baseline booster: squared-error regression objective, every other
# hyper-parameter left at the library default, 100 boosting rounds.
params = dict(objective="reg:squarederror")
n = 100

model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [58]:
# Predict on the held-out DMatrix with the trained booster.
preds = model.predict(dtest_reg)

In [62]:
# Root-mean-squared error of the baseline predictions on the held-out set.
# The square root is taken explicitly instead of passing `squared=False`:
# that argument was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError. This form gives the same value on every release.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model:{rmse:.3f}")

RMSE of the base model:514.778


In [63]:
# Retrain while reporting the metric on both DMatrices after every round.
# Each entry pairs a DMatrix with the label shown in the training log.
evals = list(zip((dtrain_reg, dtest_reg), ("train", "validation")))

model = xgb.train(params, dtrain_reg, num_boost_round=n, evals=evals)



[0]	train-rmse:2871.40509	validation-rmse:2813.29158
[1]	train-rmse:2089.35792	validation-rmse:2046.65294
[2]	train-rmse:1550.04545	validation-rmse:1522.04779
[3]	train-rmse:1189.65519	validation-rmse:1171.18031
[4]	train-rmse:950.96235	validation-rmse:936.60914
[5]	train-rmse:793.77430	validation-rmse:784.85420
[6]	train-rmse:693.54274	validation-rmse:687.16390
[7]	train-rmse:635.31799	validation-rmse:629.35653
[8]	train-rmse:596.09465	validation-rmse:592.73682
[9]	train-rmse:571.12638	validation-rmse:569.81542
[10]	train-rmse:555.73643	validation-rmse:555.39150
[11]	train-rmse:540.88303	validation-rmse:543.82730
[12]	train-rmse:532.51471	validation-rmse:536.34924
[13]	train-rmse:525.84164	validation-rmse:533.43163
[14]	train-rmse:520.67341	validation-rmse:528.47979
[15]	train-rmse:516.00284	validation-rmse:524.87772
[16]	train-rmse:510.50779	validation-rmse:522.33978
[17]	train-rmse:506.09554	validation-rmse:522.24376
[18]	train-rmse:500.32232	validation-rmse:521.60448
[19]	train-rms

In [64]:
# Same run as before, but only log the metrics every 10th boosting round.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=10,
)



[0]	train-rmse:2871.40509	validation-rmse:2813.29158
[10]	train-rmse:555.73643	validation-rmse:555.39150
[20]	train-rmse:494.20207	validation-rmse:519.31433
[30]	train-rmse:469.21246	validation-rmse:515.29249
[40]	train-rmse:450.54601	validation-rmse:514.12101
[50]	train-rmse:437.15974	validation-rmse:513.32779
[60]	train-rmse:423.38987	validation-rmse:514.37018
[70]	train-rmse:410.37009	validation-rmse:514.51679
[80]	train-rmse:400.33109	validation-rmse:515.53165
[90]	train-rmse:389.77463	validation-rmse:514.91782
[99]	train-rmse:380.71137	validation-rmse:514.77768


In [65]:
# Allow up to 10,000 rounds and let early stopping decide where to halt:
# training ends once the metric on the last entry of `evals` has not improved
# for 50 consecutive rounds. Metrics are logged every 50th round.
# NOTE(review): `evals` is defined in an earlier cell, where its last entry is
# the held-out test DMatrix, so the stopping point is tuned on the same data
# used for the RMSE reported above.
n = 10_000
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50,
)


[0]	train-rmse:2871.40509	validation-rmse:2813.29158
[50]	train-rmse:437.15974	validation-rmse:513.32779
[100]	train-rmse:379.49768	validation-rmse:514.83671


In [66]:
# 5-fold cross-validation on the training DMatrix, capped at 1,000 rounds and
# stopped early after 20 rounds without improvement.
n = 1_000

results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)



In [74]:
# First rows of the per-round cross-validation history (mean and std of the
# train/test RMSE, as shown by the column names in the output).
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2872.354589,5.106472,2874.090261,24.250401
1,2090.551707,4.429606,2096.25616,18.724507
2,1552.863406,3.285129,1563.658597,12.564446
3,1187.802983,2.123129,1203.203491,9.70083
4,947.092027,2.003722,967.899626,8.163701


In [69]:
# Lowest mean test-fold RMSE recorded across the cross-validation rounds.
best_rmse = results.loc[:, "test-rmse-mean"].min()
best_rmse

553.168906263618