# XGboost & CATboost

In [20]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import  confusion_matrix, classification_report

In [2]:
data = sns.load_dataset('diamonds')
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [4]:
data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

## Using CATboost

In [10]:
le = LabelEncoder()
for i in data.columns:
    if data[i].dtype == 'category':
        data[i] = le.fit_transform(data[i])   

In [11]:
x = data.drop(['cut'],axis=1)
y = data['cut']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [13]:
model = CatBoostClassifier()
model.fit(x_train,y_train)

Learning rate set to 0.095225
0:	learn: 1.4475089	total: 174ms	remaining: 2m 53s
1:	learn: 1.3343239	total: 206ms	remaining: 1m 42s
2:	learn: 1.2486424	total: 239ms	remaining: 1m 19s
3:	learn: 1.1773722	total: 266ms	remaining: 1m 6s
4:	learn: 1.1206033	total: 300ms	remaining: 59.7s
5:	learn: 1.0720002	total: 325ms	remaining: 53.9s
6:	learn: 1.0347764	total: 351ms	remaining: 49.8s
7:	learn: 0.9981595	total: 374ms	remaining: 46.4s
8:	learn: 0.9676103	total: 398ms	remaining: 43.8s
9:	learn: 0.9412791	total: 428ms	remaining: 42.3s
10:	learn: 0.9182659	total: 459ms	remaining: 41.3s
11:	learn: 0.8995865	total: 485ms	remaining: 39.9s
12:	learn: 0.8833491	total: 514ms	remaining: 39.1s
13:	learn: 0.8673222	total: 539ms	remaining: 37.9s
14:	learn: 0.8533867	total: 567ms	remaining: 37.2s
15:	learn: 0.8428029	total: 593ms	remaining: 36.5s
16:	learn: 0.8316427	total: 619ms	remaining: 35.8s
17:	learn: 0.8205141	total: 646ms	remaining: 35.2s
18:	learn: 0.8128294	total: 677ms	remaining: 34.9s
19:	lear

<catboost.core.CatBoostClassifier at 0x25632b3fbf0>

In [14]:
y_pred = model.predict(x_test)

In [17]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[ 444   25    3    8    7]
 [  33 1030   24   43  370]
 [   2   11 5986  258  230]
 [   1    8  456 3446  231]
 [   4  185  769  578 2030]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.91       487
           1       0.82      0.69      0.75      1500
           2       0.83      0.92      0.87      6487
           3       0.80      0.83      0.81      4142
           4       0.71      0.57      0.63      3566

    accuracy                           0.80     16182
   macro avg       0.81      0.78      0.80     16182
weighted avg       0.79      0.80      0.79     16182



## Using XGboost

In [21]:
modelxg = XGBClassifier()
modelxg.fit(x_train,y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [22]:
yxg_pred = modelxg.predict(x_test)


In [23]:
print("Confusion Matrix:\n", confusion_matrix(y_test, yxg_pred))
print("\nClassification Report:\n", classification_report(y_test, yxg_pred))

Confusion Matrix:
 [[ 448   20    3    8    8]
 [  33 1042   26   29  370]
 [   2   15 5958  243  269]
 [   1   14  440 3384  303]
 [   3  199  749  467 2148]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92       487
           1       0.81      0.69      0.75      1500
           2       0.83      0.92      0.87      6487
           3       0.82      0.82      0.82      4142
           4       0.69      0.60      0.64      3566

    accuracy                           0.80     16182
   macro avg       0.81      0.79      0.80     16182
weighted avg       0.80      0.80      0.80     16182

