In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc
from sklearn.model_selection import train_test_split


In [2]:
# Load the prepared dataset; the 'State' column is the classification target
# and every other column is used as a feature.
data = pd.read_csv('Data/randomundersampled_data_with_sm_embendings.csv')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='State'),
    data['State'],
    test_size=0.2,
    random_state=42,
)

# Show the training feature/target shapes as the cell output.
X_train.shape, y_train.shape


((7368, 111), (7368,))

In [3]:
# Base CatBoost classifier that the later cells tune and fit.
# NOTE(review): in CatBoost's `devices` syntax ':' separates individual device
# IDs and '-' denotes a range, so '0:4' selects GPUs 0 and 4 -- confirm that is
# what is intended on the training machine.
cbc = CatBoostClassifier(
    learning_rate=0.01,
    task_type="GPU",          # train on GPU
    devices='0:4',
    logging_level='Silent',   # suppress per-iteration training logs
    eval_metric='TotalF1',    # metric reported during evaluation / grid search
)


In [4]:
# Hyper-parameter grid explored by CatBoost's built-in grid search:
# tree depth 3..9, L2 leaf regularisation 1..4, and three bootstrap types.
params = dict(
    depth=np.arange(3, 10),
    l2_leaf_reg=np.arange(1, 5),
    bootstrap_type=['Bayesian', 'Bernoulli', 'Poisson'],
)

# Run the search on the training split. The returned dict's 'params' entry
# holds the best combination found, which is echoed as the cell output.
catboost_classifier_best_params = cbc.grid_search(params, X=X_train, y=y_train)
catboost_classifier_best_params['params']


0:	loss: 0.7566668	best: 0.7566668 (0)	total: 7.75s	remaining: 10m 43s
1:	loss: 0.7542144	best: 0.7566668 (0)	total: 19.1s	remaining: 13m 2s
2:	loss: 0.0622516	best: 0.7566668 (0)	total: 24.9s	remaining: 11m 11s
3:	loss: 0.7559899	best: 0.7566668 (0)	total: 36.1s	remaining: 12m 1s
4:	loss: 0.7558740	best: 0.7566668 (0)	total: 47.9s	remaining: 12m 36s
5:	loss: 0.0622516	best: 0.7566668 (0)	total: 53.6s	remaining: 11m 36s
6:	loss: 0.7588917	best: 0.7588917 (6)	total: 1m 4s	remaining: 11m 54s
7:	loss: 0.7530950	best: 0.7588917 (6)	total: 1m 16s	remaining: 12m 7s
8:	loss: 0.0622516	best: 0.7588917 (6)	total: 1m 22s	remaining: 11m 26s
9:	loss: 0.7560673	best: 0.7588917 (6)	total: 1m 33s	remaining: 11m 33s
10:	loss: 0.7537685	best: 0.7588917 (6)	total: 1m 45s	remaining: 11m 38s
11:	loss: 0.0622516	best: 0.7588917 (6)	total: 1m 50s	remaining: 11m 5s
12:	loss: 0.7702466	best: 0.7702466 (12)	total: 2m 3s	remaining: 11m 11s
13:	loss: 0.7694214	best: 0.7702466 (12)	total: 2m 15s	remaining: 11m 16

{'depth': 8, 'l2_leaf_reg': 1, 'bootstrap_type': 'Bernoulli'}

In [7]:
# Train the classifier on the training split (the fitted model is echoed as the cell output).
# NOTE(review): CatBoost's grid_search defaults to refit=True, so `cbc` may
# already be fitted with the best parameters at this point -- confirm this
# second fit is needed. Execution counts jump from 4 to 7 here, so cells that
# are no longer in the notebook may have run in between.
cbc.fit(X=X_train, y=y_train)


<catboost.core.CatBoostClassifier at 0x2158c3e8940>

In [8]:
# Predict class labels for the held-out test features.
cbc_predict = cbc.predict(X_test)
# Echo the predictions as the cell output; the displayed result is a 2-D
# column array with one predicted label per row.
cbc_predict


array([[0],
       [1],
       [4],
       ...,
       [2],
       [4],
       [4]], dtype=int64)

In [9]:
# Per-class precision / recall / F1 (plus averages) on the held-out test split.
print(
    'Precision, Recall, F\n',
    classification_report(y_true=y_test, y_pred=cbc_predict),
)

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.65      0.76      0.70       386
           1       0.81      0.95      0.88       361
           2       0.74      0.59      0.65       381
           3       0.76      0.64      0.70       348
           4       0.99      1.00      0.99       366

    accuracy                           0.79      1842
   macro avg       0.79      0.79      0.79      1842
weighted avg       0.79      0.79      0.78      1842



In [10]:
# Per-class precision / recall / F1 (plus averages) on the held-out test split.
# NOTE(review): this cell prints the same report as the preceding cell on the
# same inputs -- it looks like an accidental duplicate and is a candidate for
# removal.
print(
    'Precision, Recall, F\n',
    classification_report(y_true=y_test, y_pred=cbc_predict),
)

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.65      0.76      0.70       386
           1       0.81      0.95      0.88       361
           2       0.74      0.59      0.65       381
           3       0.76      0.64      0.70       348
           4       0.99      1.00      0.99       366

    accuracy                           0.79      1842
   macro avg       0.79      0.79      0.79      1842
weighted avg       0.79      0.79      0.78      1842



In [11]:
# Persist the classifier object to disk with joblib so it can be reloaded
# later without retraining. `joblib` is imported in the notebook's import cell
# rather than here, so all dependencies are visible in one place.
# NOTE(review): joblib files are pickle-based -- only load this artifact from
# a trusted location.
joblib.dump(cbc, 'Data/CatBoostClassifierDown.sav')

['Data/CatBoostClassifierDown.sav']