In [20]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import (
    RocCurveDisplay,
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import train_test_split


In [81]:
# Load the prepared dataset; the 'State' column is the classification target
# and every other column is used as a feature.
# NOTE(review): the file name suggests the data was SMOTE-resampled before this
# split. If so, synthetic rows derived from held-out samples can end up in the
# training set and inflate the test metrics — confirm the resampling order.
data = pd.read_csv('Data/smoted_data_with_sm_embendings.csv')

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='State'),
    data['State'],
    test_size=0.2,
    random_state=42,
)
X_train.shape, y_train.shape


((790444, 111), (790444,))

In [91]:
# Base CatBoost classifier shared by the grid search and the final fit.
# Training runs on GPU, per-iteration logging is silenced, and TotalF1 is the
# metric reported during evaluation.
# NOTE(review): in CatBoost's device-id syntax ':' separates individual ids and
# '-' denotes a range, so '0:4' would select GPUs 0 and 4 — confirm this is
# intended rather than a range such as '0-3'.
cbc = CatBoostClassifier(
    eval_metric='TotalF1',
    learning_rate=0.01,
    logging_level='Silent',
    task_type="GPU",
    devices='0:4',
)


In [92]:
# Hyper-parameter grid: 7 tree depths (3..9) x 4 L2 regularisation strengths
# (1..4) x 3 bootstrap types = 84 combinations.
params = dict(
    depth=np.arange(3, 10),
    l2_leaf_reg=np.arange(1, 5),
    bootstrap_type=['Bayesian', 'Bernoulli', 'Poisson'],
)

# Exhaustive search over the grid using the training split only.
# NOTE(review): CatBoost documents refit=True as the default for grid_search,
# which would leave `cbc` already fitted with the best parameters after this
# call — confirm against the installed version.
catboost_classifier_best_params = cbc.grid_search(params, X=X_train, y=y_train)

# Show the winning parameter combination.
catboost_classifier_best_params['params']


0:	loss: 0.8153131	best: 0.8153131 (0)	total: 20.8s	remaining: 28m 49s
1:	loss: 0.8160018	best: 0.8160018 (1)	total: 41.2s	remaining: 28m 11s
2:	loss: 0.0664009	best: 0.8160018 (1)	total: 47.5s	remaining: 21m 21s
3:	loss: 0.8154288	best: 0.8160018 (1)	total: 1m 7s	remaining: 22m 35s
4:	loss: 0.8156570	best: 0.8160018 (1)	total: 1m 28s	remaining: 23m 14s
5:	loss: 0.0664009	best: 0.8160018 (1)	total: 1m 34s	remaining: 20m 25s
6:	loss: 0.8152880	best: 0.8160018 (1)	total: 1m 54s	remaining: 21m 1s
7:	loss: 0.8152856	best: 0.8160018 (1)	total: 2m 15s	remaining: 21m 23s
8:	loss: 0.0664009	best: 0.8160018 (1)	total: 2m 21s	remaining: 19m 41s
9:	loss: 0.8157930	best: 0.8160018 (1)	total: 2m 42s	remaining: 20m 1s
10:	loss: 0.8154890	best: 0.8160018 (1)	total: 3m 2s	remaining: 20m 14s
11:	loss: 0.0664009	best: 0.8160018 (1)	total: 3m 9s	remaining: 18m 56s
12:	loss: 0.8385699	best: 0.8385699 (12)	total: 3m 33s	remaining: 19m 27s
13:	loss: 0.8382484	best: 0.8385699 (12)	total: 3m 58s	remaining: 19

{'depth': 9, 'l2_leaf_reg': 1, 'bootstrap_type': 'Bernoulli'}

In [93]:
# Train the classifier on the training split.
# NOTE(review): if the preceding grid search ran with its default refit
# behaviour, `cbc` may already be fitted with the best parameters, making this
# a second, redundant training pass — confirm whether it is needed.
cbc.fit(X=X_train, y=y_train)


<catboost.core.CatBoostClassifier at 0x16499566580>

In [94]:
# Predict class labels for the held-out rows. The displayed result is a column
# array (one label per row), as the cell output shows.
cbc_predict = cbc.predict(X_test, prediction_type='Class')
cbc_predict


array([[1],
       [2],
       [4],
       ...,
       [4],
       [1],
       [3]], dtype=int64)

In [95]:
# Per-class precision / recall / F1 and support on the held-out split.
report = classification_report(y_test, cbc_predict)
print('Precision, Recall, F\n', report)

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.87      0.94      0.90     39688
           1       0.90      0.98      0.94     39403
           2       0.90      0.78      0.84     39397
           3       0.90      0.87      0.88     39403
           4       1.00      1.00      1.00     39720

    accuracy                           0.91    197611
   macro avg       0.91      0.91      0.91    197611
weighted avg       0.91      0.91      0.91    197611



In [None]:
# NOTE(review): this cell prints the same classification report as the
# preceding cell and has no execution count — it looks like a leftover copy
# and is a candidate for deletion.
print('Precision, Recall, F\n', classification_report(y_true=y_test, y_pred=cbc_predict))

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.87      0.94      0.90     39688
           1       0.90      0.98      0.94     39403
           2       0.90      0.78      0.84     39397
           3       0.90      0.87      0.88     39403
           4       1.00      1.00      1.00     39720

    accuracy                           0.91    197611
   macro avg       0.91      0.91      0.91    197611
weighted avg       0.91      0.91      0.91    197611



In [96]:
# Persist the fitted classifier to disk with joblib (imported in the notebook's
# top import cell, so a fresh "Restart & Run All" shows every dependency up
# front). joblib.dump returns the list of file names it wrote, which is what
# the cell displays.
# NOTE: the resulting file is a pickle — only load it back from trusted
# locations, since unpickling can execute arbitrary code.
joblib.dump(cbc, 'Data/CatBoostClassifier.sav')

['Data/CatBoostClassifier.sav']