In [1]:
from catboost import CatBoostClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay, auc


In [2]:
# Load the dataset; the 'State' column is used as the target and every other
# column as a feature.
data = pd.read_csv('Data/only_names.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='State'),
    data['State'],
    test_size=0.2,
    random_state=42,
)

# Show the training feature/target shapes as a quick sanity check.
(X_train.shape, y_train.shape)


((790444, 96), (790444,))

In [3]:
# CatBoost classifier trained on GPU with silent logging; TotalF1 is the
# evaluation metric.
# NOTE(review): in CatBoost's `devices` syntax ':' separates individual device IDs
# and '-' denotes a range, so '0:4' selects GPUs 0 and 4 — confirm '0-4' was not intended.
cbc = CatBoostClassifier(
    task_type="GPU",
    devices='0:4',
    learning_rate=0.01,
    eval_metric='TotalF1',
    logging_level='Silent',
)


In [4]:
# Search space: tree depth 3..9, L2 leaf regularisation 1..4, and three bootstrap types.
params = {
    'depth': np.arange(3, 10),
    'l2_leaf_reg': np.arange(1, 5),
    'bootstrap_type': ['Bayesian', 'Bernoulli', 'Poisson'],
}

# Run CatBoost's built-in grid search over the space on the training split.
catboost_classifier_best_params = cbc.grid_search(params, X=X_train, y=y_train)

# Display the parameter combination stored under the 'params' key of the result.
catboost_classifier_best_params['params']


0:	loss: 0.4941453	best: 0.4941453 (0)	total: 18.9s	remaining: 26m 10s
1:	loss: 0.4942803	best: 0.4942803 (1)	total: 37.4s	remaining: 25m 31s
2:	loss: 0.0664125	best: 0.4942803 (1)	total: 43.4s	remaining: 19m 32s
3:	loss: 0.4941884	best: 0.4942803 (1)	total: 1m 1s	remaining: 20m 33s
4:	loss: 0.4944817	best: 0.4944817 (4)	total: 1m 20s	remaining: 21m 8s
5:	loss: 0.0664125	best: 0.4944817 (4)	total: 1m 26s	remaining: 18m 42s
6:	loss: 0.4943793	best: 0.4944817 (4)	total: 1m 44s	remaining: 19m 11s
7:	loss: 0.4944616	best: 0.4944817 (4)	total: 2m 3s	remaining: 19m 31s
8:	loss: 0.0664125	best: 0.4944817 (4)	total: 2m 9s	remaining: 17m 59s
9:	loss: 0.4939338	best: 0.4944817 (4)	total: 2m 27s	remaining: 18m 15s
10:	loss: 0.4946758	best: 0.4946758 (10)	total: 2m 46s	remaining: 18m 26s
11:	loss: 0.0664125	best: 0.4946758 (10)	total: 2m 52s	remaining: 17m 17s
12:	loss: 0.5163211	best: 0.5163211 (12)	total: 3m 15s	remaining: 17m 45s
13:	loss: 0.5155387	best: 0.5163211 (12)	total: 3m 37s	remaining:

{'depth': 9, 'l2_leaf_reg': 1, 'bootstrap_type': 'Bernoulli'}

In [7]:
# Train the classifier on the training split.
# NOTE(review): CatBoost's grid_search defaults to refit=True, so the model may already
# be fitted with the best parameters at this point — confirm this extra fit is needed.
cbc.fit(X=X_train, y=y_train)


<catboost.core.CatBoostClassifier at 0x1eb814ceb20>

In [8]:
# Predict labels for the held-out rows and display the resulting array.
cbc_predict = cbc.predict(data=X_test)
cbc_predict


array([[0],
       [2],
       [1],
       ...,
       [4],
       [0],
       [3]], dtype=int64)

In [9]:
# Per-class precision / recall / F1 of the predictions against the test labels.
print(
    'Precision, Recall, F\n',
    classification_report(y_true=y_test, y_pred=cbc_predict),
)

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.52      0.54      0.53     39544
           1       0.47      0.44      0.46     39476
           2       0.89      0.60      0.72     39472
           3       0.89      0.86      0.88     39367
           4       0.63      0.86      0.73     39752

    accuracy                           0.66    197611
   macro avg       0.68      0.66      0.66    197611
weighted avg       0.68      0.66      0.66    197611



In [10]:
# Per-class precision / recall / F1 of the predictions against the test labels.
# NOTE(review): this cell repeats the report printed by the preceding cell on the
# same y_test / cbc_predict — confirm whether it is still needed.
print(
    'Precision, Recall, F\n',
    classification_report(y_true=y_test, y_pred=cbc_predict),
)

Precision, Recall, F
               precision    recall  f1-score   support

           0       0.52      0.54      0.53     39544
           1       0.47      0.44      0.46     39476
           2       0.89      0.60      0.72     39472
           3       0.89      0.86      0.88     39367
           4       0.63      0.86      0.73     39752

    accuracy                           0.66    197611
   macro avg       0.68      0.66      0.66    197611
weighted avg       0.68      0.66      0.66    197611

