# CatBoost

In [1]:
import numpy as np
from catboost import CatBoostClassifier
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.metrics import cohen_kappa_score, confusion_matrix, brier_score_loss
from sklearn.metrics import matthews_corrcoef, fowlkes_mallows_score, r2_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve, roc_curve, top_k_accuracy_score

# --- Data ---------------------------------------------------------------
# Fetch the forest cover type dataset and pull out features and targets.
data = fetch_covtype()
X = data.data
# Shift the labels down by one so they fall in the range 0 to 6.
y = data.target - 1

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model --------------------------------------------------------------
# Fit a CatBoost classifier with default hyper-parameters (verbose=0).
model = CatBoostClassifier(verbose=0)
model.fit(X_train, y_train)

# --- Predictions --------------------------------------------------------
# Collapse the predicted labels to a 1D array so they line up with y_test;
# keep the class probabilities separately for the probability-based metrics.
y_pred = model.predict(X_test).ravel()
y_prob = model.predict_proba(X_test)

# Calculate evaluation metrics
# Number of distinct classes in the full label vector. np.unique does the
# de-duplication in NumPy instead of hashing every element in a Python loop.
num_classes = np.unique(y).size

# One-hot indicator matrix of the test labels (column i marks class i).
# Used for the PR-AUC below so the call does not depend on scikit-learn
# accepting multiclass integer targets in average_precision_score.
y_test_indicator = label_binarize(y_test, classes=list(range(num_classes)))

# Label-based metrics (macro average = unweighted mean over classes).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Probability-based metrics.
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
pr_auc = average_precision_score(y_test_indicator, y_prob, average='macro')
log_loss_value = log_loss(y_test, y_prob)

kappa = cohen_kappa_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Calculate Brier score for each class (one-vs-rest) and average it
brier_scores = [brier_score_loss(y_test == i, y_prob[:, i]) for i in range(num_classes)]
brier_score_avg = np.mean(brier_scores)

mcc = matthews_corrcoef(y_test, y_pred)
fmi = fowlkes_mallows_score(y_test, y_pred)
# NOTE(review): r2_score treats the class ids as numeric values; for
# categorical labels the result depends on how classes happen to be numbered,
# so interpret it with care.
r2 = r2_score(y_test, y_pred)

# One-vs-rest encoding of the test labels: column i is 1 where the sample
# belongs to class i and 0 otherwise.
y_test_bin = label_binarize(y_test, classes=range(num_classes))

# Per-class precision-recall and ROC curves, keyed by class index.
precision_dict = {}
recall_dict = {}
fpr_dict = {}
tpr_dict = {}
for i in range(num_classes):
    # Ground truth and predicted probability for class i only.
    truth_i = y_test_bin[:, i]
    score_i = y_prob[:, i]

    precision_dict[i], recall_dict[i], _ = precision_recall_curve(truth_i, score_i)
    fpr_dict[i], tpr_dict[i], _ = roc_curve(truth_i, score_i)

    auc = roc_auc_score(truth_i, score_i)
    print(f"AUC-ROC for Class {i}: {auc:.3f}")

# Top-k accuracy with k=3, computed from the predicted class probabilities.
top_k_acc = top_k_accuracy_score(y_test, y_prob, k=3)

# Scalar metrics reported before the confusion matrix, in display order.
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
    ("AUC-ROC", roc_auc),
    ("AUC-PR", pr_auc),
    ("Log Loss", log_loss_value),
    ("Cohen's Kappa", kappa),
):
    print(f"{label}: {value:.3f}")

# The confusion matrix is a 2D array, so it is printed on its own lines.
print(f"Confusion Matrix:\n{conf_matrix}")

# Remaining scalar metrics, reported after the confusion matrix.
for label, value in (
    ("Average Brier Score", brier_score_avg),
    ("Matthews Correlation Coefficient", mcc),
    ("Fowlkes-Mallows Index", fmi),
    ("Coefficient of Determination (R^2)", r2),
    ("Top-3 Accuracy", top_k_acc),
):
    print(f"{label}: {value:.3f}")

AUC-ROC for Class 0: 0.970
AUC-ROC for Class 1: 0.967
AUC-ROC for Class 2: 0.998
AUC-ROC for Class 3: 0.999
AUC-ROC for Class 4: 0.994
AUC-ROC for Class 5: 0.997
AUC-ROC for Class 6: 0.999
Accuracy: 0.887
Precision: 0.887
Recall: 0.826
F1-Score: 0.852
AUC-ROC: 0.989
AUC-PR: 0.931
Log Loss: 0.300
Cohen's Kappa: 0.817
Confusion Matrix:
[[36652  5634     2     0    30     6   233]
 [ 4071 51919   206     1   125   151    27]
 [    1   240  6536    36     3   305     0]
 [    0     0    79   425     0    22     0]
 [   28   732    37     0  1187    11     0]
 [   12   223   463    19     2  2770     0]
 [  431    27     0     0     0     0  3557]]
Average Brier Score: 0.025
Matthews Correlation Coefficient: 0.817
Fowlkes-Mallows Index: 0.805
Coefficient of Determination (R^2): 0.744
Top-3 Accuracy: 1.000
