In [83]:
import pandas as pd
import numpy as np
import seaborn as sns


import matplotlib.pyplot as plt
import shap

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [84]:
def cost_function(y_true, y_pred, cost_fp=10, cost_fn=500, cost_tp=25, cost_tn=0):
    """Return the total cost implied by a set of binary predictions.

    Each confusion-matrix cell is multiplied by its per-case cost and the
    products are summed. The default costs are the ones specified in the
    challenge.

    Args:
        y_true: Ground-truth labels, encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted labels, using the same 0/1 encoding.
        cost_fp: Cost of one false positive.
        cost_fn: Cost of one false negative.
        cost_tp: Cost of one true positive.
        cost_tn: Cost of one true negative.

    Returns:
        The summed cost over all samples.
    """
    # Pin the label order explicitly: without `labels`, a batch in which only
    # one class occurs yields a 1x1 matrix and the 4-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return fp * cost_fp + fn * cost_fn + tp * cost_tp + tn * cost_tn

In [85]:
# Read the processed present-year air-system table that is evaluated below.
air_systems_present = pd.read_csv(
    "../../data/processed/treated_air_system_present_year.csv"
)

In [86]:
# Split the table into the target column ("class") and the remaining features.
y_test = air_systems_present["class"]
X_test = air_systems_present.drop(columns=["class"])

Loading trained model and predicting

Having no model whatsoever to identify trucks, every positive case would be missed at the false-negative cost of 500 each, so we would have the following cost:

In [87]:
# No-model baseline: every positive goes undetected, so each one is charged
# the false-negative cost of 500.
500 * y_test.sum()

187500.0

Let's predict on the test set.

In [89]:
# Restore the previously trained classifier from disk. The path is relative,
# so it is resolved against the kernel's working directory.
model = CatBoostClassifier()
# As the cell's last expression, this call's return value is what gets displayed.
model.load_model("best_catboost_model.bin")

<catboost.core.CatBoostClassifier at 0x17cdc6900>

In [90]:
# Model predictions for the test features; these are scored against y_test below.
y_pred = model.predict(X_test)

Using our model, instead, we would have the following cost:

In [92]:
# Total cost when using the model's predictions as returned by `predict`.
cost_function(y_true=y_test, y_pred=y_pred)

57985

If we adjust our decision threshold, we can reduce it even further.

In [93]:
# Custom decision threshold (provenance not shown in this notebook — TODO confirm
# it was not tuned on this same test set).
threshold = 0.3621532903099461

y_pred_prob = model.predict_proba(X_test)

# Column 1 is treated as the positive-class probability; a sample is flagged
# positive when that probability is at or above the threshold.
positive_class_prob = y_pred_prob[:, 1]
y_pred_adjusted = (positive_class_prob >= threshold).astype(int)

In [94]:
# Total cost when classifying with the adjusted decision threshold.
cost_function(y_true=y_test, y_pred=y_pred_adjusted)

50535

Compared with the no-model baseline, we get a final 73% cost reduction.

In [95]:
# Derive the reduction from the live values rather than from numbers copied out
# of earlier cell outputs, so it stays correct if the data, model or threshold change.
baseline_cost = y_test.sum() * 500  # no-model baseline: every positive is missed at a cost of 500
adjusted_cost = cost_function(y_test, y_pred_adjusted)

# Percentage of the baseline cost saved by the thresholded model.
((baseline_cost - adjusted_cost) / baseline_cost) * 100

73.048