In [1]:
import os

from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr


In [2]:
# Point awswrangler at the S3-compatible endpoint.
# The endpoint can be overridden without editing the notebook by exporting
# WR_S3_ENDPOINT_URL (awswrangler's environment-variable name for this
# setting); when it is unset or empty the original LAN address is used.
wr.config.s3_endpoint_url = (
    os.environ.get("WR_S3_ENDPOINT_URL") or "http://192.168.1.7:8333"
)


In [3]:

# Read every parquet file under the prefix into a single DataFrame.
# NOTE(review): dataset=True presumably makes awswrangler treat the prefix as
# a (possibly partitioned) dataset — confirm against the awswrangler docs.
multilabelDf = wr.s3.read_parquet(path="s3://multilabel_df/", dataset=True)

In [4]:
# Rich display of the loaded frame as a quick sanity check of its columns.
multilabelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear,CaseID,Circumstances_of_Accident
0,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,12657878,Collision- Head to Rear (Insured Hit TP)
1,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,12657888,Lost control- Overturned
2,1,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657890,
3,1,1,1,0,0,0,1,1,0,1,...,1,0,0,0,0,0,0,0,12657900,Collided into animal
4,1,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657903,Collision- Head to Rear (Insured Hit TP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621787,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10147236,
621788,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10936139,Collision- Head to Rear (TP Hit Insured)
621789,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174259,
621790,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174498,


In [5]:
# Model inputs: the free-text accident circumstance plus every column whose
# name contains "vision_".
caseFeatures = ["Circumstances_of_Accident"]
allVisionFeatures = [col for col in multilabelDf.columns if "vision_" in col]
allInputFeature = [*caseFeatures, *allVisionFeatures]

# Targets: every remaining column except the CaseID identifier.
targetCol = [
    col
    for col in multilabelDf.columns
    if col != "CaseID" and col not in allInputFeature
]
len(targetCol)

34

In [6]:
# Show the first 10 rows of the target (label) columns only.
multilabelDf[targetCol].head(10)

Unnamed: 0,bonnet,bumper_front,grille,fog_lamp_rh,headlamp_lh,headlamp_rh,door_front_lh,door_front_rh,air_conditioning,cooling_fan,...,fog_lamp_lh,tail_lamp_rh,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,1,1,1,...,0,1,1,0,0,0,0,0,0,0
4,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,0,1,1,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
7,1,1,1,0,1,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
8,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,1,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
# Fixed seed so the train/test partition (and therefore every metric computed
# below) is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42

X = multilabelDf[allInputFeature]
Y = multilabelDf[targetCol]

# Uses train_test_split's default train/test proportions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=SPLIT_SEED
)

# Every input column is declared categorical for CatBoost; allInputFeature is
# exactly caseFeatures + allVisionFeatures.
train_pool = Pool(X_train, Y_train, cat_features=allInputFeature)
test_pool = Pool(X_test, Y_test, cat_features=allInputFeature)

In [8]:
# Display the target frame (multilabelDf restricted to targetCol).
Y

Unnamed: 0,bonnet,bumper_front,grille,fog_lamp_rh,headlamp_lh,headlamp_rh,door_front_lh,door_front_rh,air_conditioning,cooling_fan,...,fog_lamp_lh,tail_lamp_rh,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,1,1,1,...,0,1,1,0,0,0,0,0,0,0
4,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Multilabel CatBoost model: one output per column in targetCol, trained with
# MultiCrossEntropy and monitored with Accuracy.
# NOTE(review): the test pool is also used as the eval set, so scores later
# reported on X_test may be optimistic — consider a separate validation split.
catboostParams = {
    "loss_function": "MultiCrossEntropy",
    "eval_metric": "Accuracy",
    "iterations": 20,
    # "task_type": "GPU",
    "class_names": targetCol,
}
clf = CatBoostClassifier(**catboostParams)

# Report the metric every 5 iterations, both as text and as a live plot.
clf.fit(
    train_pool,
    eval_set=test_pool,
    metric_period=5,
    plot=True,
    verbose=5,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.0532504	test: 0.0539602	best: 0.0539602 (0)	total: 5.58s	remaining: 1m 46s
5:	learn: 0.4988678	test: 0.4972081	best: 0.4972081 (5)	total: 31.3s	remaining: 1m 12s
10:	learn: 0.5078483	test: 0.5057190	best: 0.5057190 (10)	total: 54.3s	remaining: 44.4s
15:	learn: 0.5121027	test: 0.5096174	best: 0.5096174 (15)	total: 1m 17s	remaining: 19.4s
19:	learn: 0.5133056	test: 0.5107431	best: 0.5107431 (19)	total: 1m 36s	remaining: 0us

bestTest = 0.5107431424
bestIteration = 19



<catboost.core.CatBoostClassifier at 0x7f65c2bfe5f0>

In [10]:
# Predicted labels for the held-out rows and for the training rows.
test_predict, train_predict = clf.predict(X_test), clf.predict(X_train)

In [11]:
def hamming_score(y_true, y_pred):
    """Average, over rows, the fraction of true labels that were predicted.

    For each row, ``true`` and ``pred`` are the sets of column indices holding
    a truthy value, and the row score is ``|true & pred| / |true|``.

    Note that the denominator is the number of *true* labels (not the size of
    the union of true and predicted labels), so each row score is a per-sample
    recall; extra predicted labels are not penalised.

    Edge cases:
      * no true labels and no predicted labels -> the row scores 1;
      * no true labels but at least one predicted label -> the row scores 0
        (this case used to raise ``ZeroDivisionError``).

    Parameters
    ----------
    y_true, y_pred : 2-D array-like
        Indicator matrices indexed by row; ``y_true.shape[0]`` gives the number
        of rows that are scored.

    Returns
    -------
    tuple
        ``(mean_score, per_row_scores)`` where ``per_row_scores`` is a list
        with one score per row.
    """
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        if not set_true:
            # Nothing to recall: perfect only if nothing was predicted either.
            row_score = 1.0 if not set_pred else 0.0
        else:
            row_score = len(set_true & set_pred) / len(set_true)
        acc_list.append(row_score)
    return np.mean(acc_list), acc_list

In [12]:
# Overall Accuracy on the test rows; eval_metric returns a list, of which the
# first entry is kept.
accuracy = eval_metric(
    label=Y_test.values,
    approx=test_predict,
    metric="Accuracy",
)[0]
print(f"Accuracy: {accuracy}")

Accuracy: 0.5107431424013175


In [13]:
# hamming_score returns (mean score, per-row scores); the mean is displayed.
# NOTE(review): despite the variable name, this is not subset (exact-match)
# accuracy — hamming_score divides the matched labels by the number of true
# labels in each row.
subsetAcc2, acc_list = hamming_score(Y_test.values, test_predict)
subsetAcc2

0.9282804068578041

In [24]:
# Recall cut-offs used to count weak (<= low) and strong (>= high) classes.
recallLowThreshold = 0.2
highRecallThreshold = 0.6

clsOrder = clf.classes_

# Declared for later use; nothing in this cell fills them.
allSupport = []
allBalanceAcc = []

# One list of per-class values for each metric requested from eval_metric.
allPrecision, allRecall, allF1, allAcc = [], [], [], []
perClassByMetric = {
    "Precision": allPrecision,
    "Recall": allRecall,
    "F1": allF1,
    "Accuracy": allAcc,
}
avgPrecision = avgRecall = avgF1 = 0

for metric, perClass in perClassByMetric.items():
    values = eval_metric(Y_test.values, test_predict, metric)
    # Pairing with clf.classes_ caps the stored values at the number of classes.
    perClass.extend([value for cls, value in zip(clf.classes_, values)])
    if metric == "Precision":
        avgPrecision = np.mean(values)
    elif metric == "Recall":
        avgRecall = np.mean(values)
    elif metric == "F1":
        avgF1 = np.mean(values)
    print()

# The two thresholds do not overlap, so each class is counted at most once.
lowRecallCount = sum(1 for value in allRecall if value <= recallLowThreshold)
highRecallCount = sum(1 for value in allRecall if value >= highRecallThreshold)

print(f"Avg Precision : {avgPrecision}")
print(f"Avg Recall : {avgRecall}")
print(f"Avg F1 : {avgF1}")
print(f"Low Recall Count : {lowRecallCount}")
print(f"High Recall Count : {highRecallCount}")
print(f"all acc : {allAcc}")

# One row per class; the averages are scalars repeated on every row.
summaryPerfDf = pd.DataFrame(
    {
        "recall": allRecall,
        "precision": allPrecision,
        "f1": allF1,
        "part": clsOrder,
        "avgRecall": avgRecall,
        "avgPrecision": avgPrecision,
        "avgF1": avgF1,
    }
)





Avg Precision : 0.890978878901951
Avg Recall : 0.853915450322246
Avg F1 : 0.8642732147360281
Low Recall Count : 1
High Recall Count : 27
all acc : [0.5107431424013175]


In [25]:
# acc = accuracy_score(Y_test.values, test_predict)
# acc

In [26]:
# Headline scores as (metric name, value) rows for a bar chart.
# `accuracy` comes from eval_metric(..., "Accuracy") and `subsetAcc2` from
# hamming_score. Exact match and subset accuracy are two names for the same
# metric, so the hamming_score bar is labelled "HammingScore" to keep the two
# bars from appearing to contradict each other.
avgDf = (
    pd.DataFrame(
        {
            "AvgRecall": [avgRecall],
            "AvgPrecision": [avgPrecision],
            "AvgF1": [avgF1],
            "ExactMatch": [accuracy],
            "HammingScore": [subsetAcc2],
        }
    )
    .transpose()
    .reset_index()
)

# After transpose/reset_index the metric names are in "index" and the values
# in column 0.
fig = px.bar(avgDf, x="index", y=0, text_auto=True)

fig

In [27]:
# Per-part recall with the mean recall drawn as a red reference line; the
# y-axis shows fixed ticks plus the mean recall formatted to one decimal.
fig = (
    px.bar(summaryPerfDf, x="part", y="recall")
    .add_hline(y=avgRecall, line_width=2, line_color="red")
    .update_layout(
        yaxis={
            "tickmode": "array",
            "tickvals": [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
        }
    )
)
fig

In [28]:
# Per-part precision with the mean *precision* drawn as a red reference line.
# (The reference line and the extra tick must use avgPrecision, not avgRecall,
# to match the metric plotted on the bars.)
fig = px.bar(summaryPerfDf, x="part", y="precision")

fig.add_hline(y=avgPrecision, line_width=2, line_color="red")

fig.update_layout(
    yaxis=dict(
        tickmode="array",
        tickvals=[np.format_float_positional(avgPrecision, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [29]:
# Per-part F1 with the mean *F1* drawn as a red reference line.
# (The reference line and the extra tick must use avgF1, not avgRecall, to
# match the metric plotted on the bars.)
fig = px.bar(summaryPerfDf, x="part", y="f1")

fig.add_hline(y=avgF1, line_width=2, line_color="red")

fig.update_layout(
    yaxis=dict(
        tickmode="array",
        tickvals=[np.format_float_positional(avgF1, 1), 0.2, 0.5, 0.7],
    )
)
fig