In [151]:
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import plotly.express as px
import shap
import seaborn as sns
from IPython import display
import matplotlib.pyplot as plt
import awswrangler as wr
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    multilabel_confusion_matrix,
)
from catboost.utils import select_threshold
from skmultilearn.model_selection.iterative_stratification import (
    IterativeStratification,
)

In [152]:
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"


In [153]:

multilabelDf = wr.s3.read_parquet(
    path=f"s3://multilabel_df/",
    dataset=True,
)

In [154]:
multilabelDf

Unnamed: 0,vision_bonnet,vision_bumper_front,vision_grille,vision_headlamp_rh,vision_headlamp_lh,vision_door_front_lh,vision_door_front_rh,vision_engine,vision_bumper_rear,vision_misc,...,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear,CaseID,Circumstances_of_Accident
0,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,12657878,Collision- Head to Rear (Insured Hit TP)
1,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,12657888,Lost control- Overturned
2,1,1,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657890,
3,1,1,1,0,0,0,1,1,0,1,...,1,0,0,0,0,0,0,0,12657900,Collided into animal
4,1,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,12657903,Collision- Head to Rear (Insured Hit TP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621787,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10147236,
621788,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10936139,Collision- Head to Rear (TP Hit Insured)
621789,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174259,
621790,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,10174498,


In [155]:
allVisionFeatures = [x for x in multilabelDf.columns if "vision_" in x]
caseFeatures = [
    # "Circumstances_of_Accident",
   
]
allInputFeature = caseFeatures + allVisionFeatures
targetCol = [
    x
    for x in multilabelDf.columns
    if x not in allInputFeature and x != "CaseID"
    and x != "Circumstances_of_Accident"
]
len(targetCol)

34

In [156]:
multilabelDf[targetCol].head(10)

Unnamed: 0,bonnet,bumper_front,grille,fog_lamp_rh,headlamp_lh,headlamp_rh,door_front_lh,door_front_rh,air_conditioning,cooling_fan,...,fog_lamp_lh,tail_lamp_rh,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,1,1,1,...,0,1,1,0,0,0,0,0,0,0
4,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6,1,1,1,0,1,1,0,0,1,1,...,0,0,1,0,0,0,0,0,0,0
7,1,1,1,0,1,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
8,1,1,1,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,1,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0


In [157]:
targetCol

['bonnet',
 'bumper_front',
 'grille',
 'fog_lamp_rh',
 'headlamp_lh',
 'headlamp_rh',
 'door_front_lh',
 'door_front_rh',
 'air_conditioning',
 'cooling_fan',
 'radiator',
 'bumper_rear',
 'misc',
 'engine',
 'airbag',
 'front_panel',
 'undercarriage',
 'wheel',
 'fender_front_lh',
 'fender_front_rh',
 'door_mirror_rh',
 'rear_quarter_lh',
 'interior',
 'tail_lamp_lh',
 'fog_lamp_lh',
 'tail_lamp_rh',
 'windscreen_front',
 'rear_compartment',
 'rear_panel',
 'rear_quarter_rh',
 'door_rear_rh',
 'door_mirror_lh',
 'door_rear_lh',
 'windscreen_rear']

In [161]:
X = multilabelDf[allInputFeature]
Y = multilabelDf[targetCol]
X.dtypes


vision_bonnet              Int64
vision_bumper_front        Int64
vision_grille              Int64
vision_headlamp_rh         Int64
vision_headlamp_lh         Int64
vision_door_front_lh       Int64
vision_door_front_rh       Int64
vision_engine              Int64
vision_bumper_rear         Int64
vision_misc                Int64
vision_front_panel         Int64
vision_non_external        Int64
vision_wheel               Int64
vision_fender_front_lh     Int64
vision_fender_front_rh     Int64
vision_rear_quarter_lh     Int64
vision_tail_lamp_lh        Int64
vision_tail_lamp_rh        Int64
vision_windscreen_front    Int64
vision_rear_compartment    Int64
vision_rear_panel          Int64
vision_rear_quarter_rh     Int64
vision_door_rear_rh        Int64
vision_door_rear_lh        Int64
dtype: object

In [163]:
X = multilabelDf[allInputFeature]
Y = multilabelDf[targetCol]
skf = IterativeStratification(n_splits=2)
for train_index, test_index in skf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y.loc[train_index], Y.loc[test_index]
    train_pool = Pool(X_train, Y_train, cat_features=caseFeatures + allVisionFeatures)
    test_pool = Pool(X_test, Y_test, cat_features=caseFeatures + allVisionFeatures)

TypeError: no supported conversion for types: (dtype('O'),)

In [None]:
allPosCount = {}
for i in targetCol:
    posCount = len(Y_train[Y_train[i] == 1]) / len(Y_train)
    allPosCount[i] = posCount
allPosCount

{'bonnet': 0.3534643953819498,
 'bumper_front': 0.6355394301202546,
 'grille': 0.35878021374779134,
 'fog_lamp_rh': 0.1349154272382619,
 'headlamp_lh': 0.4339393237610005,
 'headlamp_rh': 0.43828804487674333,
 'door_front_lh': 0.10931201001835555,
 'door_front_rh': 0.11594016434220232,
 'air_conditioning': 0.28096641106136244,
 'cooling_fan': 0.15867256788979808,
 'radiator': 0.3163737498498962,
 'bumper_rear': 0.37533237266910263,
 'misc': 0.37593064347348737,
 'engine': 0.3229375739797231,
 'airbag': 0.08985641500694766,
 'front_panel': 0.22570891873809892,
 'undercarriage': 0.204520697167756,
 'wheel': 0.15823512257046302,
 'fender_front_lh': 0.21062777692004186,
 'fender_front_rh': 0.2259362187569691,
 'door_mirror_rh': 0.06505498087248898,
 'rear_quarter_lh': 0.05553196781774827,
 'interior': 0.1017189027842108,
 'tail_lamp_lh': 0.11928319009143465,
 'fog_lamp_lh': 0.10224855471497435,
 'tail_lamp_rh': 0.1283730465064416,
 'windscreen_front': 0.16998181599849038,
 'rear_compartmen

In [None]:
Y

Unnamed: 0,bonnet,bumper_front,grille,fog_lamp_rh,headlamp_lh,headlamp_rh,door_front_lh,door_front_rh,air_conditioning,cooling_fan,...,fog_lamp_lh,tail_lamp_rh,windscreen_front,rear_compartment,rear_panel,rear_quarter_rh,door_rear_rh,door_mirror_lh,door_rear_lh,windscreen_rear
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,1,1,1,...,0,1,1,0,0,0,0,0,0,0
4,1,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
621790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
clf = CatBoostClassifier(
    loss_function="MultiCrossEntropy",
    eval_metric="Accuracy",
    iterations=20,
    # task_type="GPU",
    class_names=targetCol,
)
clf.fit(train_pool, eval_set=test_pool, metric_period=5, plot=True, verbose=5)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.0534627	test: 0.0539923	best: 0.0539923 (0)	total: 4.53s	remaining: 1m 26s
5:	learn: 0.4984303	test: 0.4969829	best: 0.4969829 (5)	total: 27.7s	remaining: 1m 4s
10:	learn: 0.5053051	test: 0.5033194	best: 0.5033194 (10)	total: 51.6s	remaining: 42.2s
15:	learn: 0.5127695	test: 0.5108911	best: 0.5108911 (15)	total: 1m 14s	remaining: 18.6s
19:	learn: 0.5135501	test: 0.5117081	best: 0.5117081 (19)	total: 1m 34s	remaining: 0us

bestTest = 0.5117080953
bestIteration = 19



<catboost.core.CatBoostClassifier at 0x7f186cb98310>

In [None]:
test_predict = clf.predict(X_test)
train_predict = clf.predict(X_train)
test_predict

array([[0, 1, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [None]:
test_predict_prob = clf.predict_proba(X_test)
test_predict_prob.shape

(155448, 34)

In [None]:
rescaledPred = test_predict_prob > 0.5
rescaledPred = rescaledPred.astype('uint8')
test_predict = rescaledPred

In [None]:
def hamming_score(y_true, y_pred):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list), acc_list

In [None]:
accuracy = eval_metric(Y_test.values, test_predict, "Accuracy")[0]
print(f"Accuracy: {accuracy}")

Accuracy: 0.5117080953116154


In [None]:
subsetAcc2, acc_list = hamming_score(Y_test.values, test_predict)
subsetAcc2

0.9288017811101247

In [None]:
# acc = accuracy_score(Y_test.values, test_predict)
rounded_preds = np.argmax(test_predict, axis=1)
rounded_gt = np.argmax(Y_test.values, axis=1)
clsOrder = clf.classes_

confMat = multilabel_confusion_matrix(rounded_gt, rounded_preds)
exactMatchAcc = accuracy_score(Y_test.values.astype(np.int64), test_predict)
allMetricsByPart = []
for part, row in zip(clsOrder, confMat):
    tn = row[0][0]
    tp = row[1][1]
    fp = row[0][1]
    fn = row[1][0]
    totalSample = fp + fn + tp + tn
    acc = (tp + tn) / (fp + fn + tp + tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    assert (tp / (tp + fn)) + (fn / (tp + fn)) == (tn / (tn + fp)) + (
        fp / (tn + fp)
    )
    # print(tp)
    allMetricsByPart.append(
        {
            "part": part,
            "tp": tp / (tp + fn),
            "tn": tn / (tn + fp),
            "fp": fp / (tn + fp),
            "fn": fn / (tp + fn),
            "acc": acc,
            "precision": precision,
            "recall": recall,
            "f1": (2 * precision * recall) / (precision + recall),
            "pos_count" : allPosCount[part]
        }
    )
evalMetrics = pd.json_normalize(allMetricsByPart)
avgPrecision = evalMetrics["precision"].mean()
avgRecall = evalMetrics["recall"].mean()
avgF1 = evalMetrics["f1"].mean()
avgTp = evalMetrics["tp"].mean()
avgTn = evalMetrics["tn"].mean()
avgAcc = evalMetrics["acc"].mean()
avgFn = evalMetrics["fn"].mean()

print(f"Avg Precision : {avgPrecision}")
print(f"Avg Recall : {avgRecall}")
print(f"Avg F1 : {avgF1}")
print(f"Avg TP : {avgTp}")
print(f"Avg TN : {avgTn}")
print(f"Avg FN : {avgFn}")

print(f"avgAccs : {avgAcc}")
evalMetrics

Avg Precision : 0.909386122766545
Avg Recall : 0.6972843259171798
Avg F1 : 0.8504109423126344
Avg TP : 0.6972843259171798
Avg TN : 0.9996077648014771
Avg FN : 0.3027156740828203
avgAccs : 0.9992533913364634



invalid value encountered in long_scalars



Unnamed: 0,part,tp,tn,fp,fn,acc,precision,recall,f1,pos_count
0,bonnet,1.0,0.99975,0.00025,0.0,0.999839,0.999548,1.0,0.999774,0.353464
1,bumper_front,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.635539
2,grille,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.35878
3,fog_lamp_rh,0.815223,0.998744,0.001256,0.184777,0.996603,0.8845,0.815223,0.84845,0.134915
4,headlamp_lh,0.97981,0.998582,0.001418,0.02019,0.998379,0.883298,0.97981,0.929054,0.433939
5,headlamp_rh,0.806277,0.998725,0.001275,0.193723,0.997581,0.79087,0.806277,0.798499,0.438288
6,door_front_lh,0.989037,0.999987,1.3e-05,0.010963,0.999723,0.99946,0.989037,0.994221,0.109312
7,door_front_rh,0.999741,0.99998,2e-05,0.000259,0.999974,0.999223,0.999741,0.999482,0.11594
8,air_conditioning,0.015504,0.999994,6e-06,0.984496,0.99836,0.8,0.015504,0.030418,0.280966
9,cooling_fan,0.0,1.0,0.0,1.0,0.99899,,0.0,,0.158673


In [None]:
avgDf = pd.DataFrame(
    {
        "AvgRecall" : [avgRecall],
        "AvgPrecision" : [avgPrecision],
        "AvgF1" : [avgF1],
        "ExactMatch" : [accuracy],
        "SubsetAccuracy" : [subsetAcc2],


    }
).transpose().reset_index()
# fig = px.bar(avgDf, x = 'part', y = 'recall')
fig = px.bar(avgDf, x = 'index', y = 0, text_auto=True)

fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'pos_count')

fig.add_hline(y=avgFn, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'fn')

fig.add_hline(y=avgFn, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'recall')

fig.add_hline(y=avgRecall, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'precision')

fig.add_hline(y=avgRecall, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'tp')

fig.add_hline(y=avgTp, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'tn')

fig.add_hline(y=avgTn, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
fig = px.bar(evalMetrics, x = 'part', y = 'acc')

fig.add_hline(y=avgAcc, line_width=2, line_color="red")

fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = [np.format_float_positional(avgRecall, 1), 0.2, 0.5, 0.7],
    )
)
fig

In [None]:
evalMetrics['acc'].corr(evalMetrics["pos_count"])

-0.3574339418190395