In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

In [None]:
# random_state = 100

# if not os.path.isdir("model_dumps"):
#     os.makedirs("model_dumps")

## Data Loading

Set `CSV_FILENAME` in the next cell to the dataset CSV you want to load.

In [None]:
# Path of the compiled feature dataset; edit this to load a different CSV.
CSV_FILENAME = "ths-st3 compiled dataset.csv" #<-- update csv name
# Read the CSV into a DataFrame and preview its first rows.
df_features = pd.read_csv(CSV_FILENAME)
df_features.head()

```
df_features = df_features.drop(columns = ['Unnamed: 0', 'path', 'source_w', 'source_h', 'face_index'])
df_features = df_features.loc[df_features["e_bbox_yf"] < 3, :]
print(df_features.columns)
```

Change string types to numeric types

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode every bool/object column of df_features.
# `apply` calls `encoder.fit_transform` once per column, so the single encoder
# is re-fit each time and afterwards only holds the classes of the last column.
encoder = LabelEncoder()
categorical_columns = df_features.select_dtypes(include=[bool, object]).columns
encoded_columns = df_features[categorical_columns].apply(encoder.fit_transform)
# Display the encoded columns for inspection.
encoded_columns

In [None]:
# Work on a copy so df_features keeps its original, unencoded values.
df_encoded_features = df_features.copy()
# Overwrite the categorical columns with their label-encoded counterparts.
df_encoded_features[categorical_columns] = encoded_columns
df_encoded_features

Split into X and Y

In [None]:
# Region identifiers. A region string is embedded in the feature column names
# built by get_features_and_label / group_shap_features.
feat_regions = ["bbox", "mask"]

# For each supported color space: the column-name prefixes of its three channels,
# in the order in which their histogram-bin columns are appended to the feature list.
color_channels = {
    "RGB": ("R_BIN_", "G_BIN_", "B_BIN_"),
    "HSV": ("H_HSV_BIN_", "S_HSV_BIN_", "V_HSV_BIN_"),
    "HSL": ("H_HSL_BIN_", "S_HSL_BIN_", "L_HSL_BIN_"),
    "LAB": ("L_LAB_BIN_", "A_LAB_BIN_", "B_LAB_BIN_"),
    "YCBCR": ("Y_BIN_", "CR_BIN_", "CB_BIN_"),
}

# Region -> label column. The active mapping is deliberately the reverse of the
# commented-out one directly below (mask -> e_bbox_yf, bbox -> e_face_yf).
#labels = {"bbox": "e_bbox_yf", "mask": "e_face_yf"}
labels = {"mask": "e_bbox_yf", "bbox": "e_face_yf"} #<-- inverted

In [None]:
def get_features_and_label(color_space, region):
    """Build the ordered feature-column list and look up the label column.

    Args:
        color_space: key of the module-level ``color_channels`` dict.
        region: region string embedded in the column names; also the key
            used to look up the label in the module-level ``labels`` dict.

    Returns:
        ``(features, label)``: ``features`` holds the six leading columns,
        then 26 bin columns per color channel, 26 LBP bin columns and
        20 bin columns for each of SOBELX, SOBELY and SOBEL; ``label`` is
        ``labels[region]``.
    """
    def bin_columns(prefix, n_bins):
        # Column names follow the pattern "<prefix><region>_<bin index>".
        return [prefix + region + "_" + str(bin_idx) for bin_idx in range(n_bins)]

    features = ["w", "h", "x", "y", "obj_score", "class_score"]
    for channel_prefix in color_channels[color_space]:
        features.extend(bin_columns(channel_prefix, 26))

    texture_and_edge_bins = (
        ("LBP_BIN_", 26),
        ("SOBELX_BIN_", 20),
        ("SOBELY_BIN_", 20),
        ("SOBEL_BIN_", 20),
    )
    for prefix, n_bins in texture_and_edge_bins:
        features.extend(bin_columns(prefix, n_bins))

    return features, labels[region]

In [None]:
from sklearn.model_selection import train_test_split
CHOSEN_COLOR_SPACE = "HSV" #<-- pick a colorspace
CHOSEN_REGION = "bbox"
# Seed passed to train_test_split so the partition is reproducible.
random_state=100

# Only the feature-column list is used here; the label returned by the helper is discarded.
features, _ = get_features_and_label(CHOSEN_COLOR_SPACE, CHOSEN_REGION)
# NOTE(review): the label is hard-coded, whereas labels["bbox"] is "e_face_yf" and the
# model file loaded later is named "..._HSV_bbox_e_face_yf.pkl" -- confirm that
# "e_bbox_yf" is the intended evaluation target.
label = "e_bbox_yf"
X_features =  df_encoded_features.loc[:,  features]
y_features = df_encoded_features.loc[:, label].values #<-- pick label

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_features, test_size = 0.2, random_state=random_state)

In [None]:
# Show the column order of the design matrix.
X_features.columns.tolist()

In [None]:
# Show the feature list that was requested (for comparison with the columns above).
features

## Load Model

In [None]:
# Load a previously trained model from disk.
from sklearn.base import clone as clone_model
import joblib
import sys
# NOTE(review): presumably imported so the custom classes can be resolved when the
# dump is unpickled -- confirm.
from custom_mlp import CustomMLP
from custom_mlp import custom_scorer

# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load trusted dumps.
model = joblib.load("model_dumps/penalty_1.1578947368421053_HSV_bbox_e_face_yf.pkl")

In [None]:
# Display the loaded model's repr.
model

In [None]:
def positive_error(actual, pred):
    """Mean overshoot over the samples where the prediction exceeds the actual value.

    Args:
        actual: iterable of ground-truth values.
        pred: iterable of predicted values, paired element-wise with ``actual``
            (pairing stops at the shorter of the two).

    Returns:
        The average of ``pred - actual`` over pairs with ``pred > actual``,
        or ``0`` when no prediction exceeds its actual value.
    """
    overshoots = [
        predicted - truth
        for predicted, truth in zip(pred, actual)
        if predicted > truth
    ]
    if not overshoots:
        return 0
    return sum(overshoots) / len(overshoots)
def concealment_ratio(actual, pred):
    """Fraction of samples whose prediction is at least the actual value.

    Args:
        actual: sized collection of ground-truth values.
        pred: iterable of predicted values, paired element-wise with ``actual``.

    Returns:
        ``(# pairs with pred >= actual) / len(actual)``, or ``0`` when
        ``actual`` is empty (mirrors ``positive_error``, which also returns 0
        when there is nothing to average, instead of raising
        ``ZeroDivisionError``).
    """
    n_samples = len(actual)
    if n_samples == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0
    covered = sum(1 for predicted, truth in zip(pred, actual) if predicted >= truth)
    return covered / n_samples

# Report both metrics on the held-out split first, then on the training split.
# y_pred is left holding the training-set predictions, as before.
for split_name, X_split, y_split in (("Test", X_test, y_test), ("Train", X_train, y_train)):
    print(split_name)
    y_pred = model.predict(X_split)
    print("Positive Error", positive_error(y_split, y_pred))
    print("Face Percent", concealment_ratio(y_split, y_pred))

## SHAP

In [None]:
import shap

In [None]:
# Build a SHAP explainer around the model's predict function, passing X_test as the
# second (masker / background data) argument, then explain every row of X_test.
explainer = shap.Explainer(model.predict, X_test)
shap_values = explainer(X_test)
# shap_values = explainer.shap_values(X_test)

In [None]:
# Display the resulting Explanation object.
shap_values

In [None]:
# Bar plot of per-feature SHAP values; max_display=200 exceeds the 170 selected
# features, so no features are collapsed into a remainder bar.
shap.plots.bar(shap_values,  max_display=200)

## Grouped SHAP Values

In [None]:
# List every column of X_test, one per line.
for i in X_test.columns:
    print(i)

In [None]:
def group_shap_features(color_space, region):
    """Return the feature column names partitioned into groups.

    Args:
        color_space: key of the module-level ``color_channels`` dict.
        region: region string embedded in the column names.

    Returns:
        A tuple of lists: the six leading columns, one list of 26 bin columns
        per color channel of ``color_space``, 26 LBP bin columns, then
        20 bin columns each for SOBELX, SOBELY and SOBEL.
    """
    def bin_columns(prefix, n_bins):
        # Column names follow the pattern "<prefix><region>_<bin index>".
        return [prefix + region + "_" + str(bin_idx) for bin_idx in range(n_bins)]

    yolo = ["w", "h", "x", "y", "obj_score", "class_score"]
    channel_groups = [
        bin_columns(channel_prefix, 26)
        for channel_prefix in color_channels[color_space]
    ]
    return (
        yolo,
        *channel_groups,
        bin_columns("LBP_BIN_", 26),
        bin_columns("SOBELX_BIN_", 20),
        bin_columns("SOBELY_BIN_", 20),
        bin_columns("SOBEL_BIN_", 20),
    )

In [None]:
# Map a display name to the feature column(s) that make up each SHAP group.
groups = {}

# Unpack the eight lists returned by group_shap_features. The channel names
# 'Hue' / 'Saturation' / 'Value' match the H, S, V channel order of the "HSV"
# color space; they would be mislabelled if CHOSEN_COLOR_SPACE were changed.
groups['XYWH'], groups['Hue'], groups['Saturation'], groups['Value'], groups['LBP'], groups['Sobel X'], groups['Sobel Y'], groups['Sobel'] = group_shap_features(CHOSEN_COLOR_SPACE, CHOSEN_REGION)
# Split the six leading features into single-feature groups (each stored as a plain string).
groups['W'] = groups['XYWH'][0]
groups['H'] = groups['XYWH'][1]
groups['X'] = groups['XYWH'][2]
groups['Y'] = groups['XYWH'][3]
groups['Objectness Score'] = groups['XYWH'][4]
groups['Class Confidence'] = groups['XYWH'][5]
# Remove the combined entry now that it has been split; being the last
# expression of the cell, the popped list is also displayed.
groups.pop('XYWH')

In [None]:
# Inspect the final group -> column(s) mapping.
groups

In [None]:
# Collapse the per-feature SHAP output into one column per group:
# df_data holds the summed feature data, df_values the summed SHAP values.
df_data = pd.DataFrame()
df_values = pd.DataFrame()
for group_name, members in groups.items():
    # Single-feature groups are stored as a bare string; wrap them so both
    # kinds of group can be iterated the same way.
    if isinstance(members, str):
        members = [members]
    for position, feature in enumerate(members):
        column = X_test.columns.get_loc(feature)
        if position == 0:
            # The first member creates the group's column ...
            df_data[group_name] = shap_values.data[:, column]
            df_values[group_name] = shap_values.values[:, column]
            continue
        # ... and every further member is added onto it.
        df_data[group_name] += shap_values.data[:, column]
        df_values[group_name] += shap_values.values[:, column]

In [None]:
# Wrap the grouped sums in a shap.Explanation (base values carried over from the
# per-feature explanation) so they can be passed to SHAP's plotting functions.
aggregates = shap.Explanation(df_values.to_numpy(), base_values=shap_values.base_values, data=df_data.to_numpy(), feature_names=df_data.columns.tolist())

In [None]:
# Bar plot of the grouped SHAP values (13 groups, so max_display=18 shows all of them).
shap.plots.bar(aggregates, max_display=18)

## Directional Impact

In [None]:
# Print each feature's positional index next to its name; these positions are
# what the hard-coded column slices in the summary plots below refer to.
for x, y in enumerate(shap_values.feature_names):
    print(x, y)

In [None]:
# Human-readable axis labels for the summary plots, keyed by feature group.
# The color-channel and LBP groups have 26 bins each, the Sobel groups 20.
feat_names = {
    'XYWH': ['W', 'H', 'X', 'Y', "Objectness Score", "Class Confidence"],
    'Hue': ['Hue ' + str(bin_idx) for bin_idx in range(26)],
    'Saturation': ['Saturation ' + str(bin_idx) for bin_idx in range(26)],
    'Value': ['Value ' + str(bin_idx) for bin_idx in range(26)],
    'LBP': ['LBP ' + str(bin_idx) for bin_idx in range(26)],
    'Sobel X': ['Sobel X ' + str(bin_idx) for bin_idx in range(20)],
    'Sobel Y': ['Sobel Y ' + str(bin_idx) for bin_idx in range(20)],
    'Sobel': ['Sobel ' + str(bin_idx) for bin_idx in range(20)],
}

In [None]:
# One summary plot per feature group: (column slice into shap_values,
# key into feat_names, max_display). The slices follow the feature order
# 6 leading features, 3 x 26 color bins, 26 LBP bins, 3 x 20 Sobel bins.
summary_panels = (
    (slice(0, 6), 'XYWH', 18),
    (slice(6, 32), 'Hue', 18),
    (slice(32, 58), 'Saturation', 26),
    (slice(58, 84), 'Value', 26),
    (slice(84, 110), 'LBP', 26),
    (slice(110, 130), 'Sobel X', 20),
    (slice(130, 150), 'Sobel Y', 20),
    (slice(150, 170), 'Sobel', 20),
)
for column_slice, group_key, shown in summary_panels:
    shap.summary_plot(shap_values[:, column_slice], feature_names=feat_names[group_key], max_display=shown)

In [None]:
# NOTE(review): looks like a leftover shape check -- the slice 108:128 does not line up
# with any of the group boundaries used in the summary plots (e.g. Sobel X is 110:130).
shap_values[:, 108:128].shape