In [1]:
# Step 1: Mount Google Drive
from google.colab import drive
import os
import json
import numpy as np
from PIL import Image
from tqdm import tqdm

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import json
import cv2
import re
import numpy as np

# === PATHS ===
videos_path = "/content/drive/MyDrive/NUS_ISS_Talent_Experience_Resumes/cholect50-challenge-val/videos"
labels_path = "/content/drive/MyDrive/NUS_ISS_Talent_Experience_Resumes/cholect50-challenge-val/labels"
label_map_file = "/content/drive/MyDrive/NUS_ISS_Talent_Experience_Resumes/cholect50-challenge-val/label_mapping.txt"

# === STEP 1: LOAD triplet_id → instrument_id mapping ===
instrument_mapping = {}  # triplet_id -> instrument_id
with open(label_map_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        parts = [p.strip() for p in line.split(",") if p.strip() != ""]
        try:
            triplet_id = int(parts[0])
            instrument_id = int(parts[1])  # 2nd column = instrument ID
            instrument_mapping[triplet_id] = instrument_id
        except Exception:
            continue

# Determine number of instruments
mapped_insts = [v for v in instrument_mapping.values() if v >= 0]
num_instruments = max(mapped_insts) + 1 if mapped_insts else 6
print(f"Detected number of instruments = {num_instruments}")

# === STEP 2: BUILD X and Y ===
X, Y = [], []

for vid_folder in sorted(os.listdir(videos_path)):
    vid_path = os.path.join(videos_path, vid_folder)
    if not os.path.isdir(vid_path):
        continue

    label_file = f"{vid_folder}.json"
    label_path = os.path.join(labels_path, label_file)
    if not os.path.exists(label_path):
        print(f"⚠️ Missing label for {vid_folder}, skipping.")
        continue

    with open(label_path, "r") as f:
        data = json.load(f)

    annotations = data.get("annotations", {})

    # Build frame -> list of instrument IDs
    frame_instruments = {}
    for frame_id, triplets in annotations.items():
        try:
            frame_number = int(frame_id)
        except:
            continue

        insts_in_frame = []
        for triplet in triplets:
            if isinstance(triplet, (list, tuple)) and len(triplet) > 0:
                triplet_id = int(triplet[0])
            elif isinstance(triplet, dict):
                triplet_id = int(triplet.get("triplet_id", triplet.get("id", -1)))
            else:
                continue

            inst_id = instrument_mapping.get(triplet_id, -1)
            if inst_id != -1:
                insts_in_frame.append(inst_id)

        if insts_in_frame:
            frame_instruments[frame_number] = sorted(set(insts_in_frame))

    # Process each frame image
    frame_files = sorted([f for f in os.listdir(vid_path) if f.endswith(".png")])
    for frame_file in frame_files:
        match = re.match(r"(\d+)", frame_file)
        if not match:
            continue
        frame_number = int(match.group(1))
        if frame_number not in frame_instruments:
            continue

        img_path = os.path.join(vid_path, frame_file)
        img = cv2.imread(img_path)
        if img is None:
            continue

        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        X.append(img)

        # Create multi-hot vector for instruments in this frame
        vec = np.zeros((num_instruments,), dtype=np.uint8)
        for inst_id in frame_instruments[frame_number]:
            if 0 <= inst_id < num_instruments:
                vec[inst_id] = 1
        Y.append(vec)

# === STEP 3: Convert to numpy arrays ===
X = np.array(X, dtype=np.float32) / 255.0
Y = np.array(Y, dtype=np.uint8)

print(f"✅ Total frames processed: {len(X)}")
print(f"X shape: {X.shape}, Y shape: {Y.shape}")
print("Instrument indices present:", np.where(Y.sum(axis=0) > 0)[0].tolist())

Detected number of instruments = 6
✅ Total frames processed: 1209
X shape: (1209, 224, 224, 3), Y shape: (1209, 6)
Instrument indices present: [0, 1, 2, 3, 4, 5]


In [3]:
# === STEP 4: Per-instrument frame count ===
instrument_counts = Y.sum(axis=0)  # how many frames each instrument appears in

print("\n📊 Per-Instrument Frame Counts:")
for inst_id, count in enumerate(instrument_counts):
    print(f"Instrument {inst_id}: {int(count)} frames")

print(f"\nTotal frames (sum of counts may exceed N because frames can have multiple instruments): {len(X)}")


📊 Per-Instrument Frame Counts:
Instrument 0: 456 frames
Instrument 1: 277 frames
Instrument 2: 96 frames
Instrument 3: 135 frames
Instrument 4: 143 frames
Instrument 5: 256 frames

Total frames (sum of counts may exceed N because frames can have multiple instruments): 1209


In [4]:
# === STEP 4: Per-instrument frame count ===
instrument_counts = Y.sum(axis=0)  # how many frames each instrument appears in

print("\n📊 Per-Instrument Frame Counts:")
for inst_id, count in enumerate(instrument_counts):
    print(f"Instrument {inst_id}: {int(count)} frames")

total_frames = len(X)
print(f"\nTotal frames: {total_frames}")

# === STEP 5: Multi-instrument statistics ===
instruments_per_frame = Y.sum(axis=1)  # how many instruments in each frame
multi_inst_frames = np.sum(instruments_per_frame > 1)
single_inst_frames = np.sum(instruments_per_frame == 1)
no_inst_frames = np.sum(instruments_per_frame == 0)

print(f"\n🧮 Frame composition:")
print(f"Frames with NO instruments: {int(no_inst_frames)}")
print(f"Frames with ONE instrument: {int(single_inst_frames)}")
print(f"Frames with MULTIPLE instruments: {int(multi_inst_frames)}")

# Optional: detailed breakdown (e.g., how many frames have 2, 3, 4 tools)
unique_counts, counts = np.unique(instruments_per_frame, return_counts=True)
print("\nDetailed instrument count per frame:")
for k, v in zip(unique_counts, counts):
    print(f"{int(k)} instruments: {int(v)} frames")


📊 Per-Instrument Frame Counts:
Instrument 0: 456 frames
Instrument 1: 277 frames
Instrument 2: 96 frames
Instrument 3: 135 frames
Instrument 4: 143 frames
Instrument 5: 256 frames

Total frames: 1209

🧮 Frame composition:
Frames with NO instruments: 0
Frames with ONE instrument: 1055
Frames with MULTIPLE instruments: 154

Detailed instrument count per frame:
1 instruments: 1055 frames
2 instruments: 154 frames


In [5]:
instrument_counts = Y.sum(axis=0)
total_frames = len(X)
num_instruments = Y.shape[1]

from sklearn.utils.class_weight import compute_class_weight

# Flatten to single-label form for computing weights
y_single = np.argmax(Y, axis=1)  # crude approximation if you want one label per frame
# OR compute custom multi-label weights:
class_weights = total_frames / (num_instruments * instrument_counts)
class_weights = class_weights / class_weights.sum() * num_instruments  # normalize

print("⚖️ Class weights (normalized):")
for i, w in enumerate(class_weights):
    print(f"Instrument {i}: {w:.3f}")

⚖️ Class weights (normalized):
Instrument 0: 0.381
Instrument 1: 0.627
Instrument 2: 1.810
Instrument 3: 1.287
Instrument 4: 1.215
Instrument 5: 0.679


In [6]:
import cv2
import numpy as np
import random

def random_crop(img, crop_size=(180,180)):
    h, w, _ = img.shape
    ch, cw = crop_size
    if ch > h or cw > w:
        return cv2.resize(img, crop_size)
    top = random.randint(0, h - ch)
    left = random.randint(0, w - cw)
    cropped = img[top:top + ch, left:left + cw]
    return cv2.resize(cropped, (224, 224))

def random_rotate(img, angle_range=(-20, 20)):
    angle = random.uniform(*angle_range)
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), borderMode=cv2.BORDER_REFLECT_101)
    return rotated

# Augmentation function
def augment_image(img):
    img = random_crop(img)
    img = random_rotate(img)
    return img

# Desired balance target = max instrument frequency
max_count = int(instrument_counts.max())
print(f"\n🎯 Target frames per instrument after balancing: {max_count}")

X_balanced, Y_balanced = list(X), list(Y)

for inst_id in range(num_instruments):
    current_indices = [i for i, y in enumerate(Y) if y[inst_id] == 1]
    current_count = len(current_indices)
    needed = max_count - current_count
    if needed <= 0:
        continue

    print(f"Augmenting instrument {inst_id}: {needed} synthetic samples")

    for _ in range(needed):
        idx = random.choice(current_indices)
        img_aug = augment_image(X[idx])
        X_balanced.append(img_aug)
        Y_balanced.append(Y[idx])

X_balanced = np.array(X_balanced, dtype=np.float32)
Y_balanced = np.array(Y_balanced, dtype=np.uint8)

print(f"\n✅ Balanced dataset created.")
print(f"New X shape: {X_balanced.shape}, Y shape: {Y_balanced.shape}")


🎯 Target frames per instrument after balancing: 456
Augmenting instrument 1: 179 synthetic samples
Augmenting instrument 2: 360 synthetic samples
Augmenting instrument 3: 321 synthetic samples
Augmenting instrument 4: 313 synthetic samples
Augmenting instrument 5: 200 synthetic samples

✅ Balanced dataset created.
New X shape: (2582, 224, 224, 3), Y shape: (2582, 6)


In [7]:
# === STEP 4: Per-instrument frame count ===
instrument_counts = Y_balanced.sum(axis=0)  # how many frames each instrument appears in

print("\n📊 Per-Instrument Frame Counts:")
for inst_id, count in enumerate(instrument_counts):
    print(f"Instrument {inst_id}: {int(count)} frames")

print(f"\nTotal frames (sum of counts may exceed N because frames can have multiple instruments): {len(X_balanced)}")


📊 Per-Instrument Frame Counts:
Instrument 0: 628 frames
Instrument 1: 456 frames
Instrument 2: 456 frames
Instrument 3: 456 frames
Instrument 4: 456 frames
Instrument 5: 456 frames

Total frames (sum of counts may exceed N because frames can have multiple instruments): 2582


In [10]:
import numpy as np

# X is your numpy array of shape (N, H, W, 3), values in 0..1 (if normalized)
# If X is 0..255, remove the /255.0 part

X_min = np.min(X_balanced)
X_max = np.max(X_balanced)
X_mean = np.mean(X_balanced)

print(f"✅ Pixel value stats for X:")
print(f"Min pixel value: {X_min}")
print(f"Max pixel value: {X_max}")
print(f"Mean pixel value: {X_mean:.4f}")

# Optionally, compute per-channel stats
X_mean_channels = np.mean(X_balanced, axis=(0,1,2))
X_min_channels = np.min(X_balanced, axis=(0,1,2))
X_max_channels = np.max(X_balanced, axis=(0,1,2))

print(f"\nPer-channel mean: {X_mean_channels}")
print(f"Per-channel min: {X_min_channels}")
print(f"Per-channel max: {X_max_channels}")

✅ Pixel value stats for X:
Min pixel value: 0.0
Max pixel value: 1.0
Mean pixel value: 0.2805

Per-channel mean: [0.12949936 0.12949936 0.11996699]
Per-channel min: [0. 0. 0.]
Per-channel max: [1. 1. 1.]


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

# Assume X_balanced and Y_balanced are numpy arrays
# First split into train+val and test
X_temp, X_test, Y_temp, Y_test = train_test_split(
    X_balanced, Y_balanced, test_size=0.15, random_state=42, shuffle=True
)

# Then split temp into train and validation
X_train, X_val, Y_train, Y_val = train_test_split(
    X_temp, Y_temp, test_size=0.15, random_state=42, shuffle=True
)

print(f"Train: {X_train.shape[0]} frames")
print(f"Validation: {X_val.shape[0]} frames")
print(f"Test: {X_test.shape[0]} frames")

Train: 1864 frames
Validation: 330 frames
Test: 388 frames


In [9]:
import tensorflow as tf

BATCH_SIZE = 128
AUTOTUNE = tf.data.AUTOTUNE

def make_dataset(X, Y, batch_size=BATCH_SIZE, training=False):
    dataset = tf.data.Dataset.from_tensor_slices((X, Y))
    if training:
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size).prefetch(AUTOTUNE)
    return dataset

train_ds = make_dataset(X_train, Y_train, training=True)
val_ds   = make_dataset(X_val, Y_val, training=False)
test_ds  = make_dataset(X_test, Y_test, training=False)

Model Building

In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV2

IMG_SIZE = 224
num_instruments = Y.shape[1]
drop_rate = 0.3
num_heads = 4  # multihead attention heads
embed_dim = 128  # feature dimension for attention

def build_fast_mha_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=num_instruments):
    # === Backbone ===
    base = MobileNetV2(include_top=False, weights='imagenet', input_shape=input_shape, pooling=None)
    base.trainable = False  # freeze backbone for speed

    inp = layers.Input(shape=input_shape)
    x = base(inp)  # shape: (batch, h/32, w/32, channels)

    # Flatten spatial dimensions for attention: (batch, seq_len, channels)
    b, h, w, c = x.shape
    x_flat = layers.Reshape((-1, c))(x)  # seq_len = h*w

    # === Multi-Head Attention ===
    attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x_flat, x_flat)
    attn_out = layers.GlobalAveragePooling1D()(attn_out)

    # === Dense Head ===
    x = layers.Dropout(drop_rate)(attn_out)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(drop_rate)(x)
    out = layers.Dense(num_classes, activation='sigmoid')(x)  # multi-label output

    model = models.Model(inputs=inp, outputs=out)
    return model

# Build model
model = build_fast_mha_model()
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [11]:
import tensorflow as tf

try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("✅ Running on TPU")
except:
    strategy = tf.distribute.get_strategy()
    print("⚠️ TPU not found, running on default strategy (CPU/GPU)")

⚠️ TPU not found, running on default strategy (CPU/GPU)


In [16]:
with strategy.scope():
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(curve='ROC', multi_label=True)]
    )

In [17]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1),
    EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True, verbose=1)
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=callbacks
)

Epoch 1/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 11s/step - auc: 0.5116 - loss: 0.5772 - val_auc: 0.7909 - val_loss: 0.4628 - learning_rate: 1.0000e-04
Epoch 2/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 11s/step - auc: 0.6950 - loss: 0.4485 - val_auc: 0.8619 - val_loss: 0.3664 - learning_rate: 1.0000e-04
Epoch 3/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 11s/step - auc: 0.8410 - loss: 0.3549 - val_auc: 0.9014 - val_loss: 0.3023 - learning_rate: 1.0000e-04
Epoch 4/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 11s/step - auc: 0.9107 - loss: 0.2814 - val_auc: 0.9365 - val_loss: 0.2470 - learning_rate: 1.0000e-04
Epoch 5/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 11s/step - auc: 0.9451 - loss: 0.2253 - val_auc: 0.9526 - val_loss: 0.2197 - learning_rate: 1.0000e-04
Restoring model weights from the end of the best epoch: 5.


In [18]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, roc_auc_score

# Collect labels and predictions from validation/test set
y_true = np.concatenate([y for _, y in val_ds], axis=0)   # or test_ds
y_probs = np.concatenate([model.predict(x) for x, _ in val_ds], axis=0)
y_pred = (y_probs >= 0.5).astype(int)

# Micro metrics (global)
precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)

# Macro metrics (average per class)
precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

# Per-class metrics
num_classes = y_true.shape[1]
avg_precision = [average_precision_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]
roc_auc = [roc_auc_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]

print("\n=== Multi-label metrics ===")
print(f"Precision micro: {precision_micro:.4f}, Recall micro: {recall_micro:.4f}, F1 micro: {f1_micro:.4f}")
print(f"Precision macro: {precision_macro:.4f}, Recall macro: {recall_macro:.4f}, F1 macro: {f1_macro:.4f}")
print("Average precision per class:", np.round(avg_precision, 3))
print("ROC-AUC per class:", np.round(roc_auc, 3))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step

=== Multi-label metrics ===
Precision micro: 0.8297, Recall micro: 0.6995, F1 micro: 0.7590
Precision macro: 0.8482, Recall macro: 0.6947, F1 macro: 0.7559
Average precision per class: [0.851 0.814 0.919 0.904 0.781 0.896]
ROC-AUC per class: [0.945 0.914 0.981 0.979 0.933 0.964]


In [19]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, roc_auc_score
from sklearn.metrics import jaccard_score, hamming_loss

# Collect true labels and predictions from validation/test set
y_true = np.concatenate([y for _, y in val_ds], axis=0)   # or test_ds
y_probs = np.concatenate([model.predict(x) for x, _ in val_ds], axis=0)
y_pred = (y_probs >= 0.5).astype(int)

# ----------------------------
# Micro metrics (global)
# ----------------------------
precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)

# ----------------------------
# Macro metrics (average per class)
# ----------------------------
precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

# ----------------------------
# Per-class metrics
# ----------------------------
num_classes = y_true.shape[1]
avg_precision = [average_precision_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]
roc_auc = [roc_auc_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]

# ----------------------------
# Multi-label specific metrics
# ----------------------------
jaccard_micro = jaccard_score(y_true, y_pred, average='micro')
jaccard_macro = jaccard_score(y_true, y_pred, average='macro')
hamming = hamming_loss(y_true, y_pred)

# ----------------------------
# Print results
# ----------------------------
print("\n=== Multi-label metrics ===")
print(f"Precision micro: {precision_micro:.4f}, Recall micro: {recall_micro:.4f}, F1 micro: {f1_micro:.4f}")
print(f"Precision macro: {precision_macro:.4f}, Recall macro: {recall_macro:.4f}, F1 macro: {f1_macro:.4f}")
print(f"Jaccard index (micro): {jaccard_micro:.4f}, Jaccard index (macro): {jaccard_macro:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print("Average precision per class:", np.round(avg_precision, 3))
print("ROC-AUC per class:", np.round(roc_auc, 3))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step

=== Multi-label metrics ===
Precision micro: 0.8297, Recall micro: 0.6995, F1 micro: 0.7590
Precision macro: 0.8482, Recall macro: 0.6947, F1 macro: 0.7559
Jaccard index (micro): 0.6116, Jaccard index (macro): 0.6098
Hamming loss: 0.0843
Average precision per class: [0.851 0.814 0.919 0.904 0.781 0.896]
ROC-AUC per class: [0.945 0.914 0.981 0.979 0.933 0.964]


Different Training Strategy

In [17]:
import tensorflow as tf

def focal_loss_with_class_weights(class_weights, gamma=2.0, alpha=0.25):
    class_weights = tf.constant(class_weights, dtype=tf.float32)

    def loss(y_true, y_pred):
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1 - 1e-7)
        bce = -(y_true * tf.math.log(y_pred) + (1 - y_true) * tf.math.log(1 - y_pred))
        fl = alpha * tf.pow(1 - y_pred, gamma) * y_true * bce + \
             (1 - alpha) * tf.pow(y_pred, gamma) * (1 - y_true) * bce
        # apply per-class weights
        weighted_fl = fl * class_weights
        return tf.reduce_mean(weighted_fl)
    return loss

In [21]:
with strategy.scope():
    loss_fn = focal_loss_with_class_weights(class_weights, gamma=2.0, alpha=0.25)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss=loss_fn,
        metrics=[tf.keras.metrics.AUC(curve='PR', multi_label=True, name='pr_auc')]
    )

In [22]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=6,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.5)
    ]
)

Epoch 1/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 11s/step - loss: 0.0732 - pr_auc: 0.1980 - val_loss: 0.0436 - val_pr_auc: 0.4353 - learning_rate: 1.0000e-04
Epoch 2/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 11s/step - loss: 0.0446 - pr_auc: 0.2948 - val_loss: 0.0350 - val_pr_auc: 0.6151 - learning_rate: 1.0000e-04
Epoch 3/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 11s/step - loss: 0.0365 - pr_auc: 0.4522 - val_loss: 0.0284 - val_pr_auc: 0.7269 - learning_rate: 1.0000e-04
Epoch 4/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 11s/step - loss: 0.0307 - pr_auc: 0.5953 - val_loss: 0.0234 - val_pr_auc: 0.7880 - learning_rate: 1.0000e-04
Epoch 5/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 11s/step - loss: 0.0236 - pr_auc: 0.7407 - val_loss: 0.0193 - val_pr_auc: 0.8604 - learning_rate: 1.0000e-04
Epoch 6/6
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m

In [23]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.5)
    ]
)

Epoch 1/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 11s/step - loss: 0.0205 - pr_auc: 0.8162 - val_loss: 0.0166 - val_pr_auc: 0.8938 - learning_rate: 1.0000e-04
Epoch 2/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 11s/step - loss: 0.0154 - pr_auc: 0.8727 - val_loss: 0.0153 - val_pr_auc: 0.9180 - learning_rate: 1.0000e-04
Epoch 3/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 11s/step - loss: 0.0119 - pr_auc: 0.9151 - val_loss: 0.0144 - val_pr_auc: 0.9303 - learning_rate: 1.0000e-04


In [24]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.5)
    ]
)

Epoch 1/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 11s/step - loss: 0.0103 - pr_auc: 0.9326 - val_loss: 0.0133 - val_pr_auc: 0.9393 - learning_rate: 1.0000e-04
Epoch 2/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 12s/step - loss: 0.0083 - pr_auc: 0.9497 - val_loss: 0.0122 - val_pr_auc: 0.9442 - learning_rate: 1.0000e-04
Epoch 3/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 12s/step - loss: 0.0064 - pr_auc: 0.9593 - val_loss: 0.0125 - val_pr_auc: 0.9456 - learning_rate: 1.0000e-04


In [26]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.5)
    ]
)

Epoch 1/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 13s/step - loss: 0.0066 - pr_auc: 0.9568 - val_loss: 0.0129 - val_pr_auc: 0.9464 - learning_rate: 1.0000e-04
Epoch 2/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 12s/step - loss: 0.0058 - pr_auc: 0.9642 - val_loss: 0.0129 - val_pr_auc: 0.9489 - learning_rate: 1.0000e-04
Epoch 3/3
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 13s/step - loss: 0.0046 - pr_auc: 0.9750 - val_loss: 0.0126 - val_pr_auc: 0.9486 - learning_rate: 1.0000e-04


In [28]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, average_precision_score, roc_auc_score
from sklearn.metrics import jaccard_score, hamming_loss

# Collect true labels and predictions from validation/test set
y_true = np.concatenate([y for _, y in val_ds], axis=0)   # or test_ds
y_probs = np.concatenate([model.predict(x) for x, _ in val_ds], axis=0)
y_pred = (y_probs >= 0.5).astype(int)

# ----------------------------
# Micro metrics (global)
# ----------------------------
precision_micro = precision_score(y_true, y_pred, average='micro', zero_division=0)
recall_micro = recall_score(y_true, y_pred, average='micro', zero_division=0)
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)

# ----------------------------
# Macro metrics (average per class)
# ----------------------------
precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

# ----------------------------
# Per-class metrics
# ----------------------------
num_classes = y_true.shape[1]
avg_precision = [average_precision_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]
roc_auc = [roc_auc_score(y_true[:,i], y_probs[:,i]) for i in range(num_classes)]

# ----------------------------
# Multi-label specific metrics
# ----------------------------
jaccard_micro = jaccard_score(y_true, y_pred, average='micro')
jaccard_macro = jaccard_score(y_true, y_pred, average='macro')
hamming = hamming_loss(y_true, y_pred)

# ----------------------------
# Print results
# ----------------------------
print("\n=== Multi-label metrics ===")
print(f"Precision micro: {precision_micro:.4f}, Recall micro: {recall_micro:.4f}, F1 micro: {f1_micro:.4f}")
print(f"Precision macro: {precision_macro:.4f}, Recall macro: {recall_macro:.4f}, F1 macro: {f1_macro:.4f}")
print(f"Jaccard index (micro): {jaccard_micro:.4f}, Jaccard index (macro): {jaccard_macro:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print("Average precision per class:", np.round(avg_precision, 3))
print("ROC-AUC per class:", np.round(roc_auc, 3))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step

=== Multi-label metrics ===
Precision micro: 0.9325, Recall micro: 0.7672, F1 micro: 0.8418
Precision macro: 0.9331, Recall macro: 0.7777, F1 macro: 0.8434
Jaccard index (micro): 0.7268, Jaccard index (macro): 0.7336
Hamming loss: 0.0551
Average precision per class: [0.917 0.974 0.983 0.957 0.927 0.937]
ROC-AUC per class: [0.964 0.988 0.996 0.982 0.974 0.979]


In [29]:
model.save('tool_recognition_model.keras')