In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
import os
import json
import numpy as np
from PIL import Image
from tqdm import tqdm

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import cv2
import re
import numpy as np

# Paths to videos and labels
videos_path = "/content/drive/MyDrive/NUS_ISS_Talent_Experience_Resumes/cholect50-challenge-val/videos"
labels_path = "/content/drive/MyDrive/NUS_ISS_Talent_Experience_Resumes/cholect50-challenge-val/labels"

# Phase mapping (from your earlier inspection)
phase_mapping = {
    '0': 'preparation',
    '1': 'carlot-triangle-dissection',
    '2': 'clipping-and-cutting',
    '3': 'gallbladder-dissection',
    '4': 'gallbladder-packaging',
    '5': 'cleaning-and-coagulation',
    '6': 'gallbladder-extraction'
}

X = []
Y = []

# Loop through all videos
for vid_folder in sorted(os.listdir(videos_path)):
    vid_path = os.path.join(videos_path, vid_folder)

    # Skip non-folders
    if not os.path.isdir(vid_path):
        continue

    # Corresponding label JSON
    label_file = f"{vid_folder}.json"
    label_path = os.path.join(labels_path, label_file)

    if not os.path.exists(label_path):
        print(f"Warning: {label_path} not found, skipping {vid_folder}")
        continue

    # Load JSON labels
    with open(label_path, 'r') as f:
        data = json.load(f)

    # Build a mapping of frame_number -> phase_id
    annotations = data['annotations']
    frame_phase = {}
    for frame_id, triplets in annotations.items():
        frame_number = int(frame_id)
        if len(triplets) > 0:
            # Phase ID is at index 14 in triplet vector
            phase_id = triplets[0][14]
        else:
            phase_id = 0  # default to 0 if no triplet
        frame_phase[frame_number] = phase_id

    # Process all frame images
    frame_files = sorted([f for f in os.listdir(vid_path) if f.endswith('.png')])
    for frame_file in frame_files:
        # Extract leading number from filename (handles '000030 (1).png')
        match = re.match(r"(\d+)", frame_file)
        if match:
            frame_number = int(match.group(1))
        else:
            continue  # skip invalid filenames

        phase_id = frame_phase.get(frame_number, 0)

        # Read image
        img_path = os.path.join(vid_path, frame_file)
        img = cv2.imread(img_path)
        if img is None:
            continue

        # Resize and convert to RGB
        img = cv2.resize(img, (224, 224))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        X.append(img)
        Y.append(phase_id)

# Convert lists to numpy arrays
X = np.array(X, dtype=np.float32) / 255.0  # normalize
Y = np.array(Y, dtype=np.int64)

print(f"Total frames processed: {len(X)}")
print(f"X shape: {X.shape}, Y shape: {Y.shape}")

Total frames processed: 1318
X shape: (1318, 224, 224, 3), Y shape: (1318,)


In [None]:
import numpy as np
import albumentations as A
import cv2
from tqdm import tqdm
from collections import Counter

# Define augmentation pipeline
augment = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.3),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.5),
    A.RandomCrop(height=200, width=200, p=0.5),
    A.Resize(height=224, width=224)  # make sure final size is correct
])

# Target minimum per class
MIN_SAMPLES = 300

X_balanced, Y_balanced = [], []

# Count original distribution
counts = Counter(Y)
print("Original class distribution:", counts)

# Loop through each class
for cls in np.unique(Y):
    cls_indices = np.where(Y == cls)[0]
    X_cls = X[cls_indices]
    Y_cls = Y[cls_indices]

    # Always keep originals
    X_balanced.extend(X_cls)
    Y_balanced.extend(Y_cls)

    # How many more needed?
    n_to_add = max(0, MIN_SAMPLES - len(X_cls))

    if n_to_add > 0:
        print(f"Augmenting class {cls} with {n_to_add} new samples...")
        for _ in tqdm(range(n_to_add)):
            idx = np.random.randint(0, len(X_cls))
            img = X_cls[idx]

            # Albumentations expects uint8
            img = (img * 255).astype(np.uint8)

            aug_img = augment(image=img)["image"]

            # Scale back to [0,1]
            aug_img = aug_img.astype(np.float32) / 255.0

            X_balanced.append(aug_img)
            Y_balanced.append(cls)

# Convert to arrays
X_balanced = np.array(X_balanced)
Y_balanced = np.array(Y_balanced)

# Check new distribution
new_counts = Counter(Y_balanced)
print("New class distribution:", new_counts)
print("Final shapes -> X:", X_balanced.shape, ", Y:", Y_balanced.shape)

  original_init(self, **validated_kwargs)


Original class distribution: Counter({np.int64(3): 338, np.int64(2): 282, np.int64(1): 217, np.int64(0): 149, np.int64(5): 143, np.int64(4): 115, np.int64(6): 74})
Augmenting class 0 with 151 new samples...


100%|██████████| 151/151 [00:00<00:00, 465.70it/s]


Augmenting class 1 with 83 new samples...


100%|██████████| 83/83 [00:00<00:00, 484.06it/s]


Augmenting class 2 with 18 new samples...


100%|██████████| 18/18 [00:00<00:00, 492.72it/s]


Augmenting class 4 with 185 new samples...


100%|██████████| 185/185 [00:00<00:00, 491.25it/s]


Augmenting class 5 with 157 new samples...


100%|██████████| 157/157 [00:00<00:00, 468.95it/s]


Augmenting class 6 with 226 new samples...


100%|██████████| 226/226 [00:00<00:00, 475.76it/s]


New class distribution: Counter({np.int64(3): 338, np.int64(0): 300, np.int64(1): 300, np.int64(2): 300, np.int64(4): 300, np.int64(5): 300, np.int64(6): 300})
Final shapes -> X: (2138, 224, 224, 3) , Y: (2138,)


In [None]:
# Print new class distribution
counter_new = Counter(Y_balanced)
print("\nNew class distribution:")
for cls, count in counter_new.items():
    print(f"Phase {cls}: {count} frames")

X_aug = np.array(X_balanced)
Y_aug = np.array(Y_balanced)
print(f"\nTotal frames after augmentation: {len(Y_aug)}")
print(f"X_aug shape: {X_aug.shape}, Y_aug shape: {Y_aug.shape}")


New class distribution:
Phase 0: 300 frames
Phase 1: 300 frames
Phase 2: 300 frames
Phase 3: 338 frames
Phase 4: 300 frames
Phase 5: 300 frames
Phase 6: 300 frames

Total frames after augmentation: 2138
X_aug shape: (2138, 224, 224, 3), Y_aug shape: (2138,)


In [None]:
import tensorflow as tf
# Stratified split
from sklearn.model_selection import train_test_split
trainX, tempX, trainY, tempY = train_test_split(X_aug, Y_aug, test_size=0.2, stratify=Y_aug, random_state=42)
valX, testX, valY, testY = train_test_split(tempX, tempY, test_size=0.5, stratify=tempY, random_state=42)

print("Train:", trainX.shape, trainY.shape)
print("Val:", valX.shape, valY.shape)
print("Test:", testX.shape, testY.shape)

Train: (1710, 224, 224, 3) (1710,)
Val: (214, 224, 224, 3) (214,)
Test: (214, 224, 224, 3) (214,)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

num_classes = 7
IMG_SIZE = (224, 224, 3)
SEED = 42

# ---------------- Transfer Learning Backbone ----------------
base_model = MobileNetV2(
    input_shape=IMG_SIZE,
    include_top=False,
    weights='imagenet'
)
base_model.trainable = False  # freeze backbone initially

inputs = layers.Input(shape=IMG_SIZE)
x = base_model(inputs, training=False)
x = layers.GlobalAveragePooling2D()(x)  # shape -> (batch, channels)

# ---------------- Multi-Head Attention ----------------
# reshape to (batch, sequence_len=1, channels) for attention
x_reshaped = layers.Reshape((1, x.shape[-1]))(x)
attn_output = layers.MultiHeadAttention(
    num_heads=4, key_dim=64
)(x_reshaped, x_reshaped)
attn_output = layers.Flatten()(attn_output)

# ---------------- Classification Head ----------------
x = layers.Concatenate()([x, attn_output])
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)

model = models.Model(inputs, outputs)

# ---------------- Compile ----------------
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',  # integer labels 0..6
    metrics=['accuracy']
)

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
history = model.fit(
    trainX, trainY,          # integer labels
    validation_data=(valX, valY),
    epochs=10,
    batch_size=16
)

Epoch 1/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 721ms/step - accuracy: 0.4317 - loss: 1.6642 - val_accuracy: 0.7804 - val_loss: 0.6714
Epoch 2/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 742ms/step - accuracy: 0.7953 - loss: 0.5685 - val_accuracy: 0.7804 - val_loss: 0.6664
Epoch 3/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 692ms/step - accuracy: 0.8433 - loss: 0.4539 - val_accuracy: 0.8037 - val_loss: 0.6636
Epoch 4/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 827ms/step - accuracy: 0.8762 - loss: 0.3840 - val_accuracy: 0.8178 - val_loss: 0.7497
Epoch 5/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 693ms/step - accuracy: 0.9209 - loss: 0.2564 - val_accuracy: 0.8598 - val_loss: 0.5408
Epoch 6/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 681ms/step - accuracy: 0.9046 - loss: 0.2914 - val_accuracy: 0.7430 - val_loss: 1.0655
Epoch 7/10

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint(
    'best_model.keras',           # file name to save
    monitor='val_accuracy',       # metric to monitor
    save_best_only=True,          # only save when val_accuracy improves
    mode='max',                   # because we want to maximize accuracy
    verbose=1
)

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=5,                   # stop after 5 epochs with no improvement
    mode='max',
    restore_best_weights=True,    # automatically restore best weights
    verbose=1
)

In [None]:
history = model.fit(
    trainX, trainY,          # integer labels
    validation_data=(valX, valY),
    epochs=5,
    batch_size=16,
    callbacks=[checkpoint, early_stopping]
)

Epoch 1/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 599ms/step - accuracy: 0.9546 - loss: 0.1739
Epoch 1: val_accuracy improved from -inf to 0.77570, saving model to best_model.keras
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 706ms/step - accuracy: 0.9546 - loss: 0.1737 - val_accuracy: 0.7757 - val_loss: 1.6758
Epoch 2/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 598ms/step - accuracy: 0.9466 - loss: 0.1803
Epoch 2: val_accuracy did not improve from 0.77570
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 696ms/step - accuracy: 0.9466 - loss: 0.1803 - val_accuracy: 0.7570 - val_loss: 1.3807
Epoch 3/5
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 595ms/step - accuracy: 0.9106 - loss: 0.3237
Epoch 3: val_accuracy improved from 0.77570 to 0.83178, saving model to best_model.keras
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 698ms/step - accuracy: 0.9107 - loss

In [None]:
y_prediction = model.predict(testX)
y_pred = np.argmax(y_prediction, axis = 1)
from sklearn.metrics import accuracy_score
print(f"Test Accuracy : {accuracy_score(y_pred,testY)}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1s/step
Test Accuracy : 0.8037383177570093


In [None]:
model.save('phase_recognition_model.keras')