### Library


In [None]:
!pip install ultralytics

Collecting ultralytics
  Using cached ultralytics-8.3.177-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Using cached ultralytics_thop-2.0.15-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Using ca

In [None]:
!pip install filterpy

Collecting filterpy
  Downloading filterpy-1.4.5.zip (177 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/178.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.0/178.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: filterpy
  Building wheel for filterpy (setup.py) ... [?25l[?25hdone
  Created wheel for filterpy: filename=filterpy-1.4.5-py3-none-any.whl size=110460 sha256=98072f05b90a6428cb27a034f76b205ad6efc999feae37c21e6d281c88489b29
  Stored in directory: /root/.cache/pip/wheels/12/dc/3c/e12983eac132d00f82a20c6cbe7b42ce6e96190ef8fa2d15e1
Successfully built filterpy
Installing collected packages: filterpy
Successfully installed filterpy-1.4.5


In [None]:
!git clone https://github.com/abewley/sort.git

fatal: destination path 'sort' already exists and is not an empty directory.


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms

import cv2
import numpy as np
from collections import deque, defaultdict
import torchvision.transforms.functional as F

# face detectors
from sort.sort import Sort
from ultralytics import YOLO

# Webcam on google colab
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import PIL
import io
import html
import time
from PIL import Image
from io import BytesIO
import json

### Config

In [None]:
# --- Inputs ---
# Video file to analyse; switch to the commented line to read from webcam index 0.
VIDEO_SOURCE = "1D_Video.mp4"
# VIDEO_SOURCE = 0
MODEL_PATH = "Resnet50_Freeze12_LSTM_Transform_Dropout_epoch_6.pth"

# --- Inference settings ---
SEQUENCE_LENGTH = 25   # length of the frame buffer kept for the classifier
THRESHOLD = 0.5        # sigmoid probability above which a face is labelled "Speaking"

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

cuda


### Load Model

In [None]:
class CNNLSTM(nn.Module):
    """Per-frame ResNet-50 encoder followed by an LSTM and a single-logit head.

    Every frame of a clip is encoded by the convolutional part of a ResNet-50,
    globally average-pooled, projected to ``cnn_output_dim`` and fed through an
    LSTM. The LSTM output at the last time step is mapped to one logit per clip.

    Args:
        cnn_output_dim: Size of the per-frame feature vector given to the LSTM.
        lstm_hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers (inter-layer dropout of 0.2 is
            only enabled when this is greater than 1).
        pretrained: When True (default, same as before) the backbone starts from
            the ImageNet weights. Pass False when a checkpoint is loaded right
            after construction, to skip the ImageNet download.
    """

    def __init__(self, cnn_output_dim=256, lstm_hidden_dim=128, num_layers=1, pretrained=True):
        super().__init__()

        # ``pretrained=True`` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set that flag used to select.
        weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
        base_cnn = models.resnet50(weights=weights)

        # Freeze the stem and the first two residual stages.
        frozen_prefixes = ("conv1", "bn1", "layer1", "layer2")
        for name, param in base_cnn.named_parameters():
            if name.startswith(frozen_prefixes):
                param.requires_grad = False

        # Width of the pooled backbone features (2048 for ResNet-50), read from
        # the classification head that is dropped below.
        feature_dim = base_cnn.fc.in_features

        # Keep the convolutional layers (drop avgpool + fc) and pool to 1x1.
        self.cnn = nn.Sequential(*list(base_cnn.children())[:-2],
                                 nn.AdaptiveAvgPool2d((1, 1)))    # output [B*T, 2048, 1, 1]

        self.feature_reduce = nn.Linear(feature_dim, cnn_output_dim)

        # Alternative backbone kept for reference: EfficientNet-B0 with its first
        # 4 feature blocks frozen, ``base_cnn.features`` + the same 1x1 pooling as
        # ``self.cnn`` and a 1280-wide input to ``feature_reduce``.

        self.lstm = nn.LSTM(input_size=cnn_output_dim,
                            hidden_size=lstm_hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=0.2 if num_layers > 1 else 0.0)

        self.input_dropout = nn.Dropout(p=0.1)
        self.hidden_dropout = nn.Dropout(p=0.3)

        self.classifier = nn.Linear(lstm_hidden_dim, 1)  # Binary classification
        # Not applied in forward(): the model returns raw logits.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one logit per clip.

        Args:
            x: Tensor of shape [B, T, C, H, W].

        Returns:
            Tensor of shape [B] with raw logits (no sigmoid applied).
        """
        B, T, C, H, W = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. permuted clips, work too.
        x = x.reshape(B * T, C, H, W)          # [B*T, C, H, W]

        features = self.cnn(x).flatten(1)      # [B*T, feature_dim]

        features = self.input_dropout(features)               # Input dropout
        features = self.feature_reduce(features)              # [B*T, cnn_output_dim]
        features = features.reshape(B, T, -1)                 # [B, T, cnn_output_dim]

        lstm_out, _ = self.lstm(features)      # [B, T, lstm_hidden_dim]

        last_output = lstm_out[:, -1, :]       # Output at the last time step
        last_output = self.hidden_dropout(last_output)        # Hidden dropout
        out = self.classifier(last_output)     # [B, 1]
        return out.squeeze(1)                  # [B]

# Hidden state
# Output layer

In [None]:
# Build the network, restore the trained weights from MODEL_PATH (tensors are
# mapped onto DEVICE while loading), then move it to DEVICE in inference mode.
model = CNNLSTM()
model.load_state_dict(
    torch.load(MODEL_PATH, map_location=DEVICE)
)
# .to() and .eval() both return the module, so the cell still displays its repr.
model.to(DEVICE).eval()

CNNLSTM(
  (cnn): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d(64, 256,

### Frames Preprocessing

In [None]:
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     # transforms.Normalize([0.485, 0.456, 0.406],
#     #                      [0.229, 0.224, 0.225])
# ])

In [None]:
# --- Preprocess Function ---
def preprocess_face(face):
    """Convert a BGR face crop into a model-ready tensor.

    The crop is resized to 224x224, reordered from BGR to RGB and converted to a
    tensor with torchvision's ``to_tensor``. No mean/std normalisation is applied.
    """
    resized_rgb = cv2.cvtColor(cv2.resize(face, (224, 224)), cv2.COLOR_BGR2RGB)
    return F.to_tensor(resized_rgb)

# def preprocess_face(face_img):
#     face_rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
#     return transform(face_rgb)

### Read Video Using OpenCV and Sample Frames

In [None]:
# YOLO detector loaded from the local weights file "yolov8m_200e.pt". The loops
# below call it once per frame and treat every returned box as a face.
face_detector = YOLO("yolov8m_200e.pt")

In [None]:
# --- Open the input video and an output writer with matching geometry ---
cap = cv2.VideoCapture(VIDEO_SOURCE)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')

# Named capture properties instead of the bare ids 3 and 4.
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

# Write at the source frame rate so the annotated video plays at the original
# speed; fall back to 25 fps when the source does not report a usable rate.
source_fps = cap.get(cv2.CAP_PROP_FPS)
output_fps = source_fps if source_fps and source_fps > 0 else 25.0

out = cv2.VideoWriter('output1D_Resnet50_Freeze12_LSTM_Transform_Dropout_epoch_6.mp4',
                      fourcc, output_fps, frame_size)

#### Without Zoom

In [None]:
# # --- Initialize OpenCV Video ---
# cap = cv2.VideoCapture(VIDEO_SOURCE)

# --- Per-track frame buffers ---
# One rolling buffer of preprocessed face crops per SORT track ID.
sequence_dict = defaultdict(lambda: deque(maxlen=SEQUENCE_LENGTH))

tracker = Sort()

# try/finally so the capture and the writer are released even if a frame fails.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("cannot read")
            break

        frame_h, frame_w = frame.shape[:2]

        # --- Face detection (YOLO) ---
        result_list = face_detector(frame, verbose=False)
        boxes = result_list[0].boxes if len(result_list) > 0 else None

        detections = []
        if boxes is not None:
            for i in range(len(boxes)):
                x1, y1, x2, y2 = map(int, boxes.xyxy[i])
                detections.append([x1, y1, x2, y2, float(boxes.conf[i])])

        # --- Tracking ---
        # SORT expects one update per frame, with an empty (0, 5) array when
        # nothing was detected.
        tracked_objects = tracker.update(
            np.array(detections) if detections else np.empty((0, 5)))

        for track in tracked_objects:
            x1, y1, x2, y2, track_id = map(int, track)

            # Track boxes can extend past the image; clamp them so the crop is
            # never empty or wrapped around by negative indices.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_w), min(y2, frame_h)
            if x2 <= x1 or y2 <= y1:
                continue

            # Preprocess and buffer the crop for this track
            sequence = sequence_dict[track_id]
            sequence.append(preprocess_face(frame[y1:y2, x1:x2]))

            # Predict once the buffer is full
            if len(sequence) == sequence.maxlen:
                input_tensor = torch.stack(list(sequence)).unsqueeze(0).to(DEVICE)
                with torch.no_grad():
                    output = model(input_tensor)
                    prob = torch.sigmoid(output).item()

                label = f"Speaking ({prob:.2f})" if prob > THRESHOLD else f"Not Speaking ({prob:.2f})"
                color = (0, 255, 0) if prob > THRESHOLD else (0, 0, 255)
            else:
                label = f"ID {track_id} - Collecting..."
                color = (200, 200, 0)

            # Annotate
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # Every frame is written, including frames without any face, so the
        # output keeps the length of the input.
        out.write(frame)
        # Optionally show: cv2_imshow(frame)
finally:
    cap.release()
    out.release()

cannot read


#### With Zoom

In [None]:
# from collections import deque, defaultdict
# import numpy as np
# import cv2
# import torch

# --- Zoom tuning ---
PAD = 40            # pixels of padding around the zoom box
STICK_FRAMES = 10   # frames a newly chosen speaker is held, to avoid flicker

# --- State carried from one frame to the next ---
last_speaker_id, stick_counter = None, 0

# One rolling buffer of up to 25 preprocessed crops per track ID.
sequence_dict = defaultdict(lambda: deque(maxlen=25))

def safe_zoom(frame, box, pad=40):
    """Return the padded ``box`` region of ``frame`` scaled to the full frame size.

    ``box`` is ``(x1, y1, x2, y2)``; it is grown by ``pad`` pixels on every side
    and clipped to the image. If nothing is left after clipping, ``frame`` is
    returned unchanged.
    """
    frame_h, frame_w = frame.shape[:2]
    left, top, right, bottom = (int(v) for v in box)

    left = max(left - pad, 0)
    top = max(top - pad, 0)
    right = min(right + pad, frame_w)
    bottom = min(bottom + pad, frame_h)

    # Only resize when the clipped region is non-empty.
    if right > left and bottom > top:
        return cv2.resize(frame[top:bottom, left:right], (frame_w, frame_h))
    return frame

# --- Initialize OpenCV Video ---
cap = cv2.VideoCapture(VIDEO_SOURCE)

# The shared writer `out` may already have been released by an earlier cell.
# Writing to a released writer silently drops every frame, so in that case the
# zoomed result goes to its own file instead.
if not out.isOpened():
    zoom_fps = cap.get(cv2.CAP_PROP_FPS)
    out = cv2.VideoWriter(
        'output1D_Resnet50_Freeze12_LSTM_Transform_Dropout_epoch_6_zoom.mp4',
        cv2.VideoWriter_fourcc(*'mp4v'),
        zoom_fps if zoom_fps and zoom_fps > 0 else 25.0,
        (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))),
    )

tracker = Sort()

# try/finally so the capture and the writer are released even if a frame fails.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("cannot read")
            break

        frame_h, frame_w = frame.shape[:2]

        # --- Face detection (YOLO) ---
        result_list = face_detector(frame, verbose=False)
        boxes = result_list[0].boxes if len(result_list) > 0 else None

        detections = []
        if boxes is not None:
            for i in range(len(boxes)):
                x1, y1, x2, y2 = map(int, boxes.xyxy[i])
                detections.append([x1, y1, x2, y2, float(boxes.conf[i])])

        # --- Tracking ---
        # SORT expects one update per frame, with an empty (0, 5) array when
        # nothing was detected.
        tracked_objects = tracker.update(
            np.array(detections) if detections else np.empty((0, 5)))

        # Collect per-track results for this frame
        frame_probs = {}      # track_id -> prob
        frame_labels = {}     # track_id -> label text
        frame_colors = {}     # track_id -> color BGR
        frame_boxes = {}      # track_id -> (x1,y1,x2,y2)

        for tr in tracked_objects:
            x1, y1, x2, y2, track_id = map(int, tr)

            # Track boxes can extend past the image; clamp them so the crop is
            # never empty or wrapped around by negative indices.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, frame_w), min(y2, frame_h)
            if x2 <= x1 or y2 <= y1:
                continue
            frame_boxes[track_id] = (x1, y1, x2, y2)

            # Preprocess and buffer the crop for this track
            seq = sequence_dict[track_id]
            seq.append(preprocess_face(frame[y1:y2, x1:x2]))

            # Predict once the buffer is full
            if len(seq) == seq.maxlen:
                input_tensor = torch.stack(list(seq)).unsqueeze(0).to(DEVICE)
                with torch.no_grad():
                    output = model(input_tensor)
                    prob = float(torch.sigmoid(output).item())
                frame_probs[track_id] = prob

                speaking = prob > THRESHOLD
                label = f"{'Speaking' if speaking else 'Not Speaking'} ({prob:.2f})"
                color = (0, 255, 0) if speaking else (0, 0, 255)
            else:
                label = f"ID {track_id} - Collecting..."
                color = (200, 200, 0)

            frame_labels[track_id] = label
            frame_colors[track_id] = color

        # --- Decide who to zoom on ---
        zoom_box = None
        if frame_probs:
            # pick highest prob
            cur_speaker = max(frame_probs.items(), key=lambda kv: kv[1])[0]
            cur_prob = frame_probs[cur_speaker]

            if cur_prob > THRESHOLD:
                # Switch immediately when there is no previous speaker, the speaker
                # is unchanged, or the hold period of the previous one has run out.
                if last_speaker_id is None or cur_speaker == last_speaker_id or stick_counter <= 0:
                    zoom_box = frame_boxes[cur_speaker]
                    if cur_speaker != last_speaker_id:
                        last_speaker_id = cur_speaker
                        stick_counter = STICK_FRAMES
                else:
                    # keep previous speaker for a few frames
                    if last_speaker_id in frame_boxes:
                        zoom_box = frame_boxes[last_speaker_id]
            else:
                # nobody above threshold
                last_speaker_id = None
                stick_counter = 0
        else:
            last_speaker_id = None
            stick_counter = 0

        if stick_counter > 0:
            stick_counter -= 1

        # --- Draw boxes/labels BEFORE zoom (so they scale with the crop) ---
        for tid, (x1, y1, x2, y2) in frame_boxes.items():
            color = frame_colors.get(tid, (255, 255, 255))
            label = frame_labels.get(tid, f"ID {tid}")
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            cv2.putText(frame, label, (x1, y1 - 8), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # --- Zoom to speaker (fallback to full frame if none) ---
        if zoom_box is not None:
            frame = safe_zoom(frame, zoom_box, pad=PAD)

        # write / show
        out.write(frame)
        # cv2.imshow("speaker_detection", frame)
        # if cv2.waitKey(1) & 0xFF == ord('q'):
        #     break
finally:
    cap.release()
    out.release()
# cv2.destroyAllWindows()

cannot read


## Opening the Webcam on Google Colab

In [None]:
# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Decode a webcam frame sent by the browser.

  Params:
          js_reply: data-URL string; the base64 image payload follows the first comma
  Returns:
          img: OpenCV BGR image
  """
  # keep only the base64 payload of the data URL
  payload = js_reply.split(',')[1]
  # raw JPEG bytes viewed as a uint8 buffer
  encoded_pixels = np.frombuffer(b64decode(payload), dtype=np.uint8)
  # flags=1 asks OpenCV for a 3-channel colour (BGR) image
  return cv2.imdecode(encoded_pixels, flags=1)

# function to convert OpenCV Rectangle bounding box image into base64 byte string to be overlayed on video stream
def bbox_to_bytes(bbox_array):
  """
  Encode an overlay image as a PNG data URL for the browser.

  Params:
          bbox_array: Numpy uint8 array of shape (H, W, 4) holding the RGBA overlay
                      to draw on top of the video stream.
  Returns:
        bytes: 'data:image/png;base64,...' string
  """
  # Let Pillow infer the mode from the array: passing it explicitly is what
  # produced the warning shown in the webcam cell's output.
  bbox_PIL = PIL.Image.fromarray(bbox_array)
  # Arrays that are not 4-channel are inferred differently; keep the PNG RGBA.
  if bbox_PIL.mode != 'RGBA':
    bbox_PIL = bbox_PIL.convert('RGBA')
  iobuf = io.BytesIO()
  # format bbox into png for return
  bbox_PIL.save(iobuf, format='png')
  # format return string
  bbox_bytes = 'data:image/png;base64,{}'.format((str(b64encode(iobuf.getvalue()), 'utf-8')))

  return bbox_bytes

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the browser-side webcam helper into the notebook output.

  The displayed JavaScript defines `stream_frame(label, imgData)`, which
  `video_frame` calls through `eval_js`. On its first call it builds the DOM
  (status label, <video> fed by getUserMedia, an absolutely positioned <img>
  overlay and a "click to stop" hint). On every call it:
    - sets the status label's innerHTML to `label` when non-empty,
    - places `imgData` over the video when non-empty,
    - waits for the next animation frame and returns
      {'create', 'show', 'capture', 'img'}, where 'img' is a JPEG data URL of
      the video drawn on a 640x480 canvas (quality 0.8).
  Clicking the video, the overlay or the hint sets `shutdown`. The next
  `stream_frame` call then removes the DOM and returns ''. If the click lands
  while a capture is pending, the returned 'img' is an empty string.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # Rendering the Javascript object runs the script in the cell's output frame.
  display(js)

def video_frame(label, bbox):
  """
  Send the status label and overlay to the browser and fetch the next frame.

  Params:
          label: text/HTML written into the status label ('' leaves it unchanged)
          bbox: data-URL image placed over the video ('' leaves it unchanged)
  Returns:
          The value of the JavaScript `stream_frame` call: '' once the stream
          was stopped, otherwise an object whose 'img' entry is the captured frame.
  """
  # Use the `label` argument (the previous code read the global `label_html`).
  # json.dumps turns both values into properly escaped JavaScript string literals.
  data = eval_js(f'stream_frame({json.dumps(label)}, {json.dumps(bbox)})')
  return data

#### Without Zoom

In [None]:
tracker = Sort()
# One rolling buffer of preprocessed face crops per SORT track ID.
sequence_dict = defaultdict(lambda: deque(maxlen=SEQUENCE_LENGTH))

# Webcam simulation start
video_stream()
label_html = 'Capturing...'
bbox = ''

while True:
    js_reply = video_frame(label_html, bbox)
    # '' means the stream was stopped; an empty 'img' means it was stopped while
    # a capture was pending. Either way there is no frame to decode.
    if not js_reply or not js_reply.get("img"):
        break

    # js_to_image decodes with OpenCV, so the frame is already BGR. Converting it
    # again would swap the channels for both the detector and preprocess_face.
    frame = js_to_image(js_reply["img"])
    frame_h, frame_w = frame.shape[:2]

    result_list = face_detector(frame, verbose=False)
    boxes = result_list[0].boxes if len(result_list) > 0 else None

    detections = []
    if boxes is not None:
        for i in range(len(boxes)):
            x1, y1, x2, y2 = map(int, boxes.xyxy[i])
            detections.append([x1, y1, x2, y2, float(boxes.conf[i])])

    # SORT expects one update per frame, with an empty (0, 5) array when nothing
    # was detected.
    tracked_objects = tracker.update(
        np.array(detections) if detections else np.empty((0, 5)))

    # Transparent overlay, rebuilt every frame so old boxes never linger.
    bbox_array = np.zeros([frame_h, frame_w, 4], dtype=np.uint8)

    for track in tracked_objects:
        x1, y1, x2, y2, track_id = map(int, track)

        # Track boxes can extend past the image; clamp them so the crop is never
        # empty or wrapped around by negative indices.
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_w), min(y2, frame_h)
        if x2 <= x1 or y2 <= y1:
            continue

        sequence = sequence_dict[track_id]
        sequence.append(preprocess_face(frame[y1:y2, x1:x2]))

        # Colours are in RGB order here because the overlay is encoded as RGBA.
        if len(sequence) == sequence.maxlen:
            input_tensor = torch.stack(list(sequence)).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                output = model(input_tensor)
                prob = torch.sigmoid(output).item()

            label = f"Speaking ({prob:.2f})" if prob > THRESHOLD else f"Not Speaking ({prob:.2f})"
            color = (0, 255, 0) if prob > THRESHOLD else (255, 0, 0)
        else:
            label = f"ID {track_id} - Collecting..."
            color = (0, 200, 200)

        cv2.rectangle(bbox_array, (x1, y1), (x2, y2), color, 2)
        cv2.putText(bbox_array, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # Make every drawn pixel opaque and leave the rest fully transparent.
    bbox_array[:, :, 3] = (bbox_array.max(axis=2) > 0).astype(np.uint8) * 255
    bbox = bbox_to_bytes(bbox_array)  # Send to JS overlay

<IPython.core.display.Javascript object>

  bbox_PIL = PIL.Image.fromarray(bbox_array, 'RGBA')


#### With Zoom

In [None]:
tracker = Sort()
# One rolling buffer of preprocessed face crops per SORT track ID.
sequence_dict = defaultdict(lambda: deque(maxlen=SEQUENCE_LENGTH))

# Webcam simulation start
video_stream()
label_html = 'Capturing...'
bbox = ''

while True:
    js_reply = video_frame(label_html, bbox)
    # '' means the stream was stopped; an empty 'img' means it was stopped while
    # a capture was pending. Either way there is no frame to decode.
    if not js_reply or not js_reply.get("img"):
        break

    # js_to_image decodes with OpenCV, so the frame is already BGR. Converting it
    # again would swap the channels for both the detector and preprocess_face.
    frame = js_to_image(js_reply["img"])
    frame_h, frame_w = frame.shape[:2]

    result_list = face_detector(frame, verbose=False)
    boxes = result_list[0].boxes if len(result_list) > 0 else None

    detections = []
    if boxes is not None:
        for i in range(len(boxes)):
            x1, y1, x2, y2 = map(int, boxes.xyxy[i])
            detections.append([x1, y1, x2, y2, float(boxes.conf[i])])

    # SORT expects one update per frame, with an empty (0, 5) array when nothing
    # was detected.
    tracked_objects = tracker.update(
        np.array(detections) if detections else np.empty((0, 5)))

    # Box of the track with the highest speaking probability above THRESHOLD.
    max_prob = 0
    zoom_box = None

    for track in tracked_objects:
        x1, y1, x2, y2, track_id = map(int, track)

        # Track boxes can extend past the image; clamp them so the crop is never
        # empty or wrapped around by negative indices.
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_w), min(y2, frame_h)
        if x2 <= x1 or y2 <= y1:
            continue

        sequence = sequence_dict[track_id]
        sequence.append(preprocess_face(frame[y1:y2, x1:x2]))

        if len(sequence) == sequence.maxlen:
            input_tensor = torch.stack(list(sequence)).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                output = model(input_tensor)
                prob = torch.sigmoid(output).item()

            if prob > THRESHOLD and prob > max_prob:
                max_prob = prob
                zoom_box = (x1, y1, x2, y2)

            label = f"Speaking ({prob:.2f})" if prob > THRESHOLD else f"Not Speaking ({prob:.2f})"
            color = (0, 255, 0) if prob > THRESHOLD else (0, 0, 255)
        else:
            label = f"ID {track_id} - Collecting..."
            color = (200, 200, 0)

        # Drawn on the BGR frame, so colours are in BGR order.
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # Simulate zoom if a speaker was detected; otherwise keep the full frame.
    if zoom_box is not None:
        x1, y1, x2, y2 = zoom_box

        # Add padding around the face box for better framing
        pad = 40
        x1 = max(x1 - pad, 0)
        y1 = max(y1 - pad, 0)
        x2 = min(x2 + pad, frame_w)
        y2 = min(y2 + pad, frame_h)

        face_roi = frame[y1:y2, x1:x2]
        face_h, face_w = face_roi.shape[:2]

        # Enlarge the crop 2x, capped at the frame size.
        resized_face = cv2.resize(
            face_roi, (min(frame_w, face_w * 2), min(frame_h, face_h * 2)))

        # Centre the enlarged crop on a black canvas of the frame's size.
        rf_h, rf_w = resized_face.shape[:2]
        start_y = (frame_h - rf_h) // 2
        start_x = (frame_w - rf_w) // 2

        zoomed_frame = np.zeros_like(frame)
        zoomed_frame[start_y:start_y + rf_h, start_x:start_x + rf_w] = resized_face
        frame = zoomed_frame  # Replace original frame

    # --- Convert updated frame to base64 image and send to JS for display ---
    # Pillow expects RGB, the frame is BGR.
    im_pil = PIL.Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    buff = BytesIO()
    im_pil.save(buff, format="jpeg")
    frame_bytes = b64encode(buff.getvalue()).decode("utf-8")
    label_html = f'<img src="data:image/jpeg;base64,{frame_bytes}"/>'

<IPython.core.display.Javascript object>