In [None]:
import os
import cv2
import torch
import torchvision
import numpy as np
from torch.utils.data import Dataset, DataLoader
import xml.etree.ElementTree as ET
from collections import defaultdict

# Convert a video into a tensor
def video_to_tensor(video_path, resize=None, frame_skip=1, return_orig_size = False):
    """Decode a video file into a float tensor of RGB frames.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        resize: Optional ``(width, height)`` passed to ``cv2.resize``. A falsy
            value keeps the decoded frame size.
        frame_skip: Keep every ``frame_skip``-th decoded frame, starting with
            frame 0. Must be >= 1.
        return_orig_size: When true, also return the size of the frames as
            decoded (before any resize).

    Returns:
        A tensor of shape (T, 3, H, W) with values scaled to [0, 1], or the
        tuple ``(tensor, (orig_w, orig_h))`` when ``return_orig_size`` is true.

    Raises:
        ValueError: If ``frame_skip`` is smaller than 1 or no frame could be read.
    """
    if frame_skip < 1:
        raise ValueError(f"frame_skip must be >= 1, got {frame_skip}")

    print(f"Loading video: {os.path.basename(video_path)}")
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0
    orig_w = None
    orig_h = None
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if orig_w is None:
                # frame.shape is (rows, cols, channels) -> (height, width)
                orig_h, orig_w = frame.shape[:2]
            # Only resize/convert/store the frames that survive frame skipping,
            # instead of keeping the whole video and slicing afterwards.
            if frame_count % frame_skip == 0:
                if resize:
                    frame = cv2.resize(frame, resize)
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_count += 1
    finally:
        # Release the capture even if decoding/resizing raised
        cap.release()

    if not frames:
        raise ValueError(f"No frames read from {video_path}")

    # (T, H, W, 3) uint8 -> (T, 3, H, W) float in [0, 1]
    frames = torch.from_numpy(np.stack(frames)).float().permute(0, 3, 1, 2) / 255.0
    print(f"Loaded {frame_count} frames -> kept {frames.shape[0]} after skipping\n")

    if return_orig_size:
        return frames, (orig_w, orig_h)
    else:
        return frames  # shape: (T, 3, H, W)

# Parse CVAT XML annotation file
def parse_cvat_xml(xml_path, frame_skip=1, scale=None):
    """Parse a CVAT track XML file into per-frame box annotations.

    Args:
        xml_path: Path (or file object) accepted by ``ElementTree.parse``.
        frame_skip: Only frames whose index is a multiple of ``frame_skip``
            are kept; their index is divided by ``frame_skip`` so it lines up
            with a video subsampled by the same factor.
        scale: Optional ``(sx, sy)`` factors applied to x and y box
            coordinates. ``None`` leaves the coordinates untouched.

    Returns:
        A ``defaultdict(list)`` mapping adjusted frame index to a list of
        dicts with keys ``"label"`` (track label), ``"bbox"``
        (``[xtl, ytl, xbr, ybr]``) and ``"moving"`` (1 or 0).

    Raises:
        ValueError: If a visible box has no ``moving`` attribute.
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()
    annotations = defaultdict(list)

    if scale is None:
        sx = sy = 1.0
    else:
        sx, sy = scale

    for track in root.findall("track"):
        label = track.attrib["label"]
        for box in track.findall("box"):
            frame = int(box.attrib["frame"])
            # Boxes flagged outside="1" are not visible in that frame
            if int(box.attrib["outside"]) != 0:
                continue

            # Scale the bbox into resized-frame coordinates
            xtl = float(box.attrib["xtl"]) * sx
            ytl = float(box.attrib["ytl"]) * sy
            xbr = float(box.attrib["xbr"]) * sx
            ybr = float(box.attrib["ybr"]) * sy

            # First <attribute> whose name is "moving" (case-insensitive)
            moving_attr = next(
                (attr for attr in box.findall("attribute")
                 if attr.attrib.get("name", "").lower() == "moving"),
                None)

            if moving_attr is None:
                raise ValueError(f"Missing 'moving' attribute in file {xml_path}, track '{label}', frame {frame}")

            # An empty element has text None; surrounding whitespace (e.g. from
            # pretty-printed XML) must not turn "true" into not-moving.
            moving_text = (moving_attr.text or "").strip().lower()
            moving_flag = 1 if moving_text == "true" else 0

            #frame skip alignment
            if frame % frame_skip == 0:
                adjusted_frame = frame // frame_skip
                annotations[adjusted_frame].append({
                    "label": label,
                    "bbox": [xtl, ytl, xbr, ybr],
                    "moving": moving_flag})

    return annotations

# Dataset class that loads videos + XML annotations together
class BaseballVideoDataset(Dataset):
    """Frame-level detection dataset built from videos and CVAT XML files.

    On construction every video that has an XML file with the same filename
    stem is decoded fully into memory, its annotations are parsed, and one
    dataset item is indexed per annotated frame. Videos that fail to load or
    parse are recorded in ``skipped_videos`` and otherwise ignored.

    Args:
        video_dir: Directory scanned for ``.mp4`` / ``.mov`` / ``.avi`` files.
        xml_dir: Directory expected to contain ``<stem>.xml`` per video.
        resize: ``(width, height)`` every frame is resized to, or ``None``.
        frame_skip: Subsampling factor applied to both frames and annotations.
        scale_boxes: When true (and ``resize`` is set) boxes are scaled from
            the original video size into the resized frame coordinates.
    """

    # Extensions (lower-case) treated as videos
    VIDEO_EXTENSIONS = (".mp4", ".mov", ".avi")

    def __init__(self, video_dir, xml_dir, resize=(1280, 720), frame_skip=1, scale_boxes=True):
        self.video_dir = video_dir
        self.xml_dir = xml_dir
        self.resize = resize
        self.frame_skip = frame_skip
        self.scale_boxes = scale_boxes
        self.video_tensors = {}    # video path -> (T, 3, H, W) tensor
        self.skipped_videos = []   # (video path, error message) pairs
        self.index_map = []        # (video path, frame index, target dict) per item

        # Match videos with their annotation XMLs by filename stem.
        # sorted() keeps the sample (and therefore item) order independent of
        # the filesystem's directory listing order.
        self.samples = []
        for vid_name in sorted(os.listdir(video_dir)):
            if not vid_name.lower().endswith(self.VIDEO_EXTENSIONS):
                continue
            stem = os.path.splitext(vid_name)[0]
            xml_path = os.path.join(xml_dir, f"{stem}.xml")
            if os.path.exists(xml_path):
                self.samples.append((os.path.join(video_dir, vid_name), xml_path))
            else:
                print(f"No XML found for {vid_name}")
        print(f"\n Found {len(self.samples)} videos with matching XMLs in {video_dir}\n")

        print("Preloading videos and indexing frames...\n")

        # Loop through all matched video/XML pairs
        for vid_idx, (video_path, xml_path) in enumerate(self.samples, start=1):
            try:
                video_tensor, (orig_w, orig_h) = video_to_tensor(video_path, resize=self.resize, frame_skip=self.frame_skip, return_orig_size = True)

                # Compute scale factors that map original-size boxes onto the resized frames
                if self.resize is not None and self.scale_boxes:
                    new_w, new_h = self.resize
                    scale = (new_w / float(orig_w), new_h / float(orig_h))
                else:
                    scale = None

                # Parse annotations in scaled coordinates
                annotations = parse_cvat_xml(xml_path, frame_skip = self.frame_skip, scale = scale)
                self.video_tensors[video_path] = video_tensor

            except Exception as e:
                # Best effort: one broken video/XML must not abort the whole dataset
                print(f"Skipping {os.path.basename(video_path)}: {e}")
                self.skipped_videos.append((video_path, str(e)))
                continue

            # Build frame-by-frame index map
            for frame_idx, ann_list in annotations.items():
                if not ann_list:
                    continue
                if frame_idx >= len(video_tensor):
                    print(f"Frame {frame_idx} out of range for {os.path.basename(video_path)} "
                          f"(video has {len(video_tensor)} frames) — skipping")
                    continue

                boxes = torch.tensor([a["bbox"] for a in ann_list], dtype=torch.float32)
                moving = torch.tensor([a["moving"] for a in ann_list], dtype=torch.int64)

                self.index_map.append((video_path, frame_idx, {"boxes": boxes,"moving": moving}))

            print(f"   [{vid_idx}/{len(self.samples)}] Loaded {os.path.basename(video_path)} "
                  f"({len(video_tensor)} frames, {len(annotations)} annotated)\n")

        print(f"Finished indexing {len(self.index_map)} annotated frames "
              f"from {len(self.video_tensors)} videos.\n")

    def __len__(self):
        """Return the number of indexed annotated frames."""
        return len(self.index_map)

    def __getitem__(self, idx):
        """Return ``(frame_tensor, target)`` for the idx-th annotated frame.

        ``target`` holds ``"boxes"``, ``"moving"`` (0/1 flags), ``"labels"``
        (``moving + 1``, so 1 = not moving and 2 = moving, leaving 0 unused by
        the data) and ``"video"`` (basename of the source video).
        """
        video_path, frame_idx, target = self.index_map[idx]

        video_tensor = self.video_tensors[video_path]
        frame_tensor = video_tensor[frame_idx]  # (3, H, W)

        #convert flags to int, and shift by one to form the class labels
        moving_flags = target["moving"].to(torch.int64)
        labels = moving_flags + 1

        target_out = {
            "boxes": target["boxes"],
            "moving": moving_flags,
            "labels": labels,
            "video": os.path.basename(video_path)}

        return frame_tensor, target_out

In [None]:
# Colab only: mount Google Drive at /content/drive so the video, annotation
# and weight paths used by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#define the model: pretrained Faster R-CNN with a new box head for num_classes
def get_model(num_classes):
    """Build a Faster R-CNN (ResNet-50 FPN) detector with a fresh box predictor.

    The network is created with ``pretrained=True`` and its box predictor is
    replaced by a new ``FastRCNNPredictor`` sized for ``num_classes`` outputs.

    Args:
        num_classes: Number of outputs of the new classification head.

    Returns:
        The modified torchvision detection model.
    """
    detection = torchvision.models.detection
    detector = detection.fasterrcnn_resnet50_fpn(pretrained = True)

    # Keep the input width of the existing head, swap in a new predictor
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    new_head = detection.faster_rcnn.FastRCNNPredictor(head_inputs, num_classes)
    detector.roi_heads.box_predictor = new_head
    return detector

def train_loop(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(images, targets)`` that returns a dict
            of loss tensors.
        dataloader: Iterable of ``(images, targets)`` batches; each target is
            a dict with at least ``"boxes"`` and ``"labels"`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the images, boxes and labels are moved to.

    Returns:
        The average of the summed per-batch losses, or ``nan`` when the
        dataloader yields no batches.
    """
    #set model to training mode
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_idx, (images, targets) in enumerate(dataloader):
        images = [img.to(device) for img in images]
        for t in targets:
            t["boxes"] = t["boxes"].to(device)
            t["labels"] = t["labels"].to(device)

        loss_dict = model(images, targets)
        # Total loss is the plain sum of every component the model reports
        losses = sum(loss for loss in loss_dict.values())

        #clear previous gradients
        optimizer.zero_grad()
        #compute new gradients via backpropagation
        losses.backward()
        #update model weights
        optimizer.step()

        batch_loss = losses.item()
        total_loss += batch_loss
        num_batches += 1
        if batch_idx % 5 == 0:
            print(f"Batch {batch_idx}/{len(dataloader)} | Loss: {batch_loss:.4f}")

    # An empty loader has no meaningful average; report nan instead of crashing
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Average Training Loss: {avg_loss:.4f}")
    return avg_loss

#evaluate the model
@torch.no_grad() #no gradients are tracked, so nothing here can update weights
def test_loop(model, dataloader, device):
    """Compute the mean validation loss without updating the model.

    Args:
        model: Module called as ``model(images, targets)`` that returns a dict
            of loss tensors while in training mode.
        dataloader: Iterable of ``(images, targets)`` batches; each target is
            a dict with at least ``"boxes"`` and ``"labels"`` tensors.
        device: Device the images, boxes and labels are moved to.

    Returns:
        The average of the summed per-batch losses, or ``nan`` when the
        dataloader yields no batches (e.g. an empty validation split).
    """
    was_training = model.training
    # Deliberately train mode: the model is called with targets and expected
    # to return its loss dict, which torchvision detection models only do in
    # training mode. Gradients stay disabled by the decorator.
    model.train()
    total_loss = 0.0
    num_batches = 0
    try:
        for images, targets in dataloader:
            images = [img.to(device) for img in images]
            for t in targets:
                t["boxes"] = t["boxes"].to(device)
                t["labels"] = t["labels"].to(device)

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            total_loss += losses.item()
            num_batches += 1
    finally:
        # Hand the model back in the mode the caller had it in
        model.train(was_training)

    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss

#Function to measure model accuracy
@torch.no_grad()
def accuracy_loop(model, dataloader, device, score_thresh=0.5):
    """Measure frame-level agreement on "does this frame contain label 2?".

    For every frame the ground truth is whether any target label equals 2;
    the prediction is whether any predicted label equals 2 among detections
    scoring at least ``score_thresh``. A frame counts as correct when both
    answers match.

    Args:
        model: Module put in eval mode and called as ``model(images)``; it
            must return one dict with ``"scores"`` and ``"labels"`` per image.
        dataloader: Iterable of ``(images, targets)`` batches.
        device: Device the images and labels are moved to.
        score_thresh: Minimum score for a detection to be considered.

    Returns:
        Fraction of correct frames, or 0.0 when no frame was seen.
    """
    model.eval()
    frames_seen = 0
    frames_right = 0

    for images, targets in dataloader:
        device_images = [img.to(device) for img in images]
        # keep the labels on the same device as the predictions
        for tgt in targets:
            tgt["labels"] = tgt["labels"].to(device)

        outputs = model(device_images)

        for output, tgt in zip(outputs, targets):
            # Ground truth: any annotated object labelled 2 in this frame?
            truth_moving = bool((tgt["labels"] == 2).any().item())

            # Prediction: any sufficiently confident detection labelled 2?
            confident = output["scores"].to(device) >= score_thresh
            confident_labels = output["labels"].to(device)[confident]
            predicted_moving = bool((confident_labels == 2).any().item())

            frames_seen += 1
            if truth_moving == predicted_moving:
                frames_right += 1

    if frames_seen > 0:
        acc = frames_right / frames_seen
    else:
        acc = 0.0
    print(f"Frame-level moving/not-moving accuracy: {acc*100:.2f}% "
          f"(threshold={score_thresh})")
    return acc

def _detection_collate(batch):
    """Collate detection samples into ``(frames, targets)`` lists, dropping ``None`` items."""
    batch = [b for b in batch if b is not None]
    frames = [b[0] for b in batch]
    targets = [b[1] for b in batch]
    return frames, targets


def train_detector(train_dataset, val_dataset, num_classes=2, epochs=5, lr=1e-4, batch_size=4, collate_fn=None):
    """Train a Faster R-CNN detector and report loss/accuracy after every epoch.

    Args:
        train_dataset: Dataset yielding ``(frame_tensor, target_dict)`` items.
        val_dataset: Dataset of the same form used for validation.
        num_classes: Size of the detector's classification head. NOTE: the
            labels produced by ``BaseballVideoDataset`` are 1 and 2, so that
            dataset needs ``num_classes=3``; the default of 2 is too small
            for it.
        epochs: Number of passes over ``train_dataset``.
        lr: Learning rate for AdamW.
        batch_size: Batch size of both data loaders.
        collate_fn: Optional batch collation function. Defaults to a
            list-based collate (frames list, targets list), so the function no
            longer depends on a global ``collate_fn`` being defined elsewhere.

    Returns:
        The trained model.
    """
    if collate_fn is None:
        collate_fn = _detection_collate

    #use gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model = get_model(num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}\n----------------------------")
        train_loss = train_loop(model, train_loader, optimizer, device)
        val_loss = test_loop(model, val_loader, device)
        val_acc  = accuracy_loop(model, val_loader, device, score_thresh=0.5)
        print(f"Summary: train_loss={train_loss:.4f}, " f"val_loss={val_loss:.4f}, val_acc={val_acc*100:.2f}%")

    return model

In [None]:
#train and save trained model
if __name__ == "__main__":
    video_folder = r"/content/drive/MyDrive/Raw Videos" #Change this when swapping devices to your local Raw Video folder
    xml_folder   = r"/content/drive/MyDrive/Annotations" #Change this when swapping devices to your local Annotations folder
    # Single source of truth for the output file so the log matches what is written
    weights_file = "fasterrcnn_moving_detector_2.0.pth"

    #Collate function for object detection: keeps frames and targets as lists
    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        frames = [b[0] for b in batch]
        targets = [b[1] for b in batch]
        return frames, targets

    #Load full dataset (videos + XMLs)
    full_dataset = BaseballVideoDataset(video_folder, xml_folder, resize=(1280, 720), frame_skip=1, scale_boxes=True)

    #print a quick summary
    n = len(full_dataset)
    print(f"\nDataset contains {n} annotated frames across videos.")
    if n == 0:
        raise RuntimeError("No annotated frames were indexed; check video_folder and xml_folder.")

    #Split train/validation 80/20 at frame level
    n_train = int(0.8 * n)
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [n_train, n - n_train])

    #Train the model (3 classes: dataset labels are 1 and 2, index 0 is unused by the data)
    trained_model = train_detector(
        train_dataset,
        val_dataset,
        num_classes=3,
        epochs=5,
        lr=1e-4,
        batch_size=4)

    #Save trained model
    torch.save(trained_model.state_dict(), weights_file)
    print(f"\nModel saved as '{weights_file}'.")

No XML found for IMG_8226_jared.mov
No XML found for IMG_8124_joe.mov
No XML found for IMG_8028_joel.mov
No XML found for IMG_8139_joe.mov
No XML found for IMG_8123_scott.mov
No XML found for IMG_8027_joel.mov
No XML found for IMG_8243_jared.mov
No XML found for IMG_8063_scott.mov
No XML found for IMG_8030_patrick.mov
No XML found for IMG_8255_zach.mov
No XML found for IMG_8121_scott.mov
No XML found for IMG_8138_joe.mov
No XML found for IMG_8242_jared.mov
No XML found for IMG_8241_jared.mov
No XML found for IMG_8140_joe.mov
No XML found for IMG_8122_scott.mov
No XML found for IMG_8029_joel.mov
No XML found for IMG_7999_joel.mov
No XML found for IMG_8252_zach.mov
No XML found for IMG_8923_souleymane.mov
No XML found for IMG_8947_souleymane.mov
No XML found for IMG_8257_zach.mov
No XML found for IMG_9198_joel.mov
No XML found for IMG_8256_zach.mov
No XML found for IMG_8924_souleymane.mov
No XML found for IMG_8946_souleymane.mov

 Found 21 videos with matching XMLs in /content/drive/MyDr



Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


100%|██████████| 160M/160M [00:00<00:00, 215MB/s]



Epoch 1/5
----------------------------
Batch 0/239 | Loss: 4.2637
Batch 5/239 | Loss: 0.6494
Batch 10/239 | Loss: 0.8681
Batch 15/239 | Loss: 0.4860
Batch 20/239 | Loss: 0.6403
Batch 25/239 | Loss: 0.4103
Batch 30/239 | Loss: 0.5687
Batch 35/239 | Loss: 0.5541
Batch 40/239 | Loss: 0.2250
Batch 45/239 | Loss: 0.3853
Batch 50/239 | Loss: 0.2741
Batch 55/239 | Loss: 0.4839
Batch 60/239 | Loss: 0.3406
Batch 65/239 | Loss: 0.8450
Batch 70/239 | Loss: 0.1846
Batch 75/239 | Loss: 0.5465
Batch 80/239 | Loss: 0.6376
Batch 85/239 | Loss: 0.4792
Batch 90/239 | Loss: 0.4808
Batch 95/239 | Loss: 0.3720
Batch 100/239 | Loss: 0.5374
Batch 105/239 | Loss: 0.5006
Batch 110/239 | Loss: 0.1673
Batch 115/239 | Loss: 0.5370
Batch 120/239 | Loss: 0.4840
Batch 125/239 | Loss: 0.5427
Batch 130/239 | Loss: 0.4162
Batch 135/239 | Loss: 0.5092
Batch 140/239 | Loss: 0.3625
Batch 145/239 | Loss: 0.2212
Batch 150/239 | Loss: 0.4685
Batch 155/239 | Loss: 0.3553
Batch 160/239 | Loss: 0.5177
Batch 165/239 | Loss: 0.5

In [None]:
#train and save trained model
if __name__ == "__main__":
    video_folder = r"/content/drive/MyDrive/Raw Videos" #Change this when swapping devices to your local Raw Video folder
    xml_folder   = r"/content/drive/MyDrive/Annotations" #Change this when swapping devices to your local Annotations folder
    # Single source of truth for the output file so the log matches what is written
    weights_file = "fasterrcnn_moving_detector_2.0.pth"

    #Collate function for object detection: keeps frames and targets as lists
    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        frames = [b[0] for b in batch]
        targets = [b[1] for b in batch]
        return frames, targets

    #Load full dataset (videos + XMLs)
    full_dataset = BaseballVideoDataset(video_folder, xml_folder, resize=(1280, 720), frame_skip=1, scale_boxes=True)

    #print a quick summary
    n = len(full_dataset)
    print(f"\nDataset contains {n} annotated frames across videos.")
    if n == 0:
        raise RuntimeError("No annotated frames were indexed; check video_folder and xml_folder.")

    #Split train/validation 80/20 at frame level
    n_train = int(0.8 * n)
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [n_train, n - n_train])

    #Train the model (3 classes: dataset labels are 1 and 2, index 0 is unused by the data)
    trained_model = train_detector(
        train_dataset,
        val_dataset,
        num_classes=3,
        epochs=5,
        lr=1e-4,
        batch_size=4)

    #Save trained model
    torch.save(trained_model.state_dict(), weights_file)
    print(f"\nModel saved as '{weights_file}'.")

No XML found for IMG_8226_jared.mov
No XML found for IMG_8124_joe.mov
No XML found for IMG_8028_joel.mov
No XML found for IMG_8139_joe.mov
No XML found for IMG_8123_scott.mov
No XML found for IMG_8027_joel.mov
No XML found for IMG_8243_jared.mov
No XML found for IMG_8063_scott.mov
No XML found for IMG_8030_patrick.mov
No XML found for IMG_8255_zach.mov
No XML found for IMG_8121_scott.mov
No XML found for IMG_8138_joe.mov
No XML found for IMG_8242_jared.mov
No XML found for IMG_8241_jared.mov
No XML found for IMG_8140_joe.mov
No XML found for IMG_8122_scott.mov
No XML found for IMG_8029_joel.mov
No XML found for IMG_7999_joel.mov
No XML found for IMG_8252_zach.mov
No XML found for IMG_8923_souleymane.mov
No XML found for IMG_8947_souleymane.mov
No XML found for IMG_8257_zach.mov
No XML found for IMG_9198_joel.mov
No XML found for IMG_8256_zach.mov
No XML found for IMG_8924_souleymane.mov
No XML found for IMG_8946_souleymane.mov

 Found 21 videos with matching XMLs in /content/drive/MyDr



Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


100%|██████████| 160M/160M [00:00<00:00, 215MB/s]



Epoch 1/5
----------------------------
Batch 0/239 | Loss: 4.2637
Batch 5/239 | Loss: 0.6494
Batch 10/239 | Loss: 0.8681
Batch 15/239 | Loss: 0.4860
Batch 20/239 | Loss: 0.6403
Batch 25/239 | Loss: 0.4103
Batch 30/239 | Loss: 0.5687
Batch 35/239 | Loss: 0.5541
Batch 40/239 | Loss: 0.2250
Batch 45/239 | Loss: 0.3853
Batch 50/239 | Loss: 0.2741
Batch 55/239 | Loss: 0.4839
Batch 60/239 | Loss: 0.3406
Batch 65/239 | Loss: 0.8450
Batch 70/239 | Loss: 0.1846
Batch 75/239 | Loss: 0.5465
Batch 80/239 | Loss: 0.6376
Batch 85/239 | Loss: 0.4792
Batch 90/239 | Loss: 0.4808
Batch 95/239 | Loss: 0.3720
Batch 100/239 | Loss: 0.5374
Batch 105/239 | Loss: 0.5006
Batch 110/239 | Loss: 0.1673
Batch 115/239 | Loss: 0.5370
Batch 120/239 | Loss: 0.4840
Batch 125/239 | Loss: 0.5427
Batch 130/239 | Loss: 0.4162
Batch 135/239 | Loss: 0.5092
Batch 140/239 | Loss: 0.3625
Batch 145/239 | Loss: 0.2212
Batch 150/239 | Loss: 0.4685
Batch 155/239 | Loss: 0.3553
Batch 160/239 | Loss: 0.5177
Batch 165/239 | Loss: 0.5

In [None]:
#train and save trained model
if __name__ == "__main__":
    video_folder = r"/content/drive/MyDrive/Raw Videos" #Change this when swapping devices to your local Raw Video folder
    xml_folder   = r"/content/drive/MyDrive/Annotations" #Change this when swapping devices to your local Annotations folder
    # Single source of truth for the output file so the log matches what is written
    weights_file = "fasterrcnn_moving_detector_2.0.pth"

    #Collate function for object detection: keeps frames and targets as lists
    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        frames = [b[0] for b in batch]
        targets = [b[1] for b in batch]
        return frames, targets

    #Load full dataset (videos + XMLs)
    full_dataset = BaseballVideoDataset(video_folder, xml_folder, resize=(1280, 720), frame_skip=1, scale_boxes=True)

    #print a quick summary
    n = len(full_dataset)
    print(f"\nDataset contains {n} annotated frames across videos.")
    if n == 0:
        raise RuntimeError("No annotated frames were indexed; check video_folder and xml_folder.")

    #Split train/validation 80/20 at frame level
    n_train = int(0.8 * n)
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [n_train, n - n_train])

    #Train the model (3 classes: dataset labels are 1 and 2, index 0 is unused by the data)
    trained_model = train_detector(
        train_dataset,
        val_dataset,
        num_classes=3,
        epochs=5,
        lr=1e-4,
        batch_size=4)

    #Save trained model
    torch.save(trained_model.state_dict(), weights_file)
    print(f"\nModel saved as '{weights_file}'.")

No XML found for IMG_8226_jared.mov
No XML found for IMG_8124_joe.mov
No XML found for IMG_8028_joel.mov
No XML found for IMG_8139_joe.mov
No XML found for IMG_8123_scott.mov
No XML found for IMG_8027_joel.mov
No XML found for IMG_8243_jared.mov
No XML found for IMG_8063_scott.mov
No XML found for IMG_8030_patrick.mov
No XML found for IMG_8255_zach.mov
No XML found for IMG_8121_scott.mov
No XML found for IMG_8138_joe.mov
No XML found for IMG_8242_jared.mov
No XML found for IMG_8241_jared.mov
No XML found for IMG_8140_joe.mov
No XML found for IMG_8122_scott.mov
No XML found for IMG_8029_joel.mov
No XML found for IMG_7999_joel.mov
No XML found for IMG_8252_zach.mov
No XML found for IMG_8923_souleymane.mov
No XML found for IMG_8947_souleymane.mov
No XML found for IMG_8257_zach.mov
No XML found for IMG_9198_joel.mov
No XML found for IMG_8256_zach.mov
No XML found for IMG_8924_souleymane.mov
No XML found for IMG_8946_souleymane.mov

 Found 21 videos with matching XMLs in /content/drive/MyDr



Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


100%|██████████| 160M/160M [00:00<00:00, 215MB/s]



Epoch 1/5
----------------------------
Batch 0/239 | Loss: 4.2637
Batch 5/239 | Loss: 0.6494
Batch 10/239 | Loss: 0.8681
Batch 15/239 | Loss: 0.4860
Batch 20/239 | Loss: 0.6403
Batch 25/239 | Loss: 0.4103
Batch 30/239 | Loss: 0.5687
Batch 35/239 | Loss: 0.5541
Batch 40/239 | Loss: 0.2250
Batch 45/239 | Loss: 0.3853
Batch 50/239 | Loss: 0.2741
Batch 55/239 | Loss: 0.4839
Batch 60/239 | Loss: 0.3406
Batch 65/239 | Loss: 0.8450
Batch 70/239 | Loss: 0.1846
Batch 75/239 | Loss: 0.5465
Batch 80/239 | Loss: 0.6376
Batch 85/239 | Loss: 0.4792
Batch 90/239 | Loss: 0.4808
Batch 95/239 | Loss: 0.3720
Batch 100/239 | Loss: 0.5374
Batch 105/239 | Loss: 0.5006
Batch 110/239 | Loss: 0.1673
Batch 115/239 | Loss: 0.5370
Batch 120/239 | Loss: 0.4840
Batch 125/239 | Loss: 0.5427
Batch 130/239 | Loss: 0.4162
Batch 135/239 | Loss: 0.5092
Batch 140/239 | Loss: 0.3625
Batch 145/239 | Loss: 0.2212
Batch 150/239 | Loss: 0.4685
Batch 155/239 | Loss: 0.3553
Batch 160/239 | Loss: 0.5177
Batch 165/239 | Loss: 0.5

In [None]:
#train and save trained model
if __name__ == "__main__":
    video_folder = r"/content/drive/MyDrive/Raw Videos" #Change this when swapping devices to your local Raw Video folder
    xml_folder   = r"/content/drive/MyDrive/Annotations" #Change this when swapping devices to your local Annotations folder
    # Single source of truth for the output file so the log matches what is written
    weights_file = "fasterrcnn_moving_detector_2.0.pth"

    #Collate function for object detection: keeps frames and targets as lists
    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        frames = [b[0] for b in batch]
        targets = [b[1] for b in batch]
        return frames, targets

    #Load full dataset (videos + XMLs)
    full_dataset = BaseballVideoDataset(video_folder, xml_folder, resize=(1280, 720), frame_skip=1, scale_boxes=True)

    #print a quick summary
    n = len(full_dataset)
    print(f"\nDataset contains {n} annotated frames across videos.")
    if n == 0:
        raise RuntimeError("No annotated frames were indexed; check video_folder and xml_folder.")

    #Split train/validation 80/20 at frame level
    n_train = int(0.8 * n)
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [n_train, n - n_train])

    #Train the model (3 classes: dataset labels are 1 and 2, index 0 is unused by the data)
    trained_model = train_detector(
        train_dataset,
        val_dataset,
        num_classes=3,
        epochs=5,
        lr=1e-4,
        batch_size=4)

    #Save trained model
    torch.save(trained_model.state_dict(), weights_file)
    print(f"\nModel saved as '{weights_file}'.")

No XML found for IMG_8226_jared.mov
No XML found for IMG_8124_joe.mov
No XML found for IMG_8028_joel.mov
No XML found for IMG_8139_joe.mov
No XML found for IMG_8123_scott.mov
No XML found for IMG_8027_joel.mov
No XML found for IMG_8243_jared.mov
No XML found for IMG_8063_scott.mov
No XML found for IMG_8030_patrick.mov
No XML found for IMG_8255_zach.mov
No XML found for IMG_8121_scott.mov
No XML found for IMG_8138_joe.mov
No XML found for IMG_8242_jared.mov
No XML found for IMG_8241_jared.mov
No XML found for IMG_8140_joe.mov
No XML found for IMG_8122_scott.mov
No XML found for IMG_8029_joel.mov
No XML found for IMG_7999_joel.mov
No XML found for IMG_8252_zach.mov
No XML found for IMG_8923_souleymane.mov
No XML found for IMG_8947_souleymane.mov
No XML found for IMG_8257_zach.mov
No XML found for IMG_9198_joel.mov
No XML found for IMG_8256_zach.mov
No XML found for IMG_8924_souleymane.mov
No XML found for IMG_8946_souleymane.mov

 Found 21 videos with matching XMLs in /content/drive/MyDr



Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


100%|██████████| 160M/160M [00:00<00:00, 215MB/s]



Epoch 1/5
----------------------------
Batch 0/239 | Loss: 4.2637
Batch 5/239 | Loss: 0.6494
Batch 10/239 | Loss: 0.8681
Batch 15/239 | Loss: 0.4860
Batch 20/239 | Loss: 0.6403
Batch 25/239 | Loss: 0.4103
Batch 30/239 | Loss: 0.5687
Batch 35/239 | Loss: 0.5541
Batch 40/239 | Loss: 0.2250
Batch 45/239 | Loss: 0.3853
Batch 50/239 | Loss: 0.2741
Batch 55/239 | Loss: 0.4839
Batch 60/239 | Loss: 0.3406
Batch 65/239 | Loss: 0.8450
Batch 70/239 | Loss: 0.1846
Batch 75/239 | Loss: 0.5465
Batch 80/239 | Loss: 0.6376
Batch 85/239 | Loss: 0.4792
Batch 90/239 | Loss: 0.4808
Batch 95/239 | Loss: 0.3720
Batch 100/239 | Loss: 0.5374
Batch 105/239 | Loss: 0.5006
Batch 110/239 | Loss: 0.1673
Batch 115/239 | Loss: 0.5370
Batch 120/239 | Loss: 0.4840
Batch 125/239 | Loss: 0.5427
Batch 130/239 | Loss: 0.4162
Batch 135/239 | Loss: 0.5092
Batch 140/239 | Loss: 0.3625
Batch 145/239 | Loss: 0.2212
Batch 150/239 | Loss: 0.4685
Batch 155/239 | Loss: 0.3553
Batch 160/239 | Loss: 0.5177
Batch 165/239 | Loss: 0.5

In [None]:
#link to .pth file in one drive, was too large to upload to github
#https://uofnebraska-my.sharepoint.com/:u:/g/personal/51628718_nebraska_edu/IQCJ0x1cdpvdQJXmvP5vf1XLAcN6AUHbCMoGgel_l8_oEMM?e=qqaj9l

#import script for trained model
def load_trained_model(weights_path, num_classes=3, device=None):
    """Rebuild the detector architecture and load saved weights for inference.

    Args:
        weights_path: File containing the saved ``state_dict``.
        num_classes: Size of the box predictor head; must match the value the
            weights were trained with.
        device: Target device. ``None`` picks CUDA when available, else CPU.

    Returns:
        The model on ``device``, in eval mode.
    """
    # Pick a device when the caller did not specify one
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading model on device: {device}")

    # Same architecture as used for training: Faster R-CNN with a replaced head
    detection = torchvision.models.detection
    detector = detection.fasterrcnn_resnet50_fpn(pretrained=False)
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(head_inputs, num_classes)

    # Restore the saved parameters (tensors are mapped onto the chosen device)
    saved_weights = torch.load(weights_path, map_location=device)
    detector.load_state_dict(saved_weights)

    # Inference setup: move to device, switch to eval mode
    detector.to(device)
    detector.eval()

    print(f"Model loaded successfully from '{weights_path}'")
    return detector

In [None]:
!pip install -q torch torchvision torchaudio
!pip install -q opencv-python-headless
!pip install -q pillow
!pip install -q numpy

In [None]:
import os
import cv2
import torch
import torchvision
import numpy as np
from torch.utils.data import Dataset, DataLoader
import xml.etree.ElementTree as ET
from collections import defaultdict
from PIL import Image
from google.colab.patches import cv2_imshow   # ← this line enables cv2_imshow in Colab
import torchvision.transforms as T
from google.colab import files

# Provided function to load the model
def load_trained_model(weights_path, num_classes=3, device=None):
    """Rebuild the detector architecture and load saved weights for inference.

    Args:
        weights_path: File containing the saved ``state_dict``.
        num_classes: Size of the box predictor head; must match the value the
            weights were trained with.
        device: Target device. ``None`` picks CUDA when available, else CPU.

    Returns:
        The model on ``device``, in eval mode.
    """
    # Pick a device when the caller did not specify one
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading model on device: {device}")

    # Same architecture as used for training: Faster R-CNN with a replaced head
    detection = torchvision.models.detection
    detector = detection.fasterrcnn_resnet50_fpn(pretrained=False)
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(head_inputs, num_classes)

    # Restore the saved parameters (tensors are mapped onto the chosen device)
    saved_weights = torch.load(weights_path, map_location=device)
    detector.load_state_dict(saved_weights)

    # Inference setup: move to device, switch to eval mode
    detector.to(device)
    detector.eval()

    print(f"Model loaded successfully from '{weights_path}'")
    return detector

# Function to process the video and draw boxes around moving baseball
def process_video(video_path, output_path, weights_path="fasterrcnn_moving_detector.pth"):
    """Run the detector on a video and write a copy with label-2 boxes drawn.

    Every frame is resized to 1280x720, passed through the model, and every
    detection with label 2 and score above 0.5 is outlined in red before the
    frame is written to ``output_path``.

    Args:
        video_path: Input video file.
        output_path: Output video file (written with the ``mp4v`` codec).
        weights_path: Saved ``state_dict`` passed to ``load_trained_model``.

    Returns:
        None. Prints an error and returns early if the input video or the
        output writer cannot be opened.
    """
    # Load the model
    model = load_trained_model(weights_path)

    # Open the input video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        return

    writer = None
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Original video size: {orig_width}x{orig_height}, FPS: {fps}")
        if not fps or fps <= 0:
            # Some containers report no frame rate; fall back to a sane value
            fps = 30.0

        # Define resized dimensions
        new_width = 1280
        new_height = 720

        # Create output video writer (using mp4v codec for .mp4; change to 'MOV ' if needing .mov)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (new_width, new_height))
        if not writer.isOpened():
            print(f"Error opening output file: {output_path}")
            return

        # Device from model
        device = next(model.parameters()).device

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            print(f"Processing frame {frame_count}...")

            # Resize the frame
            frame_resized = cv2.resize(frame, (new_width, new_height))

            # Same preprocessing as training (see video_to_tensor): RGB, CHW,
            # scaled to [0, 1]. No extra ImageNet normalization here; the
            # training frames were not normalized that way, and torchvision's
            # Faster R-CNN is expected to normalize internally (TODO confirm
            # against the torchvision docs).
            rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            image_tensor = (torch.from_numpy(rgb_frame).permute(2, 0, 1).float() / 255.0).to(device)

            # Run inference
            with torch.no_grad():
                predictions = model([image_tensor])[0]

            # Extract predictions
            boxes = predictions['boxes'].cpu().numpy()
            labels = predictions['labels'].cpu().numpy()
            scores = predictions['scores'].cpu().numpy()

            # Draw boxes for moving baseball (label 2 = moving in the training data)
            for box, label, score in zip(boxes, labels, scores):
                if score > 0.5 and label == 2:
                    x1, y1, x2, y2 = map(int, box)
                    # Draw red rectangle (BGR color)
                    cv2.rectangle(frame_resized, (x1, y1), (x2, y2), (0, 0, 255), thickness=2)

            # Write the annotated frame to output
            writer.write(frame_resized)
    finally:
        # Release resources even if inference fails mid-video
        cap.release()
        if writer is not None:
            writer.release()

    print(f"Processing complete. Output saved to {output_path}")

In [None]:
def process_video_colab2(
        video_path,
        output_path="output_with_boxes_v2.mp4",
        weights_path="fasterrcnn_moving_detector_2.pth",
        general_conf_threshold=0.3,
        thrown_ball_label=3,
        thrown_ball_min_score=0.35):
    """Annotate a video with all confident detections and report thrown balls.

    Frames are processed at their original resolution. Every detection
    scoring above ``general_conf_threshold`` is drawn (red for
    ``thrown_ball_label``, green otherwise), each frame is displayed with
    ``cv2_imshow`` and written to ``output_path``.

    Args:
        video_path: Input video file.
        output_path: Output video file (written with the ``mp4v`` codec).
        weights_path: Saved ``state_dict`` passed to ``load_trained_model``.
        general_conf_threshold: Minimum score for a detection to be drawn.
        thrown_ball_label: Class id treated as the thrown ball. The model is
            loaded with 3 classes, so only labels 1 and 2 can be predicted;
            the default of 3 never matches and the caller should pass 2.
        thrown_ball_min_score: Minimum score for a thrown-ball detection to be
            reported. It only has an effect above ``general_conf_threshold``,
            because weaker detections are discarded first.

    Returns:
        None. Prints an error and returns early if the input video or the
        output writer cannot be opened.
    """
    num_classes = 3

    # Load model (your existing function)
    model = load_trained_model(weights_path, num_classes=num_classes)
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    if not 0 < thrown_ball_label < num_classes:
        print(f"Warning: thrown_ball_label={thrown_ball_label} cannot be predicted by a "
              f"{num_classes}-class model (valid object labels: 1..{num_classes - 1}).")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video")
        return

    writer = None
    found_thrown_ball = False
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            # Some containers report no frame rate; fall back to a sane value
            fps = 30.0

        # Use original dimensions for output video
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Video writer to save the result
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not writer.isOpened():
            print(f"Error opening output file: {output_path}")
            return

        print("Starting processing... We will display every frame and print detection info if a thrown ball is found.")

        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_idx += 1
            # Do not resize the frame; draw on a copy of the original
            display_frame = frame.copy()

            # Same preprocessing as training (see video_to_tensor): RGB, CHW,
            # scaled to [0, 1]. No extra ImageNet normalization here; the
            # training frames were not normalized that way, and torchvision's
            # Faster R-CNN is expected to normalize internally (TODO confirm
            # against the torchvision docs).
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_tensor = (torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0).to(device)

            # Inference
            with torch.no_grad():
                pred = model([img_tensor])[0]

            boxes  = pred['boxes'].cpu().numpy()
            labels = pred['labels'].cpu().numpy()
            scores = pred['scores'].cpu().numpy()

            thrown_ball_scores = []

            for box, label, score in zip(boxes, labels, scores):
                # Apply general confidence threshold for drawing all boxes
                if score <= general_conf_threshold:
                    continue

                x1, y1, x2, y2 = map(int, box)
                color = (0, 255, 0) # Default green for other detections
                label_text = f"Class {label} ({score:.2f})"

                if label == thrown_ball_label:
                    color = (0, 0, 255) # Red for thrown ball
                    if score > thrown_ball_min_score:
                        thrown_ball_scores.append(score)
                        found_thrown_ball = True

                # Draw rectangle and put text for all confident detections
                cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 3)
                cv2.putText(display_frame, label_text, (x1, y1-8),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

            # Display every frame
            cv2_imshow(display_frame)

            # Print detection info only if a thrown ball is found in this frame
            if thrown_ball_scores:
                print(f"Frame {frame_idx:4d} → Thrown ball (Class {thrown_ball_label}) detected with confidence: {max(thrown_ball_scores):.3f}")
                print("-" * 60)

            # Always write frame to output video
            writer.write(display_frame)
    finally:
        # Release resources even if inference or display fails mid-video
        cap.release()
        if writer is not None:
            writer.release()

    print(f"\nFinished! Video saved as → {output_path}")
    if found_thrown_ball:
        print(f"Thrown ball (Class {thrown_ball_label}) successfully detected in some frames!")
    else:
        print(f"Warning: No confident thrown ball (Class {thrown_ball_label}) found with the given thresholds. Try lowering `thrown_ball_min_score` or check if `thrown_ball_label` is correct.")

In [None]:
# Example call: annotate one raw video from Drive with the trained weights.
# thrown_ball_label is passed explicitly as 2 (the label the dataset assigns
# to moving objects) instead of relying on the function's default.
process_video_colab2(
    video_path="/content/drive/MyDrive/Raw Videos/IMG_8060_patrick.mov",
    output_path="IMG_8060.Box.mp4",
    weights_path="/content/drive/MyDrive/fasterrcnn_moving_detector_2.0.pth",
    general_conf_threshold=0.5,  # Minimum score for any detection to be drawn
    thrown_ball_label=2,          # Class id of the 'thrown ball' (moving) detections
    thrown_ball_min_score=0.5    # Minimum score for a thrown-ball detection to be reported
)