need to correct scaling
- scaling videos but not annotations

In [4]:
import os
import cv2
import torch
import torchvision
import numpy as np
from torch.utils.data import Dataset, DataLoader
import xml.etree.ElementTree as ET
from collections import defaultdict

# Convert a video into a tensor
def video_to_tensor(video_path, resize=None, frame_skip=1):
    print(f"Loading video: {os.path.basename(video_path)}")
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if resize:
            frame = cv2.resize(frame, resize)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
        frame_count += 1
    cap.release()

    if not frames:
        raise ValueError(f"No frames read from {video_path}")

    frames = np.stack(frames)[::frame_skip]
    frames = torch.from_numpy(frames).float().permute(0, 3, 1, 2) / 255.0
    print(f"Loaded {frame_count} frames -> kept {frames.shape[0]} after skipping\n")
    return frames  # shape: (T, 3, H, W)

# Parse CVAT XML annotation file
def parse_cvat_xml(xml_path, frame_skip=1):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    annotations = defaultdict(list)

    for track in root.findall("track"):
        label = track.attrib["label"]
        for box in track.findall("box"):
            frame = int(box.attrib["frame"])
            outside = int(box.attrib["outside"])
            if outside != 0:
                continue

            xtl = float(box.attrib["xtl"])
            ytl = float(box.attrib["ytl"])
            xbr = float(box.attrib["xbr"])
            ybr = float(box.attrib["ybr"])

            moving_attr = box.find("attribute[@name='moving']")
            if moving_attr is None:
                raise ValueError(f"Missing 'moving' attribute in file {xml_path}, track '{label}', frame {frame}")

            moving_flag = 1 if moving_attr.text.lower() == "true" else 0

            if frame % frame_skip == 0:
                adjusted_frame = frame // frame_skip
                annotations[adjusted_frame].append({
                    "label": label,
                    "bbox": [xtl, ytl, xbr, ybr],
                    "moving": moving_flag})

    return annotations

# Dataset class that loads videos + XML annotations together
class BaseballVideoDataset(Dataset):
    def __init__(self, video_dir, xml_dir, resize=(1280, 720), frame_skip=1):
        self.video_dir = video_dir
        self.xml_dir = xml_dir
        self.resize = resize
        self.frame_skip = frame_skip
        self.video_tensors = {}
        self.skipped_videos = []
        self.index_map = []

        # Match videos with their annotation XMLs by filename stem
        self.samples = []
        for vid_name in os.listdir(video_dir):
            if vid_name.lower().endswith((".mp4", ".mov", ".avi")):
                stem = os.path.splitext(vid_name)[0]
                xml_path = os.path.join(xml_dir, f"{stem}.xml")
                if os.path.exists(xml_path):
                    self.samples.append((os.path.join(video_dir, vid_name), xml_path))
                else:
                    print(f"No XML found for {vid_name}")
        print(f"\n Found {len(self.samples)} videos with matching XMLs in {video_dir}\n")

        print("Preloading videos and indexing frames...\n")

        # Loop through all matched video/XML pairs
        for vid_idx, (video_path, xml_path) in enumerate(self.samples, start=1):
            try:
                video_tensor = video_to_tensor(video_path, resize=self.resize, frame_skip=self.frame_skip)
                annotations = parse_cvat_xml(xml_path, frame_skip=self.frame_skip)
                self.video_tensors[video_path] = video_tensor

            except Exception as e:
                print(f"Skipping {os.path.basename(video_path)}: {e}")
                self.skipped_videos.append((video_path, str(e)))
                continue

            # Build index map (frame-by-frame)
            for frame_idx, ann_list in annotations.items():
                if len(ann_list) == 0:
                    continue
                if frame_idx >= len(video_tensor):
                    print(f"Frame {frame_idx} out of range for {os.path.basename(video_path)} "
                          f"(video has {len(video_tensor)} frames) — skipping")
                    continue

                boxes = torch.tensor([a["bbox"] for a in ann_list], dtype=torch.float32)
                moving = torch.tensor([a["moving"] for a in ann_list], dtype=torch.int64)

                self.index_map.append((video_path, frame_idx, {
                    "boxes": boxes,
                    "moving": moving}))

            print(f"   [{vid_idx}/{len(self.samples)}] Loaded {os.path.basename(video_path)} "
                  f"({len(video_tensor)} frames, {len(annotations)} annotated)\n")

        print(f"Finished indexing {len(self.index_map)} annotated frames "
              f"from {len(self.video_tensors)} videos.\n")

    def __len__(self):
        return len(self.index_map)

    def __getitem__(self, idx):
        video_path, frame_idx, target = self.index_map[idx]

        video_tensor = self.video_tensors[video_path]
        frame_tensor = video_tensor[frame_idx]  # (3, H, W)

        moving_flags = target["moving"]
        labels = moving_flags.clone()
        labels[labels == 0] = 1
        labels[labels == 1] = 2

        target["labels"] = labels
        target["video"] = os.path.basename(video_path)

        return frame_tensor, target


In [None]:
#define the model, adjustable number of classes, not pretrained
def get_model(num_classes):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained = False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)
    return model

def train_loop(model, dataloader, optimizer, device):
    #set model to training mode
    model.train()
    total_loss = 0.0
    for batch_idx, (images, targets) in enumerate(dataloader):
            images = [img.to(device) for img in images]
            for t in targets:
                t["boxes"] = t["boxes"].to(device)
                t["labels"] = t["labels"].to(device)

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            #clear previous gradients
            optimizer.zero_grad()
            #compute new gradients via backpropagation
            losses.backward()
            #update model weights
            optimizer.step()

            total_loss += losses.item()
            if batch_idx % 5 == 0:
                print(f"Batch {batch_idx}/{len(dataloader)} | Loss: {losses.item():.4f}")

    avg_loss = total_loss/len(dataloader)
    print(f"Average Training LossL: {avg_loss:.4f}")
    return avg_loss

#evaluate the model
@torch.no_grad() #prevent gradient updates
def test_loop(model, dataloader, device):
    model.train()
    total_loss = 0.0
    for images, targets in dataloader:
        images = [img.to(device) for img in images]
        for t in targets:
            t["boxes"] = t["boxes"].to(device)
            t["labels"] = t["labels"].to(device)

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        total_loss += losses.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss

#Function to measure model accuracy
@torch.no_grad()
def accuracy_loop(model, dataloader, device, score_thresh=0.5):
    model.eval()
    num_frames = 0
    num_correct = 0

    for images, targets in dataloader:
        images = [img.to(device) for img in images]
        # move labels to device just in case
        for t in targets:
            t["labels"] = t["labels"].to(device)

        predictions = model(images)

        for pred, tgt in zip(predictions, targets):
            # Ground truth: does this frame contain any moving object?
            gt_any_moving = (tgt["labels"] == 2).any().item()

            # Predictions: keep only boxes above a confidence threshold
            scores = pred["scores"].to(device)
            labels = pred["labels"].to(device)
            keep = scores >= score_thresh
            pred_labels = labels[keep]

            pred_any_moving = (pred_labels == 2).any().item()

            num_frames += 1
            if bool(gt_any_moving) == bool(pred_any_moving):
                num_correct += 1

    acc = num_correct / num_frames if num_frames > 0 else 0.0
    print(f"Frame-level moving/not-moving accuracy: {acc*100:.2f}% "
          f"(threshold={score_thresh})")
    return acc

def train_detector(train_dataset, val_dataset, num_classes=2, epochs=5, lr=1e-4, batch_size=4):
    #use gpu if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    model = get_model(num_classes).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}\n----------------------------")
        train_loss = train_loop(model, train_loader, optimizer, device)
        val_loss = test_loop(model, val_loader, device)
        val_acc  = accuracy_loop(model, val_loader, device, score_thresh=0.5)
        print(f"Summary: train_loss={train_loss:.4f}, " f"val_loss={val_loss:.4f}, val_acc={val_acc*100:.2f}%")

    return model

In [6]:
#train and save trained model
if __name__ == "__main__":
    video_folder = r"C:\Users\Glen\Documents\School\BusForecasting\Final Project Folder\Raw Videos"
    xml_folder   = r"C:\Users\Glen\Documents\School\BusForecasting\Final Project Folder\Annotations"

    #Collate function for object detection
    def collate_fn(batch):
        batch = [b for b in batch if b is not None]
        frames = [b[0] for b in batch]
        targets = [b[1] for b in batch]
        return frames, targets

    #Load full dataset (videos + XMLs)
    full_dataset = BaseballVideoDataset(video_folder, xml_folder, frame_skip=1)

    #print a quick summary
    print(f"\nDataset contains {len(full_dataset)} annotated frames across videos.")

    #Split train/test
    n = len(full_dataset)
    split = int(0.8 * n)
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [split, n - split])

    #Train the model
    trained_model = train_detector(
        train_dataset,
        val_dataset,
        num_classes=3,
        epochs=5,
        lr=1e-4,
        batch_size=4)

    #Save trained model
    #torch.save(trained_model.state_dict(), "fasterrcnn_moving_detector.pth")
    #print("\nModel saved as 'fasterrcnn_moving_detector.pth'.")


 Found 47 videos with matching XMLs in C:\Users\Glen\Documents\School\BusForecasting\Final Project Folder\Raw Videos

Preloading videos and indexing frames...

Loading video: dusty_1.mov
Loaded 76 frames -> kept 76 after skipping

   [1/47] Loaded dusty_1.mov (76 frames, 76 annotated)

Loading video: IMG_7917_dusty.mov
Loaded 60 frames -> kept 60 after skipping

   [2/47] Loaded IMG_7917_dusty.mov (60 frames, 60 annotated)

Loading video: IMG_7918_dusty.mov
Loaded 76 frames -> kept 76 after skipping

   [3/47] Loaded IMG_7918_dusty.mov (76 frames, 76 annotated)

Loading video: IMG_7919_dusty.mov
Loaded 50 frames -> kept 50 after skipping

   [4/47] Loaded IMG_7919_dusty.mov (50 frames, 50 annotated)

Loading video: IMG_7942_dusty.mov
Loaded 57 frames -> kept 57 after skipping

   [5/47] Loaded IMG_7942_dusty.mov (57 frames, 57 annotated)

Loading video: IMG_7943_khem.mov
Loaded 51 frames -> kept 51 after skipping

Frame 51 out of range for IMG_7943_khem.mov (video has 51 frames) — ski




Epoch 1/5
----------------------------
Batch 0/382 | Loss: 68.7677
Batch 5/382 | Loss: 36.7203
Batch 10/382 | Loss: 33.4778
Batch 15/382 | Loss: 35.0697
Batch 20/382 | Loss: 29.4796
Batch 25/382 | Loss: 29.9194
Batch 30/382 | Loss: 22.6410
Batch 35/382 | Loss: 24.2589
Batch 40/382 | Loss: 26.1701
Batch 45/382 | Loss: 48.6600
Batch 50/382 | Loss: 42.5242
Batch 55/382 | Loss: 33.1072
Batch 60/382 | Loss: 29.1607
Batch 65/382 | Loss: 23.9734
Batch 70/382 | Loss: 34.9886
Batch 75/382 | Loss: 37.9772
Batch 80/382 | Loss: 26.1650
Batch 85/382 | Loss: 35.5376
Batch 90/382 | Loss: 17.6388
Batch 95/382 | Loss: 22.9503
Batch 100/382 | Loss: 29.1248
Batch 105/382 | Loss: 29.8221
Batch 110/382 | Loss: 31.2881
Batch 115/382 | Loss: 16.8766
Batch 120/382 | Loss: 28.4048
Batch 125/382 | Loss: 27.9400
Batch 130/382 | Loss: 36.5510
Batch 135/382 | Loss: 27.2610
Batch 140/382 | Loss: 27.2815
Batch 145/382 | Loss: 24.9686
Batch 150/382 | Loss: 23.9547
Batch 155/382 | Loss: 20.9938
Batch 160/382 | Loss: 

In [None]:
#link to .pth file in one drive, was too large to upload to github
#https://uofnebraska-my.sharepoint.com/:u:/g/personal/51628718_nebraska_edu/EeMMKq6VaAdLqhyOtLBWLckB7RRR-rMLfBkvvxsgIL99PA?e=HWLXBi

#import script for trained model
def load_trained_model(weights_path, num_classes=3, device=None):

    # Select device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading model on device: {device}")

    # Recreate model architecture
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

    # Load saved weights
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)

    # Move model to device and set eval mode
    model.to(device)
    model.eval()

    print(f"Model loaded successfully from '{weights_path}'")
    return model