In [1]:
!pip install -q torch torchvision torchaudio
!pip install -q opencv-python-headless
!pip install -q pillow
!pip install -q numpy

## Provided function to load the model || Function to process the video and draw boxes around moving baseball

In [2]:
import os
import cv2
import torch
import torchvision
import numpy as np
from torch.utils.data import Dataset, DataLoader
import xml.etree.ElementTree as ET
from collections import defaultdict
from PIL import Image
from google.colab.patches import cv2_imshow   # ← this line enables cv2_imshow in Colab
import torchvision.transforms as T
from google.colab import files

# Provided function to load the model
def load_trained_model(weights_path, num_classes=3, device=None):
    # Select device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading model on device: {device}")

    # Recreate model architecture
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

    # Load saved weights
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)

    # Move model to device and set eval mode
    model.to(device)
    model.eval()

    print(f"Model loaded successfully from '{weights_path}'")
    return model

# Function to process the video and draw boxes around moving baseball
def process_video(video_path, output_path, weights_path="fasterrcnn_moving_detector.pth"):
    # Load the model
    model = load_trained_model(weights_path)

    # Open the input video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        return

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print(f"Original video size: {orig_width}x{orig_height}, FPS: {fps}")

    # Define resized dimensions
    new_width = 1280
    new_height = 720

    # Create output video writer (using mp4v codec for .mp4; change to 'MOV ' if needing .mov)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(output_path, fourcc, fps, (new_width, new_height))

    # Device from model
    device = next(model.parameters()).device

    # Preprocessing transforms
    transform = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        print(f"Processing frame {frame_count}...")

        # Resize the frame
        frame_resized = cv2.resize(frame, (new_width, new_height))

        # Convert to RGB and PIL Image
        rgb_frame = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_frame)

        # Apply transforms and move to device
        image_tensor = transform(pil_image).to(device)

        # Run inference
        with torch.no_grad():
            predictions = model([image_tensor])[0]

        # Extract predictions
        boxes = predictions['boxes'].cpu().numpy()
        labels = predictions['labels'].cpu().numpy()
        scores = predictions['scores'].cpu().numpy()

        # Draw boxes for moving baseball (assume label 2 = moving; adjust if needed)
        for i in range(len(scores)):
            if scores[i] > 0.5 and labels[i] == 2:
                x1, y1, x2, y2 = map(int, boxes[i])
                # Draw red rectangle (BGR color)
                cv2.rectangle(frame_resized, (x1, y1), (x2, y2), (0, 0, 255), thickness=2)

        # Write the annotated frame to output
        writer.write(frame_resized)

    # Release resources
    cap.release()
    writer.release()
    print(f"Processing complete. Output saved to {output_path}")

##process_video_colab (version 2)

In [3]:
def process_video_colab2(
                       video_path,
                       output_path="output_with_boxes_v2.mp4",
                       weights_path="fasterrcnn_moving_detector.pth",
                       general_conf_threshold=0.3,
                       thrown_ball_label=3,
                       thrown_ball_min_score=0.35): #thrown_ball_min_score to make the detection more sensitive.

    # Load model (your existing function)
    model = load_trained_model(weights_path, num_classes=3)
    device = next(model.parameters()).device
    print(f"Model loaded on {device}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    # Get original video dimensions
    orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Use original dimensions for output video
    width  = orig_width
    height = orig_height

    # Video writer to save the result
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    transform = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    print("Starting processing... We will display every frame and print detection info if a thrown ball is found.")

    frame_idx = 0
    found_thrown_ball = False

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_idx += 1
        # Do not resize the frame, use original
        display_frame = frame.copy() # Use original frame for display and writing

        # Prepare input (use original frame for inference as well)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb)
        img_tensor = transform(pil_image).unsqueeze(0).to(device)

        # Inference
        with torch.no_grad():
            pred = model(img_tensor)[0]

        boxes  = pred['boxes'].cpu().numpy()
        labels = pred['labels'].cpu().numpy()
        scores = pred['scores'].cpu().numpy()

        thrown_ball_detections_this_frame = []

        for i, (box, label, score) in enumerate(zip(boxes, labels, scores)):
            # Apply general confidence threshold for drawing all boxes
            if score > general_conf_threshold:
                x1, y1, x2, y2 = map(int, box)

                color = (0, 255, 0) # Default green for other detections
                label_text = f"Class {label} ({score:.2f})"

                if label == thrown_ball_label:
                    color = (0, 0, 255) # Red for thrown ball
                    if score > thrown_ball_min_score:
                        thrown_ball_detections_this_frame.append((box, score))
                        found_thrown_ball = True

                # Draw rectangle and put text for all confident detections
                cv2.rectangle(display_frame, (x1, y1), (x2, y2), color, 3)
                cv2.putText(display_frame, label_text, (x1, y1-8),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)

        # Display every frame
        cv2_imshow(display_frame)

        # Print detection info only if a thrown ball is found in this frame
        if thrown_ball_detections_this_frame:
            # Get max score for the thrown ball label in this frame
            max_thrown_ball_score = max([s for _, s in thrown_ball_detections_this_frame] or [0])
            print(f"Frame {frame_idx:4d} → Thrown ball (Class {thrown_ball_label}) detected with confidence: {max_thrown_ball_score:.3f}")
            print("-" * 60)

        # Always write frame to output video
        writer.write(display_frame)

    cap.release()
    writer.release()
    print(f"\nFinished! Video saved as → {output_path}")
    if found_thrown_ball:
        print("Thrown ball (Class 2) successfully detected in some frames!")
    else:
        print("Warning: No confident thrown ball (Class 2) found with the given thresholds. Try lowering `thrown_ball_min_score` or check if `thrown_ball_label` is correct.")

In [6]:
# Example call for the new function
process_video_colab2(
    video_path="IMG_000.mov",
    output_path="IMG.Box.mp4",
    general_conf_threshold=0.15,  # General threshold for any detection
    thrown_ball_label=2,          # class for the 'thrown ball'
    thrown_ball_min_score=0.15    # Specific confidence for the thrown ball
)

Loading model on device: cpu




Model loaded successfully from 'fasterrcnn_moving_detector.pth'
Model loaded on cpu
Error opening video
