# Preliminary


Here we install the required packages and import the modules used by the rest of the notebook.


## Imports


In [37]:
!pip install ultralytics
!pip install imageio
!pip install numpy
!pip install opencv-python
!pip install tqdm

import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.2.80 🚀 Python-3.10.1 torch-2.4.0 CPU (Apple M1)
Setup complete ✅ (8 CPUs, 16.0 GB RAM, 351.4/460.4 GB disk)


In [38]:
import os
import cv2
import glob
import numpy as np
import xml.etree.ElementTree as ET

from IPython import display
display.clear_output()

from ultralytics import YOLO
from IPython.display import display, Image
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [39]:
# Check which GPU is available.
# nvidia-smi is a shell tool; on a host where it is not installed the shell only
# reports "command not found" (as in the recorded output below) and the notebook
# carries on with the next cell.
!nvidia-smi

zsh:1: command not found: nvidia-smi


## Directories


In [40]:
# Configuration flags
SAVE_ORIGINAL = False  # When True, process_frame also writes each resized frame as a PNG
# RELEASE selects the branch taken by the entry-point cell:
#   True  -> only the first video is processed, sequentially
#   False -> all videos are submitted to a ThreadPoolExecutor
RELEASE = True

# Base directory setup.
# The project root can be overridden with the FISHCLEF_BASE_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original location is used.
BASE_DIR = Path(
    os.environ.get("FISHCLEF_BASE_DIR", "/Users/jan/Documents/code/cv/project")
)

# Training set directories
TRAIN_VIDEO_DIR = BASE_DIR / "data/fishclef_2015_release/training_set/videos"  # input videos
TRAIN_GT_DIR = BASE_DIR / "data/fishclef_2015_release/training_set/gt"  # ground-truth XML files
TRAIN_IMG_DIR = BASE_DIR / "train_img/"  # original frames (written only if SAVE_ORIGINAL)
TRAIN_GMM_DIR = BASE_DIR / "train_gmm/"
TRAIN_OPTICAL_DIR = BASE_DIR / "train_optical/"
TRAIN_GMM_OPTICAL_DIR = BASE_DIR / "train_gmm_optical/"  # combined images and their label files

# Test set directories (same layout as the training set)
TEST_VIDEO_DIR = BASE_DIR / "data/fishclef_2015_release/test_set/videos"
TEST_GT_DIR = BASE_DIR / "data/fishclef_2015_release/test_set/gt"
TEST_IMG_DIR = BASE_DIR / "test_img/"
TEST_GMM_DIR = BASE_DIR / "test_gmm/"
TEST_OPTICAL_DIR = BASE_DIR / "test_optical/"
TEST_GMM_OPTICAL_DIR = BASE_DIR / "test_gmm_optical/"

# Species names. get_annotation writes the position of a name in this list as
# the YOLO class index, so the order of the entries must not change.
# NOTE(review): a few spellings look unusual; presumably they mirror the labels
# used in the ground-truth XML - confirm before "correcting" them.
SPECIES_LIST = [
    "abudefduf vaigiensis",
    "acanthurus nigrofuscus",
    "amphiprion clarkii",
    "chaetodon lununatus",
    "chaetodon speculum",
    "chaetodon trifascialis",
    "chromis chrysura",
    "dascyllus aruanus",
    "dascyllus reticulatus",
    "hemigumnus malapterus",
    "myripristis kuntee",
    "neoglyphidodon nigroris",
    "pempheris vanicolensis",
    "plectrogly-phidodon dickii",
    "zebrasoma scopas",
]

# Class index used for any species that is not in SPECIES_LIST: the first index
# after the last known species (15 for the list above).
UNKNOWN_LABEL = len(SPECIES_LIST)

# Keyword arguments unpacked into cv2.createBackgroundSubtractorMOG2
FOREGROUND_DETECTOR_PARAMS = dict(history=250, varThreshold=16, detectShadows=True)

# Blob analysis parameters (not referenced by the cells visible in this notebook)
BLOB_ANALYSIS_PARAMS = dict(min_area=200)

# Elliptical kernels for the morphological clean-up in apply_gmm:
# 3x3 for the opening step, 5x5 for the closing step.
STRUCTURING_ELEMENT_OPEN, STRUCTURING_ELEMENT_CLOSE = (
    cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size)) for size in (3, 5)
)

# Target size handed to cv2.resize, and the gamma applied to each resized frame
FRAME_RESIZE = (640, 640)
FRAME_ADJUST_GAMMA = 1.5

# Keyword arguments unpacked into cv2.calcOpticalFlowFarneback
FARNEBACK_PARAMS = dict(
    pyr_scale=0.5,
    levels=3,
    winsize=15,
    iterations=3,
    poly_n=5,
    poly_sigma=1.2,
    flags=0,
)

---


# Create training data


In [42]:
def adjust_gamma(image, gamma=1.0):
    """
    Adjusts the gamma of an image through a 256-entry lookup table.

    Every intensity level v in 0..255 is mapped to
    round(((v / 255) ** (1 / gamma)) * 255), so gamma > 1 brightens the image
    and gamma < 1 darkens it.

    Args:
        image (np.ndarray): Input image. Assumes 8-bit data, since the lookup
            table has exactly 256 entries - TODO confirm against callers.
        gamma (float): Gamma value to adjust (default is 1.0). Must be positive.

    Returns:
        np.ndarray: Gamma adjusted image.

    Raises:
        ValueError: If gamma is not strictly positive.
    """
    if gamma <= 0:
        # 1 / gamma is undefined for 0 and meaningless for negative values.
        raise ValueError(f"gamma must be positive, got {gamma}")

    inv_gamma = 1.0 / gamma
    levels = np.arange(256, dtype=np.float64) / 255.0
    # Round to the nearest level before casting: a bare cast to uint8 truncates,
    # which biases every entry downwards and can lose a whole level when the
    # floating-point result lands just below an integer.
    table = np.clip(np.rint(levels**inv_gamma * 255.0), 0, 255).astype(np.uint8)
    return cv2.LUT(image, table)

In [None]:
def get_annotation(
    name,
    annotation_file_path,
    bboxes,
    image_width,
    image_height,
    species_key="fish_species",
):
    """
    Generates YOLO format annotations for bounding boxes and saves them to files.

    The boxes are grouped by their "frame_id" and one text file per frame is
    written to ``annotation_file_path / f"{name}_{frame_id:04d}.txt"``. Each
    line has the form ``<class> <x_center> <y_center> <width> <height>`` with
    the four geometry values divided by the given image dimensions.

    Args:
        name (str): Name prefix for saved annotation files.
        annotation_file_path (Path): Directory where annotation files will be saved.
        bboxes (list): Bounding box dictionaries with the keys "frame_id", "x",
            "y", "w", "h" and the species under ``species_key``. Missing
            geometry keys default to 0.
        image_width (int): Width of the image the pixel coordinates refer to.
        image_height (int): Height of the image the pixel coordinates refer to.
        species_key (str): Key for accessing species name in bbox dictionary
            (default is 'fish_species').

    Raises:
        ValueError: If image_width or image_height is not positive.
        KeyError: If a bounding box has no "frame_id".
    """
    if image_width <= 0 or image_height <= 0:
        raise ValueError(
            f"image dimensions must be positive, got {image_width}x{image_height}"
        )

    # Name -> class index; setdefault keeps the first position of a name, which
    # is what SPECIES_LIST.index() would return.
    species_to_index = {}
    for index, species_name in enumerate(SPECIES_LIST):
        species_to_index.setdefault(species_name, index)

    # Group the boxes per frame so that each frame ends up in its own file.
    boxes_by_frame = {}
    for bbox in bboxes:
        boxes_by_frame.setdefault(bbox["frame_id"], []).append(bbox)

    for frame_id, frame_boxes in boxes_by_frame.items():
        annotations = []
        for fish in frame_boxes:
            # The species may be missing or stored as None (e.g. when the XML
            # attribute was absent); both cases fall back to the unknown label
            # instead of failing on None.lower().
            fish_species = (fish.get(species_key) or "").strip().lower()
            species_index = species_to_index.get(fish_species, UNKNOWN_LABEL)

            x = fish.get("x", 0)
            y = fish.get("y", 0)
            box_width = fish.get("w", 0)
            box_height = fish.get("h", 0)

            # YOLO stores the box centre and size relative to the image size.
            # The result is only meaningful if image_width/image_height are the
            # dimensions of the image the pixel coordinates were measured in.
            x_center = (x + box_width / 2.0) / image_width
            y_center = (y + box_height / 2.0) / image_height
            rel_width = box_width / image_width
            rel_height = box_height / image_height

            annotations.append(
                f"{species_index} {x_center:.6f} {y_center:.6f} {rel_width:.6f} {rel_height:.6f}"
            )

        frame_annotation_file = annotation_file_path / f"{name}_{frame_id:04d}.txt"
        with open(frame_annotation_file, "w") as file:
            file.write("\n".join(annotations))

In [46]:
def extract_ground_truth(video_path, gt_dir=None):
    """
    Extracts ground truth annotations from the corresponding XML file.

    The XML file is looked up as ``<gt_dir>/<video stem>.xml``. Every
    ``<object>`` inside a ``<frame>`` element directly below the root becomes
    one dictionary. Objects whose frame id or geometry is missing or not an
    integer are skipped and counted in a printed summary, so one bad entry does
    not abort the whole video.

    Args:
        video_path (Path): Path to the video file.
        gt_dir (Path, optional): Directory holding the ground-truth XML files.
            Defaults to TRAIN_GT_DIR.

    Returns:
        list: List of ground truth bounding boxes extracted from XML, each a
        dict with the keys "frame_id", "fish_species", "x", "y", "w", "h".
        "fish_species" is None when the attribute is absent. An empty list is
        returned when the XML file is missing or cannot be parsed.
    """
    gt_root = TRAIN_GT_DIR if gt_dir is None else Path(gt_dir)
    gt_xml_path = gt_root / f"{Path(video_path).stem}.xml"

    if not gt_xml_path.exists():
        print(f"Ground truth XML not found: {gt_xml_path}")
        return []

    try:
        root = ET.parse(gt_xml_path).getroot()
    except ET.ParseError as exc:
        # Treat an unreadable file like a missing one: report it and go on.
        print(f"Ground truth XML could not be parsed: {gt_xml_path} ({exc})")
        return []

    ground_truth = []
    skipped = 0
    for frame in root.findall("frame"):
        objects = frame.findall("object")
        try:
            frame_id = int(frame.get("id"))
        except (TypeError, ValueError):
            # Without a usable frame id none of the objects can be assigned.
            skipped += len(objects)
            continue

        for obj in objects:
            try:
                geometry = {key: int(obj.get(key)) for key in ("x", "y", "w", "h")}
            except (TypeError, ValueError):
                skipped += 1
                continue
            ground_truth.append(
                {
                    "frame_id": frame_id,
                    "fish_species": obj.get("fish_species"),
                    **geometry,
                }
            )

    if skipped:
        print(f"Skipped {skipped} malformed object(s) in {gt_xml_path}")

    return ground_truth

In [47]:
def apply_gmm(frame, foreground_detector):
    """
    Runs the GMM background subtractor on a frame and cleans up the mask.

    The raw mask returned by the detector is first opened with
    STRUCTURING_ELEMENT_OPEN and then closed with STRUCTURING_ELEMENT_CLOSE.

    Args:
        frame (np.ndarray): Input frame.
        foreground_detector (cv2.BackgroundSubtractorMOG2): Foreground detector;
            its apply() method is called with the frame.

    Returns:
        np.ndarray: Filtered foreground mask.
    """
    mask = foreground_detector.apply(frame)

    # Opening first, closing second - the order matters for the result.
    morphology_steps = (
        (cv2.MORPH_OPEN, STRUCTURING_ELEMENT_OPEN),
        (cv2.MORPH_CLOSE, STRUCTURING_ELEMENT_CLOSE),
    )
    for operation, kernel in morphology_steps:
        mask = cv2.morphologyEx(mask, operation, kernel)

    return mask

In [48]:
def apply_optical_flow(frame, prvs, hsv):
    """
    Computes Farneback optical flow from ``prvs`` to ``frame`` and renders it
    as a colour image via HSV.

    The flow direction is written to the hue channel and the min-max normalised
    flow magnitude to the value channel of ``hsv``. ``hsv`` is modified in
    place; its saturation channel is left untouched.

    Args:
        frame (np.ndarray): Input frame (BGR); converted to grayscale here.
        prvs (np.ndarray): Previous frame in grayscale.
        hsv (np.ndarray): HSV image used for optical flow visualization.

    Returns:
        tuple: (flow image in BGR resized to FRAME_RESIZE, grayscale version of
        ``frame`` to be used as ``prvs`` for the following call).
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    flow = cv2.calcOpticalFlowFarneback(prvs, gray, None, **FARNEBACK_PARAMS)

    # Split the flow field into its two components and convert to polar form.
    flow_x, flow_y = flow[..., 0], flow[..., 1]
    magnitude, angle = cv2.cartToPolar(flow_x, flow_y)

    # Hue: the angle scaled by 180 / pi and halved.
    hsv[..., 0] = angle * 180 / np.pi / 2
    # Value: magnitude stretched to the range 0..255.
    hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)

    flow_bgr = cv2.resize(cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR), FRAME_RESIZE)
    return flow_bgr, gray

In [49]:
def apply_combination(
    frame, frame_idx, filtered_foreground, bgr_resized, gt_bboxes, combined_dir
):
    """
    Combines the results of GMM and optical flow, and saves the combined image and annotations.

    The combined image has the same shape as ``frame``: channel 1 holds the GMM
    foreground mask, channel 2 holds channel 0 of the optical flow image and
    channel 0 stays zero. It is written to
    ``combined_dir / f"combined_img_{frame_idx:04d}.jpg"``.

    Only the ground-truth boxes whose "frame_id" equals ``frame_idx`` are passed
    to get_annotation, so each call writes at most the label file that belongs
    to the image saved here (instead of rewriting the label files of the whole
    video on every frame).

    Args:
        frame (np.ndarray): Original frame.
        frame_idx (int): Frame index.
        filtered_foreground (np.ndarray): Foreground mask obtained from GMM.
        bgr_resized (np.ndarray): Optical flow visualization in BGR format.
        gt_bboxes (list): List of ground truth bounding boxes (for the whole
            video or for this frame only).
        combined_dir (Path): Directory to save the combined image and annotations.
    """
    combined_frame = np.zeros_like(frame)
    combined_frame[:, :, 1] = filtered_foreground
    combined_frame[:, :, 2] = bgr_resized[:, :, 0]
    combined_frame_path = combined_dir / f"combined_img_{frame_idx:04d}.jpg"
    cv2.imwrite(str(combined_frame_path), combined_frame)

    if not gt_bboxes:
        return

    # Label files are named after the box's frame_id and images after frame_idx,
    # so a box belongs to this image exactly when the two are equal.
    current_bboxes = [bbox for bbox in gt_bboxes if bbox["frame_id"] == frame_idx]
    if not current_bboxes:
        return

    # The species key is derived from the directory path; "test" is checked last
    # and therefore wins when both words occur in the path.
    # NOTE(review): extract_ground_truth stores the species under "fish_species"
    # only, so with "species_name" every box gets the unknown label - confirm.
    combined_dir_text = str(combined_dir)
    species_key = ""
    if "train" in combined_dir_text:
        species_key = "fish_species"
    if "test" in combined_dir_text:
        species_key = "species_name"

    # NOTE(review): the boxes are normalised by FRAME_RESIZE. That is only right
    # if the ground-truth coordinates refer to the resized frame; presumably
    # they are in source-video pixels - confirm and rescale if so.
    get_annotation(
        "combined_img",
        combined_dir,
        current_bboxes,
        FRAME_RESIZE[0],
        FRAME_RESIZE[1],
        species_key,
    )

In [50]:
def process_frame(
    frame,
    frame1,
    frame_idx,
    gt_bboxes,
    foreground_detector,
    prvs,
    hsv,
    img_dir,
    combined_dir,
    save_original=False,
):
    """
    Runs the per-frame pipeline: optional save of the frame, GMM foreground
    detection, optical flow, and the combined output image with annotations.

    Steps:

    1. If ``save_original`` is set, write ``frame`` to
       ``img_dir / f"img_{frame_idx:04d}.png"``.
    2. Detect the foreground in ``frame`` with apply_gmm.
    3. Compute the optical flow from ``prvs`` to ``frame1`` with apply_optical_flow.
    4. Hand both results to apply_combination, which saves the combined image
       and the annotations in ``combined_dir``.

    Args:
        frame (numpy.ndarray): The frame used for GMM and as the template of the
            combined image (the caller passes the resized, gamma-adjusted frame).
        frame1 (numpy.ndarray): The frame passed to apply_optical_flow; the flow
            is computed between ``prvs`` and the grayscale version of this frame.
        frame_idx (int): The index of the current frame in the video.
        gt_bboxes (list): Ground truth bounding boxes for objects (fish).
        foreground_detector (cv2.BackgroundSubtractor): Foreground detector based on GMM.
        prvs (numpy.ndarray): The previous grayscale frame used for optical flow calculation.
        hsv (numpy.ndarray): The HSV image used for visualizing optical flow
            (modified in place by apply_optical_flow).
        img_dir (Path): Directory to save the original frames.
        combined_dir (Path): Directory to save the combined results of GMM and optical flow.
        save_original (bool, optional): Whether ``frame`` should be saved. Defaults to False.

    Returns:
        numpy.ndarray: The grayscale version of ``frame1``, to be passed as
        ``prvs`` in the next call.
    """
    # Optionally keep a copy of the frame itself.
    if save_original:
        cv2.imwrite(str(img_dir / f"img_{frame_idx:04d}.png"), frame)

    # Foreground mask from the background subtractor.
    foreground_mask = apply_gmm(frame, foreground_detector)

    # Flow visualisation plus the grayscale frame for the next iteration.
    flow_bgr, gray_frame = apply_optical_flow(frame1, prvs, hsv)

    # Merge both results and write image and labels.
    apply_combination(
        frame, frame_idx, foreground_mask, flow_bgr, gt_bboxes, combined_dir
    )

    return gray_frame

In [51]:
def process_video(video_path):
    """
    Processes one training video: every frame is resized, gamma-adjusted and
    run through GMM foreground detection and optical flow, and the combined
    image plus the ground-truth annotations are written to disk.

    Output locations (named after the last 15 characters of the video stem):
        - TRAIN_IMG_DIR / <name>: original frames (only if SAVE_ORIGINAL is set)
        - TRAIN_GMM_OPTICAL_DIR / <name>: combined images and annotation files

    Steps:
        1. Create the output directories.
        2. Extract the ground truth bounding boxes from the corresponding XML file.
        3. Open the video and initialize background subtraction (GMM) and optical flow.
        4. For each frame:
            - Resize to FRAME_RESIZE and adjust gamma with FRAME_ADJUST_GAMMA.
            - Call process_frame with the adjusted frame and the raw frame.
            - Keep the returned grayscale frame as the reference for the next
              optical flow computation.
        5. Release the video capture object, also when the first frame cannot be
           read or a frame fails to process.

    Progress is shown with a tqdm bar; when the container reports no usable
    frame count the bar runs without a total.

    Args:
        video_path (Path): Path to the input video file.

    Returns:
        None: The function processes the video, saves results, and does not return anything.
    """

    video_name_short = video_path.stem[-15:]
    img_dir = TRAIN_IMG_DIR / video_name_short
    combined_dir = TRAIN_GMM_OPTICAL_DIR / video_name_short

    for directory in [img_dir, combined_dir]:
        os.makedirs(directory, exist_ok=True)

    # Extract ground truth bounding boxes from the corresponding XML file
    gt_bboxes = extract_ground_truth(video_path)

    cap = cv2.VideoCapture(str(video_path))
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        foreground_detector = cv2.createBackgroundSubtractorMOG2(
            **FOREGROUND_DETECTOR_PARAMS
        )
        ret, frame1 = cap.read()
        if not ret:
            print(f"Failed to read the video file: {video_path}")
            return

        # The first frame is its own "previous" frame, so the first flow field
        # is computed between two identical images.
        prvs = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
        hsv = np.zeros_like(frame1)
        hsv[..., 1] = 255  # full saturation; hue and value are filled per frame
        frame_idx = 0

        # A non-positive frame count means the total is unknown to the backend.
        progress_total = total_frames if total_frames > 0 else None

        # Process each frame of the video
        with tqdm(
            total=progress_total, desc=f"Processing {video_name_short}"
        ) as video_pbar:
            while ret:
                frame = cv2.resize(frame1, FRAME_RESIZE)
                frame = adjust_gamma(frame, FRAME_ADJUST_GAMMA)

                # Process the current frame; the raw frame1 drives the optical flow
                next_frame = process_frame(
                    frame,
                    frame1,
                    frame_idx,
                    gt_bboxes,
                    foreground_detector,
                    prvs,
                    hsv,
                    img_dir,
                    combined_dir,
                    SAVE_ORIGINAL,
                )

                video_pbar.update(1)
                prvs = next_frame
                ret, frame1 = cap.read()
                frame_idx += 1
    finally:
        # Always free the capture handle, including on early return and errors.
        cap.release()

In [56]:
# Entry point: preprocess the training videos found in TRAIN_VIDEO_DIR.
# The list is sorted so that "the first video" is the same one on every run
# (glob itself does not guarantee an order).
video_files = sorted(
    list(TRAIN_VIDEO_DIR.glob("*.flv")) + list(TRAIN_VIDEO_DIR.glob("*.avi"))
)

if not video_files:
    print(f"No .flv or .avi videos found in {TRAIN_VIDEO_DIR}")

# NOTE(review): with RELEASE set only the first video is processed, while the
# other branch handles all videos - confirm the branches are not swapped.
if RELEASE:
    for video in video_files[:1]:
        process_video(video)
else:
    with ThreadPoolExecutor() as executor:
        # Remember which video each future belongs to so failures can be attributed.
        future_to_video = {
            executor.submit(process_video, video): video for video in video_files
        }

        for future in as_completed(future_to_video):
            try:
                future.result()
            except Exception as exc:
                print(
                    f"An error occurred while processing {future_to_video[future]}: {exc}"
                )

Processing #201108091140_6:   0%|          | 0/300 [00:00<?, ?it/s]


TypeError: argument of type 'PosixPath' is not iterable

---


## Create train.txt and test.txt for YOLO


In [54]:
# Write the paths of all .jpg files below TRAIN_IMG_DIR to train.txt, one per line.
# NOTE(review): process_frame saves the original frames in this tree as .png
# (and only when SAVE_ORIGINAL is set), while the combined .jpg images and
# their label files go to TRAIN_GMM_OPTICAL_DIR - presumably the list should
# be built from that directory; confirm.
output_file = BASE_DIR / "train.txt"

with open(output_file, "w") as f:
    # Visit the directory and all of its subdirectories
    for dirpath, _, filenames in os.walk(TRAIN_IMG_DIR):
        for filename in filenames:
            # Skip everything that is not a JPEG (the check ignores case)
            if not filename.lower().endswith(".jpg"):
                continue
            full_path = os.path.join(dirpath, filename)
            f.write(f"{full_path}\n")

In [None]:
# Write the paths of all .jpg files below TEST_IMG_DIR to test.txt, one per line.
# NOTE(review): no cell visible in this notebook writes into TEST_IMG_DIR -
# confirm where the test images are expected to come from.
output_file = BASE_DIR / "test.txt"

with open(output_file, "w") as f:
    # Visit the directory and all of its subdirectories
    for dirpath, _, filenames in os.walk(TEST_IMG_DIR):
        for filename in filenames:
            # Skip everything that is not a JPEG (the check ignores case)
            if not filename.lower().endswith(".jpg"):
                continue
            full_path = os.path.join(dirpath, filename)
            f.write(f"{full_path}\n")

---


In [None]:
# Download the yolov7.pt checkpoint from the v0.1 release of the YOLOv7 repository.
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt

In [None]:
# Start YOLOv7 training through the repository's train.py
# (image size 640x640, batch size 16, 10 epochs, 8 workers, device 0).
# NOTE(review): all paths are /content/... paths, unlike the BASE_DIR used by
# the preprocessing cells - confirm which environment this cell is meant for.
# NOTE(review): --weights points at ".../CV_Project/weights" and not at the
# yolov7.pt downloaded by the wget cell - presumably a .pt file is expected; verify.
!python "/content/yolov7/train.py" --img 640 640 --batch 16 --epochs 10 --workers 8 --cfg "/content/yolov7/cfg/training/yolov7.yaml" --data "/content/drive/MyDrive/Colab Notebooks/CV_Project/config.yaml" --weights "/content/drive/MyDrive/Colab Notebooks/CV_Project/weights" --device 0