# Download KTH Dataset

In [1]:
# !pip install ultralytics

In [2]:
import os
import subprocess
import glob
import shutil
from ultralytics import YOLO
from tqdm import tqdm
# Load the 'yolov8n.pt' YOLO weights a single time; detect_person() below reads this
# module-level `model` for every frame instead of constructing a new one per call.
model = YOLO('yolov8n.pt')


In [3]:

def process_all_videos(root_folder, output_base):
    """
    Runs the extraction pipeline on every folder under ``root_folder`` that holds videos.

    For each folder containing at least one ``.mp4``/``.avi``/``.mov`` file, the videos
    are sampled with ``frame_rate=1``, frames with a detected person are copied aside,
    and neighbouring person frames are paired. The folder's path relative to
    ``root_folder`` is mirrored under ``frames``, ``person_frames`` and
    ``paired_dataset`` inside ``output_base``.

    Args:
        root_folder (str): Directory tree that is walked recursively for video files.
        output_base (str): Directory that receives the three output trees.
    """
    video_suffixes = ('.mp4', '.avi', '.mov')
    stage_names = ("frames", "person_frames", "paired_dataset")

    for current_dir, _subdirs, filenames in tqdm(os.walk(root_folder)):
        clips = [name for name in filenames if name.endswith(video_suffixes)]
        if clips:
            # Mirror the source folder layout below each stage directory.
            mirrored = os.path.relpath(current_dir, root_folder)
            stage_dirs = [os.path.join(output_base, stage, mirrored) for stage in stage_names]
            for stage_dir in stage_dirs:
                os.makedirs(stage_dir, exist_ok=True)
            frames_dir, person_dir, pairs_dir = stage_dirs

            # Every clip of this folder writes its frames into the same directory.
            for clip in clips:
                extract_frames_ffmpeg(os.path.join(current_dir, clip), frames_dir, frame_rate=1)

            # Keep only frames with a person, then pair what is left.
            detect_person(frames_dir, person_dir)
            create_frame_pairs(person_dir, pairs_dir)

def extract_frames_ffmpeg(video_path, output_folder, frame_rate=1):
    """
    Writes the frames of one video into ``output_folder`` as numbered JPEGs via FFmpeg.

    Output files are named ``<video file name without extension>_%04d.jpg``.

    Args:
        video_path (str): Video file handed to ``ffmpeg -i``.
        output_folder (str): Directory for the frames; created if missing.
        frame_rate: Value used for FFmpeg's ``fps=`` video filter.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    os.makedirs(output_folder, exist_ok=True)

    video_stem, _extension = os.path.splitext(os.path.basename(video_path))
    frame_template = os.path.join(output_folder, video_stem + "_%04d.jpg")

    ffmpeg_cmd = [
        'ffmpeg',
        '-i', video_path,
        '-vf', f'fps={frame_rate}',
        frame_template,
    ]
    subprocess.run(ffmpeg_cmd, check=True)

def detect_person(input_folder, output_folder):
    """
    Copies every frame in which YOLOv8 reports a person into ``output_folder``.

    Only regular files with an image extension are examined. Other directory entries
    (sub-directories such as ``.ipynb_checkpoints``, stray text files, ...) are skipped
    instead of being handed to the detector and to ``shutil.copy``.

    Args:
        input_folder (str): Directory holding the extracted frames.
        output_folder (str): Directory receiving the frames with a person; created if missing.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    os.makedirs(output_folder, exist_ok=True)
    for frame in sorted(os.listdir(input_folder)):
        frame_path = os.path.join(input_folder, frame)
        if not (os.path.isfile(frame_path) and frame.lower().endswith(image_suffixes)):
            continue  # not a frame image

        results = model(frame_path)
        # Class id 0 is treated as "person"; one hit in any result is enough.
        if any(0 in result.boxes.cls for result in results):
            shutil.copy(frame_path, os.path.join(output_folder, frame))

def create_frame_pairs(input_folder, output_folder):
    """
    Creates pairs of consecutive frames with persons, never pairing across videos.

    The ``*.jpg`` files of ``input_folder`` are sorted by name and each frame is paired
    with its successor in that order. Frames written by ``extract_frames_ffmpeg`` are
    named ``<video name>_<number>.jpg``; because all videos of a category share one
    folder, two neighbours are paired only when their ``<video name>`` parts agree.
    Files without a numeric suffix are treated as belonging to one common sequence.

    Neighbours in the sorted list are paired even if frames between them were filtered
    out earlier. Pair folders already present in ``output_folder`` are overwritten by
    index but never removed.

    Args:
        input_folder (str): Directory holding the filtered frames.
        output_folder (str): Directory receiving ``pair_NNNN/image_1.jpg`` and
            ``pair_NNNN/image_2.jpg``; created if missing.
    """
    def video_name(frame_path):
        # "<video name>_<number>.jpg" -> "<video name>"; None when there is no numeric suffix.
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        prefix, separator, counter = stem.rpartition("_")
        return prefix if separator and counter.isdigit() else None

    os.makedirs(output_folder, exist_ok=True)
    # glob.escape keeps folder names containing [, ] , * or ? from being read as patterns.
    frames = sorted(glob.glob(os.path.join(glob.escape(input_folder), "*.jpg")))

    pair_index = 0
    for first_frame, second_frame in zip(frames, frames[1:]):
        if video_name(first_frame) != video_name(second_frame):
            continue  # boundary between two different videos
        pair_folder = os.path.join(output_folder, f"pair_{pair_index:04d}")
        os.makedirs(pair_folder, exist_ok=True)
        shutil.copy(first_frame, os.path.join(pair_folder, 'image_1.jpg'))
        shutil.copy(second_frame, os.path.join(pair_folder, 'image_2.jpg'))
        pair_index += 1


In [4]:
# Source tree with the KTH videos (walked recursively by process_all_videos).
input_video_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset"
# Destination; process_all_videos creates frames/, person_frames/ and paired_dataset/ below it.
output_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2"
# Call the function with your input folder structure
process_all_videos(input_video_folder, output_folder)


0it [00:00, ?it/s]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --en


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Boxing/person01_boxing_d1_uncomp_0001.jpg: 480x640 1 person, 88.9ms
Speed: 4.6ms preprocess, 88.9ms inference, 2.7ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Boxing/person01_boxing_d1_uncomp_0002.jpg: 480x640 1 person, 6.0ms
Speed: 3.0ms preprocess, 6.0ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Boxing/person01_boxing_d1_uncomp_0003.jpg: 480x640 1 person, 5.0ms
Speed: 1.2ms preprocess, 5.0ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Boxing/person01_boxing_d1_uncomp_0004.jpg: 480x640 1 person, 6.0ms
Speed: 1.3ms preprocess, 6.0ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/fra

2it [00:59, 29.70s/it]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandClapping/person01_handclapping_d1_uncomp_0001.jpg: 480x640 1 person, 5.9ms
Speed: 9.1ms preprocess, 5.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandClapping/person01_handclapping_d1_uncomp_0002.jpg: 480x640 1 person, 5.6ms
Speed: 1.3ms preprocess, 5.6ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandClapping/person01_handclapping_d1_uncomp_0003.jpg: 480x640 1 person, 5.7ms
Speed: 1.5ms preprocess, 5.7ms inference, 49.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandClapping/person01_handclapping_d1_uncomp_0004.jpg: 480x640 1 person, 6.1ms
Speed: 1.7ms preprocess, 6.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jo

4it [02:02, 30.84s/it]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0001.jpg: 480x640 1 person, 6.1ms
Speed: 9.1ms preprocess, 6.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0002.jpg: 480x640 1 person, 6.2ms
Speed: 1.4ms preprocess, 6.2ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0003.jpg: 480x640 1 person, 5.8ms
Speed: 1.3ms preprocess, 5.8ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)



ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0004.jpg: 480x640 1 person, 59.4ms
Speed: 1.4ms preprocess, 59.4ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0005.jpg: 480x640 1 person, 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0006.jpg: 480x640 1 person, 5.6ms
Speed: 1.4ms preprocess, 5.6ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/HandWaving/person01_handwaving_d1_uncomp_0007.jpg: 480x640 1 person, 5.5ms
Speed: 1.2ms preprocess, 5.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-stora

5it [02:54, 37.00s/it]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Jogging/person01_jogging_d1_uncomp_0001.jpg: 480x640 1 person, 8.8ms
Speed: 10.6ms preprocess, 8.8ms inference, 1.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Jogging/person01_jogging_d1_uncomp_0002.jpg: 480x640 1 person, 5.4ms
Speed: 1.3ms preprocess, 5.4ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Jogging/person01_jogging_d1_uncomp_0003.jpg: 480x640 (no detections), 5.5ms
Speed: 1.3ms preprocess, 5.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Jogging/person01_jogging_d1_uncomp_0004.jpg: 480x640 (no detections), 5.5ms
Speed: 1.2ms preprocess, 5.5ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_fi

6it [03:40, 39.57s/it]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d1_uncomp_0001.jpg: 480x640 1 person, 5.6ms
Speed: 14.6ms preprocess, 5.6ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d1_uncomp_0002.jpg: 480x640 (no detections), 5.3ms
Speed: 1.5ms preprocess, 5.3ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d1_uncomp_0003.jpg: 480x640 (no detections), 4.9ms
Speed: 1.4ms preprocess, 4.9ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d1_uncomp_0004.jpg: 480x640 (no detections), 5.0ms
Speed: 1.2ms preprocess, 5.0ms inference, 0.4ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/

frame=   19 fps=0.0 q=1.6 Lsize=N/A time=00:00:19.00 bitrate=N/A speed= 237x    
video:87kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d2_uncomp_0005.jpg: 480x640 1 person, 6.5ms
Speed: 1.2ms preprocess, 6.5ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d2_uncomp_0006.jpg: 480x640 (no detections), 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d2_uncomp_0007.jpg: 480x640 (no detections), 4.5ms
Speed: 1.2ms preprocess, 4.5ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Running/person01_running_d2_uncomp_0008.jpg: 480x640 (no detections), 4.4ms
Speed: 1.2ms preprocess, 4.4ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/am

7it [04:24, 41.15s/it]ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis 


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0001.jpg: 480x640 1 person, 5.9ms
Speed: 9.1ms preprocess, 5.9ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0002.jpg: 480x640 1 person, 5.6ms
Speed: 1.3ms preprocess, 5.6ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0003.jpg: 480x640 1 person, 5.1ms
Speed: 1.2ms preprocess, 5.1ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0004.jpg: 480x640 (no detections), 5.3ms
Speed: 1.2ms preprocess, 5.3ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_

frame=   25 fps=0.0 q=1.6 Lsize=N/A time=00:00:25.00 bitrate=N/A speed= 145x    
video:123kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown


image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0021.jpg: 480x640 1 person, 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d1_uncomp_0022.jpg: 480x640 1 person, 5.5ms
Speed: 1.2ms preprocess, 5.5ms inference, 0.7ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d2_uncomp_0001.jpg: 480x640 (no detections), 4.8ms
Speed: 1.2ms preprocess, 4.8ms inference, 0.3ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_Dataset_2/frames/Walking/person01_walking_d2_uncomp_0002.jpg: 480x640 1 person, 5.0ms
Speed: 1.1ms preprocess, 5.0ms inference, 0.8ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /home/jovyan/video-storage/amit_files/KTH_D

8it [05:29, 41.16s/it]


# Download KTH Dataset frames that contain full people in the frame

In [1]:
!pip install ultralytics

Collecting ultralytics
  Obtaining dependency information for ultralytics from https://files.pythonhosted.org/packages/78/62/4e68d12ae5c61d77322dc36ed8100f137ed5d10b3a559a45921f9139f775/ultralytics-8.3.98-py3-none-any.whl.metadata
  Downloading ultralytics-8.3.98-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Obtaining dependency information for ultralytics-thop>=2.0.0 from https://files.pythonhosted.org/packages/a6/10/251f036b4c5d77249f9a119cc89dafe8745dc1ad1f1a5f06b6a3988ca454/ultralytics_thop-2.0.14-py3-none-any.whl.metadata
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.98-py3-none-any.whl (949 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.0/950.0 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ult

In [10]:
import os
import subprocess
import glob
import shutil
from tqdm import tqdm
from ultralytics import YOLO
import cv2

# Initialize YOLO model once; filter_full_person() below reads this module-level `model`.
model = YOLO('yolov8n.pt')

# Category (sub-folder) names whose videos process_all_videos samples at 10 fps;
# every other folder is sampled at 4 fps.
HIGH_FPS_CATEGORIES = {'Jogging', 'Running', 'Walking'}
# NOTE(review): not read by any code visible in this notebook — the 4 fps rate is simply
# the fallback for folders that are not in HIGH_FPS_CATEGORIES.
LOW_FPS_CATEGORIES = {'Boxing', 'HandClapping', 'HandWaving'}


In [11]:

def process_all_videos(root_folder, output_base):
    """
    Runs the full-person pipeline on every folder under ``root_folder`` that holds videos.

    The frame rate depends on the folder name: folders listed in
    ``HIGH_FPS_CATEGORIES`` are sampled at 10 fps, all others at 4 fps. Extracted
    frames are filtered with ``filter_full_person`` and the survivors are paired.
    The folder's path relative to ``root_folder`` is mirrored under ``frames``,
    ``person_frames`` and ``paired_dataset`` inside ``output_base``.

    Args:
        root_folder (str): Directory tree that is walked recursively for video files.
        output_base (str): Directory that receives the three output trees.
    """
    video_suffixes = ('.mp4', '.avi', '.mov')

    for current_dir, _subdirs, filenames in tqdm(os.walk(root_folder)):
        clips = [name for name in filenames if name.endswith(video_suffixes)]
        if len(clips) == 0:
            continue  # nothing to extract from this folder

        # The category is the name of the folder that directly contains the videos.
        category = os.path.basename(current_dir)
        if category in HIGH_FPS_CATEGORIES:
            frame_rate = 10
        else:
            frame_rate = 4

        mirrored = os.path.relpath(current_dir, root_folder)
        frames_dir = os.path.join(output_base, "frames", mirrored)
        person_dir = os.path.join(output_base, "person_frames", mirrored)
        pairs_dir = os.path.join(output_base, "paired_dataset", mirrored)
        for stage_dir in (frames_dir, person_dir, pairs_dir):
            os.makedirs(stage_dir, exist_ok=True)

        # Every clip of this folder writes its frames into the same directory.
        for clip in clips:
            extract_frames_ffmpeg(os.path.join(current_dir, clip), frames_dir, frame_rate)

        filter_full_person(frames_dir, person_dir)
        create_frame_pairs(person_dir, pairs_dir)

def extract_frames_ffmpeg(video_path, output_folder, frame_rate):
    """
    Writes the frames of one video into ``output_folder`` as numbered JPEGs via FFmpeg.

    Output files are named ``<video file name without extension>_%04d.jpg``.

    Args:
        video_path (str): Video file handed to ``ffmpeg -i``.
        output_folder (str): Directory for the frames; created if missing.
        frame_rate: Value used for FFmpeg's ``fps=`` video filter.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    os.makedirs(output_folder, exist_ok=True)

    clip_name = os.path.basename(video_path)
    clip_stem = os.path.splitext(clip_name)[0]
    frame_template = os.path.join(output_folder, clip_stem + "_%04d.jpg")

    fps_filter = f'fps={frame_rate}'
    subprocess.run(['ffmpeg', '-i', video_path, '-vf', fps_filter, frame_template], check=True)

def filter_full_person(input_folder, output_folder, edge_threshold=0.05, min_height_ratio=0.4):
    """
    Copies frames that show at least one fully visible person, judged from YOLOv8 boxes.

    A frame is kept when some detected box of class 0 (person) stays clear of all four
    image borders by ``edge_threshold`` and is at least ``min_height_ratio`` of the
    image height. Directory entries that are not regular files, or that OpenCV cannot
    decode, are skipped instead of raising.

    Args:
        input_folder (str): Path to extracted frames.
        output_folder (str): Path to store frames with full persons; created if missing.
        edge_threshold (float): Fraction of the image width/height, measured from each
            border, inside which a box edge counts as "too close" (person likely cropped).
        min_height_ratio (float): Minimum box height as a fraction of the image height.
    """
    os.makedirs(output_folder, exist_ok=True)

    for frame in sorted(os.listdir(input_folder)):
        frame_path = os.path.join(input_folder, frame)
        if not os.path.isfile(frame_path):
            continue  # sub-directory or other non-file entry

        img = cv2.imread(frame_path)
        if img is None:
            continue  # cv2.imread returns None when the file cannot be decoded
        img_height, img_width = img.shape[:2]

        results = model(frame_path)

        keep_frame = False
        for result in results:
            for box in result.boxes:
                if int(box.cls[0]) != 0:  # Class 0 = Person
                    continue
                x_min, y_min, x_max, y_max = map(int, box.xyxy[0])

                # A box reaching into the border margin suggests the person is cut off;
                # skip this box only, since another person in the frame may still qualify.
                near_edge = (
                    x_min < edge_threshold * img_width
                    or x_max > (1 - edge_threshold) * img_width
                    or y_min < edge_threshold * img_height
                    or y_max > (1 - edge_threshold) * img_height
                )
                if near_edge:
                    continue

                # Ensure the person is at least 'min_height_ratio' of the image height.
                if (y_max - y_min) / img_height >= min_height_ratio:
                    keep_frame = True
                    break  # one valid detection is enough for this frame
            if keep_frame:
                break

        if keep_frame:
            shutil.copy(frame_path, os.path.join(output_folder, frame))

def create_frame_pairs(input_folder, output_folder):
    """
    Creates pairs of consecutive frames with full persons, never pairing across videos.

    The ``*.jpg`` files of ``input_folder`` are sorted by name and each frame is paired
    with its successor in that order. Frames written by ``extract_frames_ffmpeg`` are
    named ``<video name>_<number>.jpg``; because all videos of a category share one
    folder, two neighbours are paired only when their ``<video name>`` parts agree.
    Files without a numeric suffix are treated as belonging to one common sequence.

    Neighbours in the sorted list are paired even if frames between them were filtered
    out earlier. Pair folders already present in ``output_folder`` are overwritten by
    index but never removed.

    Args:
        input_folder (str): Directory holding the filtered frames.
        output_folder (str): Directory receiving ``pair_NNNN/image_1.jpg`` and
            ``pair_NNNN/image_2.jpg``; created if missing.
    """
    def video_name(frame_path):
        # "<video name>_<number>.jpg" -> "<video name>"; None when there is no numeric suffix.
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        prefix, separator, counter = stem.rpartition("_")
        return prefix if separator and counter.isdigit() else None

    os.makedirs(output_folder, exist_ok=True)
    # glob.escape keeps folder names containing [, ] , * or ? from being read as patterns.
    frames = sorted(glob.glob(os.path.join(glob.escape(input_folder), "*.jpg")))

    pair_index = 0
    for first_frame, second_frame in zip(frames, frames[1:]):
        if video_name(first_frame) != video_name(second_frame):
            continue  # boundary between two different videos
        pair_folder = os.path.join(output_folder, f"pair_{pair_index:04d}")
        os.makedirs(pair_folder, exist_ok=True)
        shutil.copy(first_frame, os.path.join(pair_folder, 'image_1.jpg'))
        shutil.copy(second_frame, os.path.join(pair_folder, 'image_2.jpg'))
        pair_index += 1


In [None]:
# Source tree with the KTH videos (walked recursively by process_all_videos).
input_video_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset"
# NOTE(review): this is the same output folder the earlier run in this notebook wrote to.
# Nothing in the pipeline clears it, and the filter step lists everything found in
# frames/<category>, so frames left over from a previous run are processed again.
output_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2"
# Call the function with your input folder structure
process_all_videos(input_video_folder, output_folder)

# Build the KTH Dataset pairs, taking into account the direction each person is facing


In [7]:
import os
import shutil
import glob
from tqdm import tqdm
from ultralytics import YOLO
import cv2
import numpy as np

# Load the YOLOv8-Pose model once; the functions below read this module-level `pose_model`
# both to keep frames with a person and to read shoulder keypoints for the facing direction.
pose_model = YOLO('yolov8n-pose.pt')


In [8]:

def process_all_videos(root_folder, output_base):
    """
    Runs the direction-aware pipeline on every folder under ``root_folder`` that holds videos.

    Per folder: extract frames with ``frame_rate=1``, keep frames accepted by
    ``filter_full_person``, pair neighbouring frames, then delete pairs whose two images
    are estimated to face opposite directions. The folder's path relative to
    ``root_folder`` is mirrored under ``frames``, ``person_frames`` and
    ``paired_dataset`` inside ``output_base``.

    Args:
        root_folder (str): Directory tree that is walked recursively for video files.
        output_base (str): Directory that receives the three output trees.
    """
    def stage_dir(stage, mirrored):
        # Build and create <output_base>/<stage>/<mirrored>.
        target = os.path.join(output_base, stage, mirrored)
        os.makedirs(target, exist_ok=True)
        return target

    for current_dir, _subdirs, filenames in tqdm(os.walk(root_folder)):
        clips = [name for name in filenames if name.endswith(('.mp4', '.avi', '.mov'))]
        if not clips:
            continue  # folder without videos

        mirrored = os.path.relpath(current_dir, root_folder)
        frames_dir = stage_dir("frames", mirrored)
        person_dir = stage_dir("person_frames", mirrored)
        pairs_dir = stage_dir("paired_dataset", mirrored)

        # Every clip of this folder writes its frames into the same directory.
        for clip in clips:
            extract_frames_ffmpeg(os.path.join(current_dir, clip), frames_dir, frame_rate=1)

        filter_full_person(frames_dir, person_dir)
        create_frame_pairs(person_dir, pairs_dir)

        # Last step: drop pairs whose two images disagree on the facing direction.
        remove_opposite_facing_pairs(pairs_dir)

def extract_frames_ffmpeg(video_path, output_folder, frame_rate=1):
    """
    Extracts frames from a video using FFmpeg.

    Output files are named ``<video file name without extension>_%04d.jpg`` inside
    ``output_folder``.

    Args:
        video_path (str): Video file handed to ``ffmpeg -i``.
        output_folder (str): Directory for the frames; created if missing.
        frame_rate: Value used for FFmpeg's ``fps=`` video filter.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Imported here because the import cell of this section does not import subprocess;
    # without it the call below fails with NameError on a fresh kernel.
    import subprocess

    os.makedirs(output_folder, exist_ok=True)
    output_pattern = os.path.join(output_folder, os.path.splitext(os.path.basename(video_path))[0] + "_%04d.jpg")
    command = ['ffmpeg', '-i', video_path, '-vf', f'fps={frame_rate}', output_pattern]
    subprocess.run(command, check=True)

def filter_full_person(input_folder, output_folder):
    """
    Copies every frame in which the pose model reports a person into ``output_folder``.

    Despite the name, no check is made that the person is fully inside the image: a
    frame is kept as soon as one detection of class 0 is present. Only regular files
    with an image extension are examined; other directory entries are skipped instead
    of being handed to the model and to ``shutil.copy``.

    Args:
        input_folder (str): Directory holding the extracted frames.
        output_folder (str): Directory receiving the kept frames; created if missing.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    os.makedirs(output_folder, exist_ok=True)
    for frame in sorted(os.listdir(input_folder)):
        frame_path = os.path.join(input_folder, frame)
        if not (os.path.isfile(frame_path) and frame.lower().endswith(image_suffixes)):
            continue  # not a frame image

        results = pose_model(frame_path)
        # One class-0 detection in any result is enough to keep the frame.
        if any(0 in result.boxes.cls for result in results):
            shutil.copy(frame_path, os.path.join(output_folder, frame))

def create_frame_pairs(input_folder, output_folder):
    """
    Creates pairs of consecutive frames with persons, never pairing across videos.

    The ``*.jpg`` files of ``input_folder`` are sorted by name and each frame is paired
    with its successor in that order. Frames written by ``extract_frames_ffmpeg`` are
    named ``<video name>_<number>.jpg``; because all videos of a category share one
    folder, two neighbours are paired only when their ``<video name>`` parts agree.
    Files without a numeric suffix are treated as belonging to one common sequence.

    Neighbours in the sorted list are paired even if frames between them were filtered
    out earlier. Pair folders already present in ``output_folder`` are overwritten by
    index but never removed.

    Args:
        input_folder (str): Directory holding the filtered frames.
        output_folder (str): Directory receiving ``pair_NNNN/image_1.jpg`` and
            ``pair_NNNN/image_2.jpg``; created if missing.
    """
    def video_name(frame_path):
        # "<video name>_<number>.jpg" -> "<video name>"; None when there is no numeric suffix.
        stem = os.path.splitext(os.path.basename(frame_path))[0]
        prefix, separator, counter = stem.rpartition("_")
        return prefix if separator and counter.isdigit() else None

    os.makedirs(output_folder, exist_ok=True)
    # glob.escape keeps folder names containing [, ] , * or ? from being read as patterns.
    frames = sorted(glob.glob(os.path.join(glob.escape(input_folder), "*.jpg")))

    pair_index = 0
    for first_frame, second_frame in zip(frames, frames[1:]):
        if video_name(first_frame) != video_name(second_frame):
            continue  # boundary between two different videos
        pair_folder = os.path.join(output_folder, f"pair_{pair_index:04d}")
        os.makedirs(pair_folder, exist_ok=True)
        shutil.copy(first_frame, os.path.join(pair_folder, 'image_1.jpg'))
        shutil.copy(second_frame, os.path.join(pair_folder, 'image_2.jpg'))
        pair_index += 1

def estimate_facing_direction(image_path):
    """
    Estimates the facing direction of the first detected person from shoulder keypoints.

    The label is a heuristic: it only compares the x coordinates of keypoints 5 and 6
    (read here as left and right shoulder) and is used by the caller to compare the
    two images of a pair against each other.

    Args:
        image_path (str): Image passed to the pose model.

    Returns:
        str | None: ``"right"`` when the left shoulder lies further right in the image
        than the right shoulder, ``"left"`` otherwise, and ``None`` when no person or
        no usable shoulder keypoints are available.
    """
    left_idx, right_idx = 5, 6

    def is_missing(point):
        # NOTE(review): Ultralytics appears to report keypoints it considers not
        # visible at (0, 0) — confirm against the installed version.
        return float(point[0]) == 0.0 and float(point[1]) == 0.0

    results = pose_model(image_path)
    for result in results:
        # Without this guard an image with no detection raised IndexError below
        # instead of reaching the final `return None`.
        if result.keypoints is None or len(result.boxes) == 0:
            continue

        keypoints = result.keypoints.xy[0]  # keypoints of the first detected person
        if len(keypoints) <= right_idx:
            continue
        left_shoulder, right_shoulder = keypoints[left_idx], keypoints[right_idx]
        if is_missing(left_shoulder) or is_missing(right_shoulder):
            return None  # direction cannot be judged from undetected shoulders

        # Calculate relative position
        if left_shoulder[0] > right_shoulder[0]:  # Left shoulder is to the right
            return "right"
        return "left"
    return None  # No person detected

def remove_opposite_facing_pairs(pair_folder):
    """
    Deletes pair directories whose two images are estimated to face different ways.

    Each entry of ``pair_folder`` is expected to hold ``image_1.jpg`` and
    ``image_2.jpg``; entries missing either file are left untouched. A pair is removed
    only when a direction was obtained for both images and the two directions differ.

    Args:
        pair_folder (str): Directory containing the ``pair_NNNN`` sub-directories.
    """
    for pair_name in sorted(os.listdir(pair_folder)):
        pair_dir = os.path.join(pair_folder, pair_name)
        first_image = os.path.join(pair_dir, 'image_1.jpg')
        second_image = os.path.join(pair_dir, 'image_2.jpg')

        both_present = os.path.exists(first_image) and os.path.exists(second_image)
        if not both_present:
            continue  # incomplete pair (or not a pair directory at all)

        facing = [estimate_facing_direction(first_image), estimate_facing_direction(second_image)]
        # Unknown directions (None) never cause a deletion.
        if all(facing) and facing[0] != facing[1]:
            shutil.rmtree(pair_dir)


In [None]:
# Source tree with the KTH videos (walked recursively by process_all_videos).
input_video_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset"
# Destination; frames/, person_frames/ and paired_dataset/ are created below it.
# NOTE(review): same folder as the earlier runs in this notebook; existing files there are not cleared.
output_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2"
# Call the function with your input folder structure
process_all_videos(input_video_folder, output_folder)


# Make two folders named First and Second

In [1]:
import os
import shutil

def separate_images(root_folder, output_base):
    """
    Splits pair directories into two flat folders, ``first`` and ``second``.

    ``root_folder`` is expected to contain category sub-folders, each holding pair
    directories with ``image_1.jpg`` and ``image_2.jpg``. For every complete pair,
    ``image_1.jpg`` is copied to ``<output_base>/first/<category>_<pair>_1.jpg`` and
    ``image_2.jpg`` to ``<output_base>/second/<category>_<pair>_2.jpg``. Pairs missing
    either image and entries that are not directories are ignored.

    Args:
        root_folder (str): Path containing the category sub-folders with pair directories.
        output_base (str): Path under which ``first`` and ``second`` are created.
    """
    first_output = os.path.join(output_base, "first")
    second_output = os.path.join(output_base, "second")
    for destination in (first_output, second_output):
        os.makedirs(destination, exist_ok=True)

    for subdir in os.listdir(root_folder):
        category_dir = os.path.join(root_folder, subdir)
        if os.path.isdir(category_dir):
            for pair_folder in os.listdir(category_dir):
                pair_dir = os.path.join(category_dir, pair_folder)
                if os.path.isdir(pair_dir):
                    source_1 = os.path.join(pair_dir, "image_1.jpg")
                    source_2 = os.path.join(pair_dir, "image_2.jpg")
                    if not (os.path.exists(source_1) and os.path.exists(source_2)):
                        continue  # only complete pairs are exported

                    # Prefix with category and pair name so file names stay unique.
                    flat_name = f"{subdir}_{pair_folder}"
                    shutil.copy(source_1, os.path.join(first_output, f"{flat_name}_1.jpg"))
                    shutil.copy(source_2, os.path.join(second_output, f"{flat_name}_2.jpg"))

    print(f"Images sorted into '{first_output}' and '{second_output}'.")


In [None]:
# Run the function: read the pair directories and write first/ and second/ below the output folder.
path_to_root_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2/paired_dataset"
path_to_output_folder = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2/paired_training_images"
# Pass the variable itself; the quoted name "path_to_output_folder" would create a
# relative folder literally called that in the current working directory.
separate_images(path_to_root_folder, path_to_output_folder)


In [2]:
# List the file names of the two separated image folders.
# NOTE(review): separate_images() above writes to <output_base>/first and <output_base>/second,
# and the run cell names the output folder "paired_training_images"; the
# "paired_images/first_images" and ".../second_images" paths read here are not produced by
# any code in this notebook — presumably the folders were renamed by hand; confirm.
first_images = os.listdir("/home/jovyan/video-storage/amit_files/KTH_Dataset_2/paired_images/first_images")
second_images = os.listdir("/home/jovyan/video-storage/amit_files/KTH_Dataset_2/paired_images/second_images")

In [4]:
# Show one file name as a spot check of the "<category>_<pair folder>_1.jpg" naming.
print(first_images[0])

Boxing_pair_0000_1.jpg


In [13]:
import matplotlib.pyplot as plt
import numpy as np

from PIL import Image
# Load one sample frame and print the shape of its pixel array.
sample_image_path = "/home/jovyan/video-storage/amit_files/KTH_Dataset_2/paired_images/first_images" + "/" + first_images[1]
# The context manager closes the underlying file as soon as the pixels are copied into
# the array; `img` is bound only once, to the ndarray.
with Image.open(sample_image_path) as pil_image:
    img = np.asarray(pil_image)
print(img.shape)

(120, 160, 3)


# Making the .yml file for KTH Dataset

In [None]:
import yaml

# Training configuration for the KTH frame pairs, written out as YAML below.
# NOTE(review): the meaning of the individual keys is defined by whatever code consumes
# keypoints_KTH.yml, which is not part of this notebook — confirm there before editing values.
data = {
    "batch_size": 16,
    "checkpoint_freq": 1000,
    "data_aug_max_rotate": 0.1,
    "data_aug_tps_cntl_pts": 4,
    "data_aug_tps_variance": 0.05,
    "data_aug_type": None,
    "dataset_1": "first_images",  # same name as the folder listed into `first_images` above
    "dataset_2": "second_images",  # same name as the folder listed into `second_images` above
    "dataset_randomize": False,
    # NOTE(review): hard-coded lengths; presumably they must match the number of image
    # pairs actually produced — verify after regenerating the dataset.
    "dataset_test_len": 274,
    "dataset_train_len": 34000,
    "epochs": 24,
    # NOTE(review): the sample frame inspected above has shape (120, 160, 3), i.e. three
    # channels; confirm the consumer converts to a single channel.
    "model_in_channels": 1,
    "model_keypoints": 10,
    "model_type": "F",
    "model_z_channels": 64,
    "opt_level": "O2"
}

# Save to a YAML file (written to the current working directory)
with open("keypoints_KTH.yml", "w") as file:
    yaml.dump(data, file, default_flow_style=False)
