In [1]:
import sys
# Directory (on the mounted Drive) that holds the helper modules imported by
# the following cells, e.g. `utils`.
module_path = '/content/drive/MyDrive/Colab Notebooks'

# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Import necessary libraries
import json
import os
from os import cpu_count
from pathlib import Path
from functools import partial
from glob import glob
from multiprocessing.pool import Pool

import cv2
from tqdm import tqdm

from utils import get_video_paths, get_method_from_name

# Keep each worker process single-threaded so the process pool used later is
# not oversubscribed by library-level threading.
# NOTE(review): these variables are assigned after cv2 has already been
# imported above; confirm the numeric backends still pick them up at this point.
os.environ.update(
    {
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        "OMP_NUM_THREADS": "1",
    }
)

# Turn off OpenCL acceleration and OpenCV's own thread pool
# (per the OpenCV docs, setNumThreads(0) makes OpenCV run sequentially).
cv2.ocl.setUseOpenCL(False)
cv2.setNumThreads(0)

In [3]:
def extract_video(video, root_dir, dataset, output_path=None):
    """Crop the annotated boxes out of one video and save them as PNG files.

    The boxes are read from a JSON file that maps a frame index (as a string)
    to a list of ``[xmin, ymin, xmax, ymax]`` boxes, or to ``None`` when the
    frame has no boxes. The JSON file is looked up at

    * ``<root_dir>/boxes/<video name>.json`` when ``dataset == 0``
    * ``<root_dir>/boxes/<method>/<video name>.json`` otherwise, where
      ``<method>`` is ``get_method_from_name(video)``.

    Each crop is written to ``<output_path>/<video name>/<frame>_<box>.png``.

    Args:
        video: Path to the video file.
        root_dir: Dataset root that contains the ``boxes`` directory.
        dataset: ``0`` selects the flat boxes layout, any other value the
            per-method layout.
        output_path: Directory that receives one sub-directory per video.
            Defaults to ``opt["output_path"]`` from the notebook namespace,
            which is what this function always used before the parameter
            existed.

    Returns:
        None. Nothing is done when the boxes file or the video is missing.
        Any exception is caught and reported with ``print`` so a single bad
        video does not stop a pool of workers.
    """
    capture = None
    try:
        video_id = os.path.splitext(os.path.basename(video))[0]

        # Locate the bounding-box JSON for this video.
        box_dir = os.path.join(root_dir, "boxes")
        if dataset != 0:
            box_dir = os.path.join(box_dir, get_method_from_name(video))
        bboxes_path = os.path.join(box_dir, video_id + ".json")

        # Nothing to do without both the annotations and the video itself.
        if not os.path.exists(bboxes_path) or not os.path.exists(video):
            return

        with open(bboxes_path, "r") as bbox_f:
            bboxes_dict = json.load(bbox_f)

        capture = cv2.VideoCapture(video)
        frames_num = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        counter = 0
        video_dir = None  # created lazily, on the first frame that has boxes

        for i in range(frames_num):
            # grab() only advances the stream; the expensive decode happens in
            # retrieve(), so frames without annotations are never decoded.
            capture.grab()
            bboxes = bboxes_dict.get(str(i))
            if bboxes is None:
                continue

            success, frame = capture.retrieve()
            if not success:
                continue
            counter += 1

            if video_dir is None:
                base_dir = opt["output_path"] if output_path is None else output_path
                video_dir = os.path.join(base_dir, video_id)
                os.makedirs(video_dir, exist_ok=True)

            for j, bbox in enumerate(bboxes):
                # Box coordinates are doubled before use.
                # NOTE(review): presumably detection ran on half-resolution
                # frames -- confirm against the script that produced the JSON.
                xmin, ymin, xmax, ymax = [int(b * 2) for b in bbox]
                w = xmax - xmin
                h = ymax - ymin

                # Pad the shorter side so the crop is (roughly) square.
                p_h, p_w = 0, 0
                if h > w:
                    p_w = int((h - w) / 2)
                elif h < w:
                    p_h = int((w - h) / 2)

                crop = frame[max(ymin - p_h, 0):ymax + p_h, max(xmin - p_w, 0):xmax + p_w]

                # A box that lies outside the frame yields an empty array;
                # cv2.imwrite raises on empty images, which previously aborted
                # the rest of the video via the outer except.
                if crop.size == 0:
                    continue

                cv2.imwrite(os.path.join(video_dir, f"{i}_{j}.png"), crop)

        # Log videos with no processed frames.
        if counter == 0:
            print(video, counter)
    except Exception as e:
        # Include the video path so the failing file can be identified.
        print("Error:", video, e)
    finally:
        # Always free the decoder handle, even when processing failed.
        if capture is not None:
            capture.release()


In [4]:
# Notebook stand-in for command-line arguments: edit the values here.
opt = {
    # Dataset name: "DFDC" or "FACEFORENSICS"
    "dataset": "DFDC",
    # Directory containing the input videos
    "data_path": "/content/drive/MyDrive/Colab Notebooks/archive/YouTube-real",
    # Directory that receives the cropped images
    "output_path": "/content/drive/MyDrive/Colab Notebooks/archive/YouTube-real crop",
}

# Numeric dataset code used by the rest of the notebook:
# 0 for DFDC (case-insensitive match), 1 for any other name.
dataset = int(opt["dataset"].upper() != "DFDC")

# Ensure the output directory is present before anything lists or writes to it.
os.makedirs(opt["output_path"], exist_ok=True)


In [5]:
# Entries already present in the output directory; each processed video gets
# its own sub-directory there, so these names mark finished work.
excluded_videos = os.listdir(opt["output_path"])

# Collect the videos to process for the selected dataset layout.
if dataset != 0:
    # Non-DFDC layout: videos live under two sibling directories.
    # NOTE(review): excluded_videos is not passed on this branch, so already
    # processed videos are not filtered out here -- confirm this is intended.
    manipulated_dir = os.path.join(opt["data_path"], "manipulated_sequences")
    original_dir = os.path.join(opt["data_path"], "original_sequences")
    paths = get_video_paths(manipulated_dir, dataset)
    paths.extend(get_video_paths(original_dir, dataset))
else:
    paths = get_video_paths(opt["data_path"], dataset, excluded_videos)


In [None]:
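# # Process videos using multiprocessing (disabled: cpu_count() - 2 can be < 1, which raises the ValueError shown below; see the guarded version in the next cell)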
# with Pool(processes=cpu_count() - 2) as p:
#     with tqdm(total=len(paths)) as pbar:
#         for _ in p.imap_unordered(partial(extract_video, root_dir=opt["data_path"], dataset=dataset), paths):
#             pbar.update()


ValueError: Number of processes must be at least 1

In [6]:
# Run extract_video over every collected path with a pool of worker processes.
import multiprocessing as mp

# Leave two cores free when possible, but never ask for fewer than one worker
# (Pool rejects a process count below 1).
num_processes = max(1, mp.cpu_count() - 2)

# Bind the arguments shared by all videos; the pool supplies the video path.
worker = partial(extract_video, root_dir=opt["data_path"], dataset=dataset)

with mp.Pool(processes=num_processes) as p, tqdm(total=len(paths)) as pbar:
    # Results arrive in completion order; only the progress count matters here.
    for _ in p.imap_unordered(worker, paths):
        pbar.update()

100%|██████████| 301/301 [39:09<00:00,  7.81s/it]
