<h1>Content-Aware Video Cropping (Single Video at a time)</h1>

In [1]:
!pip install ultralytics
!pip install moviepy
!pip install -q supervision[assets] jupyter_bbox_widget

Collecting ultralytics
  Downloading ultralytics-8.3.57-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.57-py3-none-any.whl (905 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m905.3/905.3 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.57 ultralytics-thop-2.0.13
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.7/213.7 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m727.4/727.4 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

<h2>For reproducing the results, make sure to mount the correct drive, and then change the part according to the location of dataset</h2>

In [2]:
MODEL_NAME = 'yolo11n-seg.pt'
MAIN_INPUT_PATH = '/content/drive/MyDrive/Genuin Assignment/single_class_ip/2FgBOgck_K0.mp4'
ANNOTATED_OUTPUT_PATH = '/content/annotated_output_video.mp4'
MAIN_OUTPUT_PATH = '/content/output_video.mp4'
DETECTIONS_FILE_PATH = '/content/detections.jsonl'
DESTINATION_FOLDER = '/content/drive/MyDrive/Solutions/eleven.mp4'
OUTPUT_WIDTH = 1080
OUTPUT_HEIGHT = 1920

# Batch processing parameters
BATCH_SIZE = 128  # Reduced batch size for memory efficiency
TARGET_SIZE = (640, 640)

In [3]:
from ultralytics import YOLO
import supervision as sv
import cv2
import os
import numpy as np
import torch
import json
from pathlib import Path
import tempfile
from google.colab import drive
import shutil
from moviepy.editor import VideoFileClip

drive.mount('/content/drive')

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.





Mounted at /content/drive


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [5]:
obj = sv.VideoInfo.from_video_path(MAIN_INPUT_PATH)
width, height, fps, total_frames = obj.width, obj.height, obj.fps, obj.total_frames
print(width, height, fps, total_frames)

1280 720 23 625


In [6]:
model = YOLO(MODEL_NAME)
model.to(device)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-seg.pt to 'yolo11n-seg.pt'...


100%|██████████| 5.90M/5.90M [00:00<00:00, 259MB/s]


YOLO(
  (model): SegmentationModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_runni

In [9]:
def save_detection(detection_data, file_path):
    with open(file_path, 'a') as f:
        json.dump(detection_data, f)
        f.write('\n')

def process_frame_batch(frames, indices, cap_props):
    width, height = int(cap_props['width']), int(cap_props['height'])
    processed_frames = []

    # Process each frame
    for frame in frames:
        # Resize and preprocess
        resized_frame = cv2.resize(frame, TARGET_SIZE)
        frame_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
        frame_normalized = frame_rgb.astype(np.float32) / 255.0
        processed_frames.append(frame_normalized)

    # Convert to tensor
    frames_array = np.stack(processed_frames)
    frames_tensor = torch.from_numpy(frames_array).permute(0, 3, 1, 2)

    # Free up memory
    del processed_frames, frames_array
    torch.cuda.empty_cache()

    # Run detection
    results = model(frames_tensor, stream=True)

    # Process results and save
    for idx, (result, orig_frame) in enumerate(zip(results, frames)):
        frame_num = indices[idx]

        # Scale factors
        scale_x = width / TARGET_SIZE[0]
        scale_y = height / TARGET_SIZE[1]

        # Create detection data
        frame_detections = {
            'frame_number': frame_num,
            'detections': []
        }

        if hasattr(result, 'boxes') and len(result.boxes) > 0:
            for box, cls in zip(result.boxes.xyxy, result.boxes.cls):
                # Scale coordinates
                scaled_box = [
                    float(box[0].item() * scale_x),
                    float(box[1].item() * scale_y),
                    float(box[2].item() * scale_x),
                    float(box[3].item() * scale_y)
                ]

                detection = {
                    'class': result.names[int(cls)],
                    'confidence': float(box.conf) if hasattr(box, 'conf') else None,
                    'bbox': scaled_box
                }

                frame_detections['detections'].append(detection)

        # Save detection data
        save_detection(frame_detections, DETECTIONS_FILE_PATH)

def process_video():
    # Open video capture
    cap = cv2.VideoCapture(MAIN_INPUT_PATH)

    # Get video properties
    cap_props = {
        'width': cap.get(cv2.CAP_PROP_FRAME_WIDTH),
        'height': cap.get(cv2.CAP_PROP_FRAME_HEIGHT),
        'fps': cap.get(cv2.CAP_PROP_FPS)
    }

    frames_batch = []
    frame_indices = []
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frames_batch.append(frame)
        frame_indices.append(frame_count)

        if len(frames_batch) == BATCH_SIZE:
            try:
              process_frame_batch(frames_batch, frame_indices, cap_props)
            except Exception as e:
                print(f"Error processing batch at frame {frame_count}: {str(e)}")

            # Clear batch
            frames_batch = []
            frame_indices = []
            torch.cuda.empty_cache()  # Clear CUDA cache

        frame_count += 1
        if frame_count % 10 == 0:
            print(f"Processed frame {frame_count}")

    # Process remaining frames
    if frames_batch:
        try:
          process_frame_batch(frames_batch, frame_indices, cap_props)
        except Exception as e:
            print(f"Error processing final batch: {str(e)}")

    # Release resources
    cap.release()

    print(f"Detections jsonl data saved")

def load_detections():
    detections = []
    with open(DETECTIONS_FILE_PATH, 'r') as f:
        for line in f:
            detections.append(json.loads(line))
    return detections

In [10]:
process_video()

# Example of loading and using the detection data
print("\nLoading detection data...")
all_detections = load_detections()
print(f"Total frames processed: {len(all_detections)}")

Processed frame 10
Processed frame 20
Processed frame 30
Processed frame 40
Processed frame 50
Processed frame 60
Processed frame 70
Processed frame 80
Processed frame 90
Processed frame 100
Processed frame 110
Processed frame 120

0: 640x640 2 persons, 6.3ms
1: 640x640 2 persons, 6.3ms
2: 640x640 2 persons, 6.3ms
3: 640x640 2 persons, 6.3ms
4: 640x640 2 persons, 6.3ms
5: 640x640 2 persons, 6.3ms
6: 640x640 2 persons, 6.3ms
7: 640x640 2 persons, 1 chair, 6.3ms
8: 640x640 2 persons, 1 chair, 6.3ms
9: 640x640 2 persons, 1 chair, 6.3ms
10: 640x640 2 persons, 1 chair, 6.3ms
11: 640x640 2 persons, 1 chair, 6.3ms
12: 640x640 2 persons, 1 chair, 6.3ms
13: 640x640 2 persons, 1 chair, 6.3ms
14: 640x640 2 persons, 6.3ms
15: 640x640 2 persons, 1 chair, 6.3ms
16: 640x640 2 persons, 1 chair, 6.3ms
17: 640x640 2 persons, 1 chair, 6.3ms
18: 640x640 2 persons, 1 chair, 6.3ms
19: 640x640 2 persons, 1 chair, 6.3ms
20: 640x640 2 persons, 1 chair, 6.3ms
21: 640x640 2 persons, 1 chair, 6.3ms
22: 640x640 2 

In [11]:
def transform_jsonl(input_file):
    transformed_data = []

    # Open and read the JSONL file
    with open(input_file, 'r') as file:
        for line in file:
            # Parse each line as JSON
            frame_data = json.loads(line.strip())
            frame_number = frame_data["frame_number"]
            bounding_boxes = [detection["bbox"] for detection in frame_data["detections"]]
            transformed_data.append({"frame_number": frame_number, "bounding_boxes": bounding_boxes})

    return transformed_data

detections_list = transform_jsonl(DETECTIONS_FILE_PATH)
# print(detections_list)

In [12]:
def calculate_916_crop(frame_data, image_width, image_height):
    # Extract bounding boxes from the frame data
    bounding_boxes = frame_data.get("bounding_boxes", [])

    # Define 9:16 target ratio
    target_ratio = 9 / 16

    # Determine fixed crop dimensions based on video size
    if image_width / image_height > target_ratio:
        # Height is limiting factor
        crop_height = image_height
        crop_width = crop_height * target_ratio
    else:
        # Width is limiting factor
        crop_width = image_width
        crop_height = crop_width / target_ratio

    # Add padding if desired (e.g., 10%)
    padding_factor = 1.1
    crop_width *= padding_factor
    crop_height *= padding_factor

    # Ensure crop doesn't exceed image dimensions
    crop_width = min(crop_width, image_width)
    crop_height = min(crop_height, image_height)

    # Handle bounding_boxes
    if not bounding_boxes:
        center_x, center_y = image_width / 2, image_height / 2
    else:
        # Find the largest object (by area) in the bounding boxes
        largest_box = max(
            bounding_boxes,
            key=lambda box: (box[2] - box[0]) * (box[3] - box[1])
        )
        box_width = largest_box[2] - largest_box[0]
        box_height = largest_box[3] - largest_box[1]

        # Determine if the box is too large for the crop dimensions
        if box_width > crop_width or box_height > crop_height:
            # Focus on the center of the box
            center_x = (largest_box[0] + largest_box[2]) / 2
            center_y = (largest_box[1] + largest_box[3]) / 2
        else:
            # Focus on the top-left corner of the box
            center_x = largest_box[0]
            center_y = largest_box[1]

    # Calculate initial crop coordinates
    x1 = max(0, center_x - crop_width / 2)
    y1 = max(0, center_y - crop_height / 2)
    x2 = x1 + crop_width
    y2 = y1 + crop_height

    # Adjust crop if it exceeds boundaries
    if x2 > image_width:
        x2 = image_width
        x1 = x2 - crop_width
    if y2 > image_height:
        y2 = image_height
        y1 = y2 - crop_height
    if x1 < 0:
        x1 = 0
        x2 = crop_width
    if y1 < 0:
        y1 = 0
        y2 = crop_height

    # Ensure crop coordinates stay within the image boundaries
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(image_width, x2)
    y2 = min(image_height, y2)

    return [int(x1), int(y1), int(x2), int(y2)]


In [13]:
# def process_video_with_boxes(video_path, detections_list, output_path, smoothing_factor=0.95):
#     cap = cv2.VideoCapture(video_path)
#     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
#     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
#     fps = int(cap.get(cv2.CAP_PROP_FPS))

#     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
#     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

#     frame_count = 0
#     color = (0, 255, 0)
#     thickness = 2

#     # Initialize previous crop coordinates
#     previous_crop_coords = None

#     while cap.isOpened():
#         ret, frame = cap.read()
#         if not ret:
#             break

#         if frame_count < len(detections_list):
#             boxes = detections_list[frame_count]
#             # Calculate current crop coordinates
#             current_crop_coords = calculate_916_crop(boxes, width, height)

#             if previous_crop_coords is None:
#                 # First frame, no smoothing applied
#                 smoothed_crop_coords = current_crop_coords
#             else:
#                 # Apply exponential smoothing
#                 smoothed_crop_coords = [
#                     int(previous_crop_coords[i] * smoothing_factor + current_crop_coords[i] * (1 - smoothing_factor))
#                     for i in range(4)
#                 ]

#             # Draw the smoothed crop rectangle on the frame
#             cv2.rectangle(
#                 frame,
#                 (smoothed_crop_coords[0], smoothed_crop_coords[1]),
#                 (smoothed_crop_coords[2], smoothed_crop_coords[3]),
#                 color, thickness
#             )

#             # Update the previous crop coordinates
#             previous_crop_coords = smoothed_crop_coords

#         out.write(frame)
#         frame_count += 1

#     cap.release()
#     out.release()

# # Make sure `detections_list` and `calculate_916_crop` are properly defined
# process_video_with_boxes(MAIN_INPUT_PATH, detections_list, ANNOTATED_OUTPUT_PATH)

In [20]:
def cut_video_with_smoothing(video_path, detections_list, output_path, output_width=1080, output_height=1920, smoothing_factor=0.95):
    cap = cv2.VideoCapture(video_path)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (output_width, output_height))

    frame_count = 0
    color = (0, 255, 0)
    thickness = 2

    # Initialize previous crop coordinates
    previous_crop_coords = None

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count < len(detections_list):
            boxes = detections_list[frame_count]
            # Calculate current crop coordinates
            current_crop_coords = calculate_916_crop(boxes, width, height)

            if previous_crop_coords is None:
                # First frame, no smoothing applied
                smoothed_crop_coords = current_crop_coords
            else:
                # Apply exponential smoothing
                smoothed_crop_coords = [
                    int(previous_crop_coords[i] * smoothing_factor + current_crop_coords[i] * (1 - smoothing_factor))
                    for i in range(4)
                ]

            # Crop the frame using the smoothed coordinates (adjusted to 9:16 aspect ratio)
            cropped_frame = frame[smoothed_crop_coords[1]:smoothed_crop_coords[3], smoothed_crop_coords[0]:smoothed_crop_coords[2]]

            # Resize the cropped frame to the desired 9:16 aspect ratio (output_width x output_height)
            cropped_frame_resized = cv2.resize(cropped_frame, (output_width, output_height))

            # Write the resized cropped frame to the output video
            out.write(cropped_frame_resized)

            # Update the previous crop coordinates
            previous_crop_coords = smoothed_crop_coords

        frame_count += 1

    cap.release()
    out.release()

cut_video_with_smoothing(MAIN_INPUT_PATH, detections_list, MAIN_OUTPUT_PATH)

In [18]:
def add_sound_to_cuted_video(original_video_path, new_video_path, output_path):
    try:
        # Load the original video to extract its audio
        original_video = VideoFileClip(original_video_path)
        if original_video.audio is None:
            raise ValueError("Original video has no audio track")
        original_audio = original_video.audio

        # Load the edited video
        edited_video = VideoFileClip(new_video_path)

        # Get durations
        original_audio_duration = original_audio.duration
        edited_video_duration = edited_video.duration

        # Handle cases based on video lengths
        if original_audio_duration >= edited_video_duration:
            # Trim the original audio to match the edited video
            trimmed_audio = original_audio.subclip(0, edited_video_duration)
            final_video = edited_video.set_audio(trimmed_audio)
        else:
            # Cut the edited video to match the original audio duration
            trimmed_video = edited_video.subclip(0, original_audio_duration)
            final_video = trimmed_video.set_audio(original_audio)

        # Write to a temporary file first
        temp_output_path = output_path + "_temp.mp4"
        final_video.write_videofile(
            temp_output_path,
            codec="libx264",
            audio_codec="aac",
            temp_audiofile="temp-audio.m4a",
            remove_temp=True,
            audio=True,  # Explicitly enable audio
            ffmpeg_params=["-strict", "-2"]  # Add FFmpeg parameters for better compatibility
        )

        # Replace the original file after successful export
        os.replace(temp_output_path, output_path)

        # Clean up by closing the video files
        original_video.close()
        edited_video.close()
        final_video.close()

    except Exception as e:
        print(f"Error processing video: {str(e)}")
        # Clean up if there's an error
        if 'original_video' in locals(): original_video.close()
        if 'edited_video' in locals(): edited_video.close()
        if 'final_video' in locals(): final_video.close()
        if os.path.exists(temp_output_path): os.remove(temp_output_path)
        raise

add_sound_to_cuted_video(MAIN_INPUT_PATH, MAIN_OUTPUT_PATH, MAIN_OUTPUT_PATH)

Moviepy - Building video /content/output_video.mp4_temp.mp4.
MoviePy - Writing audio in temp-audio.m4a




MoviePy - Done.
Moviepy - Writing video /content/output_video.mp4_temp.mp4





Moviepy - Done !
Moviepy - video ready /content/output_video.mp4_temp.mp4


In [19]:
shutil.move(MAIN_OUTPUT_PATH, DESTINATION_FOLDER)

'/content/drive/MyDrive/Solutions/eleven.mp4'