<a href="https://colab.research.google.com/github/NUMAIRn/AI-Video-Caption-Generation/blob/main/AI_Generated_Captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import numpy as np
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Load the captioning model together with its image processor and tokenizer.
# All three are fetched from the same pre-trained checkpoint.
checkpoint = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Keyword arguments forwarded to model.generate() for every caption request
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

# Function to generate captions for frames
def predict_step(images, batch_size=8):
    """Generate one caption per input image.

    Args:
        images: A list or tuple of images accepted by the image processor
            (for example PIL images). Any other value (such as a single
            image) is forwarded to the image processor unchanged in one call.
        batch_size: Maximum number of images sent through the model per
            ``generate`` call when ``images`` is a list or tuple. Chunking
            keeps memory use bounded no matter how many images are passed.

    Returns:
        A list of caption strings with surrounding whitespace stripped, in
        the same order as the inputs. An empty list/tuple yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if isinstance(images, (list, tuple)):
        # An empty batch would make the image processor fail; there is
        # simply nothing to caption.
        if not images:
            return []
        batches = [
            list(images[start:start + batch_size])
            for start in range(0, len(images), batch_size)
        ]
    else:
        # Single image / array input: keep the original one-shot behaviour.
        batches = [images]

    preds = []
    for batch in batches:
        pixel_values = feature_extractor(images=batch, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)
        output_ids = model.generate(pixel_values, **gen_kwargs)
        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds.extend(text.strip() for text in decoded)
    return preds

def _text_width(text, font, font_scale, font_thickness):
    """Return the rendered pixel width of *text* for the given font settings."""
    return cv2.getTextSize(text, font, font_scale, font_thickness)[0][0]


def _wrap_caption(words, font, font_scale, font_thickness, max_width):
    """Greedily pack *words* into lines whose rendered width fits *max_width*."""
    lines = []
    current = ""
    for word in words:
        candidate = f"{current} {word}" if current else word
        # A word that is too wide on its own still gets its own line instead
        # of producing an empty line in front of it.
        if not current or _text_width(candidate, font, font_scale, font_thickness) <= max_width:
            current = candidate
        else:
            lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


# Function to overlay caption on a frame with text wrapping
def overlay_caption_on_frame(frame, caption):
    """Draw *caption* near the bottom of *frame*, modifying the frame in place.

    The caption is word-wrapped to the frame width (minus padding), each line
    is centred horizontally, and a filled black rectangle is drawn behind
    every line so the white text stays readable.

    Args:
        frame: Image array to draw on; ``frame.shape[:2]`` must give
            ``(height, width)``. It is passed straight to ``cv2.rectangle``
            and ``cv2.putText``.
        caption: Text to render. ``None``, empty and whitespace-only captions
            leave the frame untouched.

    Returns:
        None. The drawing happens directly on ``frame``.
    """
    words = caption.split() if caption else []
    if not words:
        # Nothing to render; avoid drawing an empty background box.
        return

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5  # Start with a small font scale
    min_font_scale = 0.2  # Never shrink to zero/negative scale
    font_color = (255, 255, 255)
    font_thickness = 2
    background_color = (0, 0, 0)  # Black background for the text

    frame_height, frame_width = frame.shape[:2]

    # Allow some padding, but keep the budget positive for very narrow frames
    max_width = max(frame_width - 40, 1)

    # Split text into multiple lines if it is too long
    wrapped_caption = _wrap_caption(words, font, font_scale, font_thickness, max_width)

    # Shrink the font while any line (e.g. one very long word) still
    # overflows, stopping at the minimum scale so the loop always terminates.
    while font_scale > min_font_scale and any(
        _text_width(line, font, font_scale, font_thickness) > max_width
        for line in wrapped_caption
    ):
        font_scale = max(min_font_scale, font_scale - 0.1)

    # Positioning text: start from the bottom of the frame, but never above
    # the top edge when the caption wraps onto many lines
    y_offset = max(0, frame_height - 30 * len(wrapped_caption))

    for line in wrapped_caption:
        text_size = cv2.getTextSize(line, font, font_scale, font_thickness)[0]
        text_x = (frame_width - text_size[0]) // 2
        text_y = y_offset + text_size[1]

        # Create a background rectangle for text visibility
        cv2.rectangle(frame, (text_x - 10, text_y - text_size[1] - 10),
                      (text_x + text_size[0] + 10, text_y + 10), background_color, cv2.FILLED)
        cv2.putText(frame, line, (text_x, text_y), font, font_scale, font_color, font_thickness)

        y_offset += text_size[1] + 15  # Move to the next line

# Main function to process the video
def process_video(video_path, output_path, *, caption_batch_size=16):
    """Caption a video and write a copy with the captions burned in.

    The video is read twice. The first pass samples the first frame of every
    second and captions the samples with ``predict_step``. The second pass
    writes every frame to ``output_path`` (``mp4v`` codec) with the caption of
    its second overlaid by ``overlay_caption_on_frame``.

    Args:
        video_path: Path of the input video.
        output_path: Path of the captioned video to create.
        caption_batch_size: Number of sampled frames collected before they are
            handed to ``predict_step``. Captioning in chunks avoids keeping
            every sampled frame of a long video in memory at once.

    Raises:
        ValueError: If ``caption_batch_size`` is smaller than 1.
        OSError: If the input cannot be opened, contains no readable frames,
            or the output writer cannot be opened.
    """
    if caption_batch_size < 1:
        raise ValueError("caption_batch_size must be at least 1")

    cap = cv2.VideoCapture(video_path)
    out = None
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open input video: {video_path}")

        # Keep the exact (possibly fractional) frame rate for the writer so
        # the output keeps the input's duration. Fall back to a fixed rate
        # when the reported value is zero, negative or NaN.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0
        # Sampling stride in whole frames; at least 1 to avoid modulo by zero
        frames_per_caption = max(1, int(round(fps)))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Pass 1: caption the first frame of each second, chunk by chunk
        captions = []
        pending_images = []
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frames_per_caption == 0:
                # Convert frame (BGR numpy array) to an RGB PIL image
                pending_images.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
                if len(pending_images) >= caption_batch_size:
                    captions.extend(predict_step(pending_images))
                    pending_images = []

            frame_count += 1

        if pending_images:
            captions.extend(predict_step(pending_images))

        if not captions:
            raise OSError(f"No frames could be read from: {video_path}")

        # Pass 2: reopen the input rather than seeking back to frame 0
        cap.release()
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise OSError(f"Could not reopen input video: {video_path}")

        # Define the codec and create VideoWriter object for the output video.
        # Created only after captioning succeeded, so a failure above does not
        # leave an empty output file behind.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
        if not out.isOpened():
            raise OSError(f"Could not open output video for writing: {output_path}")

        current_caption = captions[0]
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Switch to the next caption at the start of every second; keep
            # the last one if the second pass yields more frames than the first
            if frame_count % frames_per_caption == 0:
                caption_index = frame_count // frames_per_caption
                if caption_index < len(captions):
                    current_caption = captions[caption_index]

            # Apply the current caption to every frame in the second
            overlay_caption_on_frame(frame, current_caption)

            # Write the frame (with the current caption) to the output video
            out.write(frame)
            frame_count += 1
    finally:
        # Always release the capture and writer, even when captioning fails.
        # No GUI windows are created here, so there is nothing else to close.
        cap.release()
        if out is not None:
            out.release()

# Run the captioning pipeline: read the input clip, write the captioned copy
input_video_path = 'input_video_filepath.mp4'
captioned_video_path = 'output_video_with_captions.mp4'
process_video(input_video_path, captioned_video_path)
