# Create PDF of Video File (Images and Transcript)

## Overview

Users often need to import video files into a RAG (Retrieval-Augmented Generation) system, but they must first convert each video file into a format that the RAG system can ingest.

## Problem Statement

RAG solutions such as AWS Bedrock only support certain file formats when importing documents. Furthermore, embedding models do not understand video files natively. Users need to convert video files into formats compatible with the embedding models.

> Note: It's important to use a multimodal embedding model, i.e. one that understands both images and text.

## Solution

The (current) best approach is to create a PDF that has images and text extracted from the video file. The PDF is the recommended format since it can have both images and text. Also, a wide variety of RAG solutions support ingesting PDF files.

This notebook's code helps users:

1. Load video files
2. Extract audio from the video files
3. Transcribe the audio as text
4. Create images from the video file
5. Create a PDF that combines the images and audio transcription

### Extract Audio from Video

In [None]:
import os

from moviepy import VideoFileClip


# Define file paths
audio_path = "output/audio/output_audio.wav"
frame_path = "output/frame"
pdf_path = "output/pdf/output_pdf.pdf"
transcript_path = "output/transcript/output_audio_transcript.txt"
video_path = "input/video/activity_editor_overview.mp4"


# Create the audio output directory up front so the write below cannot fail
# because of a missing directory
audio_dir = os.path.dirname(audio_path)
if audio_dir:
    os.makedirs(audio_dir, exist_ok=True)

# Load the video file; the context manager closes the clip even if the
# extraction below raises
with VideoFileClip(video_path) as video:
    audio = video.audio
    if audio is None:
        # A clip without an audio track exposes `audio` as None; fail with a
        # clear message instead of an AttributeError on the next line
        raise ValueError(f"No audio track found in {video_path}")

    # Extract and save the audio
    audio.write_audiofile(audio_path)

print(f"Audio saved to {audio_path}")

### Transcribe Audio to Text

In [None]:
import whisper


# Make sure the transcript directory exists before transcribing, so the
# result is not lost to a FileNotFoundError when it is saved
transcript_dir = os.path.dirname(transcript_path)
if transcript_dir:
    os.makedirs(transcript_dir, exist_ok=True)

# Load the Whisper model
model = whisper.load_model("base")

# Transcribe the audio file
result = model.transcribe(audio_path)

# Access the transcribed text
transcript = result['text']

# Save the transcription to a file (UTF-8 so non-ASCII characters survive on
# every platform)
with open(transcript_path, "w", encoding="utf-8") as transcript_file:
    transcript_file.write(transcript)

print(f"Transcription saved to {transcript_path}")

### Create Images from the Video

In [None]:
import cv2

# cv2.imwrite reports failure through its return value instead of raising, so
# make sure the target directory exists before writing any frame
os.makedirs(frame_path, exist_ok=True)

# Open the video file
cap = cv2.VideoCapture(video_path)

try:
    if not cap.isOpened():
        # Without this check a bad path silently produces zero frames
        raise IOError(f"Could not open video file: {video_path}")

    frame_rate = cap.get(cv2.CAP_PROP_FPS)  # Get the frame rate
    if not frame_rate > 0:  # also rejects NaN
        raise ValueError(f"Could not determine the frame rate of {video_path}")

    # Capture one frame per second. Round instead of truncating (29.97 fps ->
    # 30, not 29) and never go below 1 so the modulo below cannot divide by zero
    interval = max(1, round(frame_rate))

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop extracting
            break
        if frame_count % interval == 0:
            frame_name = f"{frame_path}/frame_{frame_count // interval:04d}.jpg"
            if not cv2.imwrite(frame_name, frame):
                raise IOError(f"Could not write frame {frame_name}")
            print(f"Creating frame {frame_name}")
        frame_count += 1
finally:
    # Release the capture even when extraction fails part-way
    cap.release()

### Create PDF with Images and Transcript

In [47]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Image, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import os


import re
from xml.sax.saxutils import escape


def _natural_sort_key(name):
    """Return a sort key that orders embedded numbers numerically."""
    # re.split with a capturing group alternates text (even index) and digit
    # runs (odd index), so "frame_10000.jpg" sorts after "frame_9999.jpg"
    parts = re.split(r'(\d+)', name)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


def _split_transcript(transcript, chunk_count):
    """Split the transcript into ``chunk_count`` consecutive text chunks."""
    units = transcript.split('\n')
    separator = '\n'
    if len(units) < chunk_count:
        # Fewer lines than frames (e.g. a transcript saved as one long line):
        # split on words instead so the text is spread over the frames rather
        # than all landing under the first one
        units = transcript.split()
        separator = ' '

    # Ceiling division spreads the units evenly without dropping any
    per_chunk = max(1, (len(units) + chunk_count - 1) // chunk_count)
    return [
        separator.join(units[index * per_chunk:(index + 1) * per_chunk])
        for index in range(chunk_count)
    ]


def create_pdf_with_transcript_and_images(transcript_path, frames_path, pdf_path):
    """Create a PDF that interleaves video frames with transcript text.

    Every ``.jpg`` file in ``frames_path`` is placed in the document, in
    natural (numeric-aware) filename order, followed by its share of the
    transcript. The transcript is divided into one consecutive chunk per
    frame.

    Args:
        transcript_path: Path of the UTF-8 text file holding the transcript.
        frames_path: Directory containing the extracted ``.jpg`` frames.
        pdf_path: Path of the PDF file to write; its directory is created
            when missing.

    Raises:
        ValueError: If ``frames_path`` contains no ``.jpg`` files.
    """
    # Read the transcript; it is saved as UTF-8, so do not rely on the
    # platform default encoding
    with open(transcript_path, 'r', encoding='utf-8') as file:
        transcript = file.read()

    # Load frames
    frame_files = sorted(
        (f for f in os.listdir(frames_path) if f.endswith('.jpg')),
        key=_natural_sort_key,
    )
    if not frame_files:
        # Previously this surfaced as a ZeroDivisionError further down
        raise ValueError(f"No .jpg frames found in {frames_path}")

    # Create a PDF document
    pdf_dir = os.path.dirname(pdf_path)
    if pdf_dir:
        os.makedirs(pdf_dir, exist_ok=True)
    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    elements = []

    # Get a sample style sheet
    styles = getSampleStyleSheet()
    normal_style = styles['Normal']

    # Split transcript into one segment per frame
    segments = _split_transcript(transcript, len(frame_files))

    for frame_file, segment in zip(frame_files, segments):
        image_path = os.path.join(frames_path, frame_file)
        # 'proportional' fits the frame inside a 4x3 inch box while keeping
        # its aspect ratio, so widescreen frames are not distorted
        img = Image(image_path, width=4 * inch, height=3 * inch, kind='proportional')
        elements.append(img)
        elements.append(Spacer(1, 12))

        # Add corresponding transcript segment. Paragraph parses XML-like
        # markup, so characters such as '&' and '<' must be escaped
        para = Paragraph(escape(segment), normal_style)
        elements.append(para)
        elements.append(Spacer(1, 12))

    # Build the PDF document
    doc.build(elements)

# Assemble the PDF from the saved transcript and the extracted frames
create_pdf_with_transcript_and_images(
    transcript_path=transcript_path,
    frames_path=frame_path,
    pdf_path=pdf_path,
)

print(f"PDF saved to {pdf_path}")

PDF saved to output/pdf/output_pdf.pdf
