# **Install Required Packages**

In [None]:
# Install the notebook's dependencies into the kernel's environment (quietly).
# NOTE(review): versions are unpinned; the Imports cell pulls `set_api_key`, `generate`
# and `save` from elevenlabs, which presumably requires a pre-1.0 release — confirm and pin.
%pip install -q yt-dlp supervision openai numpy opencv-python elevenlabs ffmpeg-python python-dotenv

# **Settings**

In [None]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Secrets are read from a .env file that sits one directory above the
# current working directory (the directory the notebook is run from).
# Adjust `env_path` if your .env file lives somewhere else.
cwd = Path.cwd()
env_path = cwd.parent / ".env"
load_dotenv(dotenv_path=env_path)

# OpenAI chat-completions endpoint and the key used to authenticate against it.
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Source video, how often (in seconds) a frame is sampled from it,
# and the directory where downloaded/generated files are written.
VIDEO_URL = "https://www.youtube.com/watch?v=GW9YZcn8Tik"
FRAME_EXTRACTION_FREQUENCY_SECONDS = 4
FILES_DIR = "./files/"

# **Imports**

In [None]:
import base64
import math
import re
import shutil
import subprocess
from shlex import quote

import cv2
import numpy as np
import requests
import supervision as sv
from elevenlabs import set_api_key, generate, save
from openai import OpenAI

# **Function Definitions**

In [None]:
def format_video_title(title: str) -> str:
    """Turn a video title into a lowercase, dash-separated slug for use as a file name.

    Everything except word characters (letters, digits, underscore), whitespace
    and dashes is dropped; each run of whitespace and/or dashes becomes a single
    dash; dashes left at either end are trimmed.

    Args:
        title: Raw video title.

    Returns:
        The slug. It is an empty string when the title contains no word characters.
    """
    # Regular expressions as constants for clarity
    REMOVE_NON_ALPHANUMERIC = r'[^\w\s-]'
    REPLACE_SPACES_AND_DASHES = r'[-\s]+'

    # Removing non-alphanumeric characters (except spaces and dashes) and lowercasing
    sanitized_title = re.sub(REMOVE_NON_ALPHANUMERIC, '', title.lower())
    # Replacing spaces and consecutive dashes with a single dash
    formatted_title = re.sub(REPLACE_SPACES_AND_DASHES, '-', sanitized_title)

    # Titles that start or end with punctuation/spaces (e.g. "[HD] Clip!") would
    # otherwise produce names like "-hd-clip-"; trim those stray separators.
    return formatted_title.strip('-')

def download_youtube_video(url: str, output_path: str = '.') -> str:
    """Download a video with the ``yt-dlp`` command-line tool and return the saved MP4 path.

    The title is fetched first and passed through ``format_video_title`` to build
    the file name; the download then requests the ``best[ext=mp4]`` format.

    Args:
        url: Address of the video to download.
        output_path: Directory to save into; created if it does not exist.

    Returns:
        ``<output_path>/<formatted title>.mp4``.

    Raises:
        subprocess.CalledProcessError: If either ``yt-dlp`` invocation exits with a
            non-zero status. The captured output is available on the exception's
            ``stdout``/``stderr`` attributes.
    """
    os.makedirs(output_path, exist_ok=True)

    # Arguments are passed as lists (no shell), so neither the URL nor the output
    # path needs quoting. "--" ends option parsing so a URL that begins with "-"
    # cannot be mistaken for a flag.
    result_title = subprocess.run(
        ['yt-dlp', '--get-title', '--', url],
        check=True, text=True, capture_output=True,
    )
    formatted_title = format_video_title(result_title.stdout.strip())

    # yt-dlp fills in %(ext)s itself; the format filter below restricts it to mp4.
    output_template = f'{os.path.join(output_path, formatted_title)}.%(ext)s'
    # check=True already raises on failure, so no manual return-code test is needed.
    subprocess.run(
        ['yt-dlp', '-o', output_template, '-f', 'best[ext=mp4]', '--', url],
        check=True, text=True, capture_output=True,
    )

    return os.path.join(output_path, f"{formatted_title}.mp4")

def encode_image_to_base64(image: np.ndarray) -> str:
    """JPEG-encode an image array with OpenCV and return it as a base64 text string.

    Raises:
        ValueError: If OpenCV reports that the JPEG encoding failed.
    """
    encoded_ok, jpeg_buffer = cv2.imencode('.jpg', image)
    if encoded_ok:
        base64_bytes = base64.b64encode(jpeg_buffer)
        return base64_bytes.decode('utf-8')
    raise ValueError("Could not encode image to JPEG format.")

def compose_payload(images: list, prompt: str, model: str = "gpt-4-vision-preview", max_tokens: int = 200) -> dict:
    """Build the chat-completions request body: one user message with the prompt and all images.

    Args:
        images: Images to attach; each is passed to ``encode_image_to_base64`` and
            embedded as a ``data:image/jpeg;base64,...`` URL.
        prompt: Text instruction placed first in the message content.
        model: Model name sent in the request. The default keeps the value that
            was previously hard-coded.
            NOTE(review): this preview model may have been retired by OpenAI —
            confirm against the current model list and pass ``model=`` if so.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        A dict with ``model``, ``messages`` and ``max_tokens`` keys.
    """
    image_content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image_to_base64(image=image)}"},
        }
        for image in images
    ]
    # The text part comes first, followed by the images in their given order.
    return {
        "model": model,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt}] + image_content}],
        "max_tokens": max_tokens
    }

def compose_headers(api_key: str) -> dict:
    """Return the headers for a JSON request authorised with ``api_key`` as a bearer token."""
    headers = {"Content-Type": "application/json"}
    headers["Authorization"] = f"Bearer {api_key}"
    return headers

def prompt_image(api_key: str, images: list, prompt: str, timeout: float = 120.0) -> str:
    """Send the prompt and images to ``OPENAI_API_URL`` and return the reply text.

    Args:
        api_key: Key placed in the ``Authorization`` header.
        images: Images forwarded to ``compose_payload``.
        prompt: Text instruction forwarded to ``compose_payload``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        ValueError: If the response body contains an ``error`` entry; the message
            is the API's own error message.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
    """
    headers = compose_headers(api_key=api_key)
    payload = compose_payload(images=images, prompt=prompt)
    # requests has no default timeout; without one a stalled connection would
    # block the notebook cell indefinitely.
    response = requests.post(url=OPENAI_API_URL, headers=headers, json=payload, timeout=timeout).json()

    if 'error' in response:
        raise ValueError(response['error']['message'])

    return response['choices'][0]['message']['content']

def optimal_grid_size(total_images):
    """Return the side length of the smallest square grid that holds ``total_images`` cells.

    Zero is returned for a count of zero or less.
    """
    return 0 if total_images <= 0 else math.ceil(math.sqrt(total_images))

# **Save Video and Extract Frames**

In [None]:
# Download the source video into FILES_DIR and read its metadata.
video_path = download_youtube_video(VIDEO_URL, FILES_DIR)
video_info = sv.VideoInfo.from_video_path(video_path=video_path)
# Convert the sampling interval from seconds into a frame stride using the video's fps.
frame_extraction_frequency = FRAME_EXTRACTION_FREQUENCY_SECONDS * video_info.fps
# Materialise every sampled frame in a list; later cells reuse `frames`.
frames = list(sv.get_video_frames_generator(source_path=video_path, stride=frame_extraction_frequency))

# Preview the sampled frames in a square grid just large enough to hold them all.
grid_size = optimal_grid_size(len(frames))
sv.plot_images_grid(frames, grid_size=(grid_size, grid_size), size=(16, 16))

# **Generate Description from Frames and LLM API**

In [None]:
# Instruction sent along with the frames. Both "{}" placeholders are filled with the
# sampling interval in seconds, so the narration pace matches the frame spacing.
PROMPT = ("The uploaded series of images is from a single video sampled every {} seconds. "
          "Make sure it takes about {} seconds to voice the description of each frame. "
          "Use exclamation points and capital letters to express excitement if necessary. "
          "Your responses are spelled out according to how they sound with the Boston Accent. "
          "You like to use phrases related to Boston. "
          "Briefly Describe the video as a concise and excited play-by-play caller in the style of Gus Johnson. "
          "Do not identify the frame within the description.").format(FRAME_EXTRACTION_FREQUENCY_SECONDS, FRAME_EXTRACTION_FREQUENCY_SECONDS)

# All sampled frames are sent together in a single request; the reply is the narration text.
description = prompt_image(OPENAI_API_KEY, frames, PROMPT)
print(description)

# **Create Audio via Text-to-Speech API**

In [None]:
# The ElevenLabs API key is read from the environment and registered with the client library.
XI_API_KEY = os.environ.get('XI_API_KEY')
set_api_key(XI_API_KEY)
# Synthesize the narration. The voice identifier also comes from the environment.
# NOTE(review): os.environ.get returns None when VOICE_MARKY is unset — confirm generate() accepts that.
audio = generate(text=description, voice=os.environ.get("VOICE_MARKY"))
# Name the MP3 after the downloaded video file (base name without its extension).
file_name = os.path.splitext(os.path.basename(video_path))[0]
save(audio, f"{FILES_DIR}{file_name}.mp3")

# **Combine Audio and Video**

In [None]:
# Mux the generated narration onto the original video: the video stream is copied
# as-is and the MP3 is re-encoded to AAC as the only audio track.
audio_path = f"{FILES_DIR}{file_name}.mp3"
output_path = f"{FILES_DIR}{file_name}_final.mp4"

# Use whichever ffmpeg is on PATH; fall back to the Homebrew location that this
# notebook previously hard-coded so existing macOS setups keep working.
ffmpeg_bin = shutil.which("ffmpeg") or "/opt/homebrew/bin/ffmpeg"

# An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
ffmpeg_command = [
    ffmpeg_bin, "-y",
    "-i", video_path,
    "-i", audio_path,
    "-c:v", "copy",
    "-c:a", "aac",
    "-strict", "experimental",
    "-map", "0:v:0",
    "-map", "1:a:0",
    output_path,
]
ffmpeg_result = subprocess.run(ffmpeg_command, text=True, capture_output=True)

# Fail loudly with ffmpeg's own log instead of silently leaving no output file.
if ffmpeg_result.returncode != 0:
    raise RuntimeError(f"ffmpeg failed with exit code {ffmpeg_result.returncode}:\n{ffmpeg_result.stderr}")

print(f"Wrote {output_path}")