In [5]:
from openai import OpenAI
from IPython.display import display, Image, Audio
import cv2
import base64
import time

# API client, constructed with no explicit arguments.
client = OpenAI()

# Tunable settings for this cell.
VIDEO_PATH = "./videos/output.mp4"
PREVIEW_WIDTH = 600   # width passed to the inline preview image
PREVIEW_FPS = 48      # the preview pauses 1/PREVIEW_FPS seconds between frames
FRAME_STRIDE = 4      # keep every FRAME_STRIDE-th frame for the model

# Read the video frame by frame; each frame is JPEG-encoded and kept as base64 text.
base64Frames = []
video = cv2.VideoCapture(VIDEO_PATH)
try:
    # Fail loudly instead of silently continuing with zero frames.
    if not video.isOpened():
        raise OSError(f"Could not open video file: {VIDEO_PATH}")
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break  # no more frames could be read
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
finally:
    # Always give the capture handle back, even if decoding raised.
    video.release()

# check loading of video
# Replay every decoded frame in a single updatable display slot as a visual sanity check.
display_handle = display(None, display_id=True)
print(f"Length of image array: {len(base64Frames)}")
for img in base64Frames:
    # b64decode accepts the base64 text directly; no need to re-encode it to bytes first.
    display_handle.update(Image(data=base64.b64decode(img), width=PREVIEW_WIDTH))
    time.sleep(1 / PREVIEW_FPS)

# Subsample the frames that will be sent with the prompt.
reducedFrames = base64Frames[0::FRAME_STRIDE]
print(f"Processing {len(reducedFrames)} frames...")

Processing 22 frames...


In [6]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Instruction text sent to the model ahead of the frames. It is runtime input,
# so the wording is kept exactly as authored.
prompt = """
These are frames from a video that I want to upload. 
Can you make a recap of what happens in the video, taking note of major changes of events in the video. 
Be detailed as to create an audio descriptive aid. Speak naturally and in a way that would be helpful to someone who is visually impaired.
Minimize the number of seperate Time events you use, which means grouping together events as much as possible.
Attempt to interpret the events in the video where possible, but do not provide or use any information that is not explicitly present in the video.
The current sound effect detected is provided at the top left of each frame, use this to help describe events but do not mention the provision of this context.
The current timestamp is also provided in the top left of each frame.
Do not provide an overall summary.
Your output should be only the content of a .srt file.
For each caption in the srt, it is very important that it is able to be spoken within the timestamps you provide such that it can be read out loud within the duration of the timestamp.
It is of upmost importance that the captions are accurate, the timings do not overlap, and that the captions are within the duration of the video.
"""

# A single user message: the prompt text followed by one entry per sampled frame.
# NOTE(review): the {"image": ..., "resize": ...} entries are not the `image_url`
# content-part shape shown in the current Chat Completions reference -- confirm the
# target model still accepts them.
PROMPT_MESSAGES = [
    {
        "role": "user",
        "content": [prompt]
        + [{"image": frame, "resize": 300} for frame in reducedFrames],
    },
]

# Keyword arguments for the chat completion request.
params = dict(model="gpt-4o-mini", messages=PROMPT_MESSAGES)

@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=60))
def generate_with_backoff(**kwargs):
    """Send a chat completion request, report token usage, and return the response.

    All keyword arguments are forwarded unchanged to
    ``client.chat.completions.create``. The ``retry`` decorator is configured
    with a randomized exponential wait (``min=1``, ``max=60``) and stops after
    six attempts.

    Returns:
        The response object produced by ``client.chat.completions.create``.
    """
    completion = client.chat.completions.create(**kwargs)

    # Print the three token counters reported on the response.
    usage = completion.usage
    print(f"Completion tokens used: {usage.completion_tokens}")
    print(f"Prompt tokens used: {usage.prompt_tokens}")
    print(f"Total tokens used: {usage.total_tokens}")

    return completion

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Models frequently wrap file-like answers in a fence such as ```` ```srt ````
    followed by a closing ```` ``` ````. Those marker lines are not valid
    subtitle content, so they are removed. Text that does not start with a
    fence is returned unchanged.
    """
    lines = text.strip().splitlines()
    if not lines or not lines[0].startswith("```"):
        return text
    lines = lines[1:]  # drop the opening fence line (with its optional language tag)
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]  # drop the closing fence line
    return "\n".join(lines).strip() + "\n"


result = generate_with_backoff(**params)

content = result.choices[0].message.content
if content is None:
    # Raise before opening the file so an existing .srt is not truncated to nothing.
    raise ValueError("The model returned no text content; nothing to write.")

subtitles = _strip_code_fence(content)
print(subtitles)

# Explicit UTF-8 so non-ASCII caption text is written the same way on every platform.
with open('output_subtitles.srt', 'w', encoding='utf-8') as f:
    f.write(subtitles)

Completion tokens used: 586
Prompt tokens used: 81106
Total tokens used: 81692
```srt
1
00:00:00,000 --> 00:00:01,333
A girl sits on a bed, focused on her laptop. 

2
00:00:01,333 --> 00:00:02,666
Raindrops begin to pour against the window. 

3
00:00:02,666 --> 00:00:04,000
Suddenly, a loud explosion is heard. 

4
00:00:04,000 --> 00:00:05,333
She turns around, startled by the noise. 

5
00:00:05,333 --> 00:00:06,666
The rain continues to fall as thunder rumbles. 

6
00:00:06,666 --> 00:00:09,000
The girl stands up, walking towards the window. 

7
00:00:09,000 --> 00:00:10,866
Outside, the rain pours heavily onto the plants. 

8
00:00:10,866 --> 00:00:12,000
She seems captivated by the view. 

9
00:00:12,000 --> 00:00:13,333
A close-up shows her smiling gently. 

10
00:00:13,333 --> 00:00:14,666
She appears to enjoy being in the rain. 

11
00:00:14,666 --> 00:00:16,000
Next, she's shown putting on a raincoat. 

12
00:00:16,000 --> 00:00:17,333
The sounds of the rain persist. 

13
00:00