In [83]:
import base64
from openai import OpenAI
from crewai_tools import BaseTool
import cv2
from tqdm import tqdm

In [None]:
def solve(question, vid_path, frame_ids=range(0, 180, 18)):
    """Answer a multiple-choice question about a video with gpt-4o-mini.

    The video is sampled at ``frame_ids``. Every sampled frame is sent to the
    vision model together with the question to obtain a short description;
    the descriptions are then concatenated and passed to a second, text-only
    request that picks the answer option.

    Args:
        question (str): The multiple-choice question, including its options.
        vid_path (str): Path to the video file, opened with OpenCV.
        frame_ids (Iterable[int]): Indices of the frames to sample. Defaults
            to every 18th frame of the first 180 (ten frames).

    Returns:
        tuple: ``(answer, responses)`` where ``answer`` is the content of the
        final completion and ``responses`` is the list of per-frame
        descriptions, in sampling order.

    Raises:
        ValueError: If the video cannot be opened, or a requested frame
            cannot be read or JPEG-encoded.
    """
    # With no explicit api_key the client reads the OPENAI_API_KEY environment
    # variable, which keeps the credential out of the notebook source.
    client = OpenAI()

    class FrameExtractTool(BaseTool):
        """Tool that returns one video frame as a base64-encoded JPEG."""

        name: str = "Frame Extraction Tool"
        description: str = (
            "This tool extracts a specific frame from a given video file based on the frame ID "
            "and returns the frame as a base64-encoded image. This can be useful for extracting "
            "individual frames from a video for further analysis, display, or processing."
        )

        def _run(self, frame_id: int, video_path: str) -> str:
            """
            Extracts a specific frame from the video and returns it as a base64-encoded image.

            Args:
                frame_id (int): The index of the frame to be extracted.
                video_path (str): The path to the video file.

            Returns:
                str: A base64-encoded string representing the extracted frame (JPEG).

            Raises:
                ValueError: If the video cannot be opened, or the frame cannot
                    be read or encoded.
            """
            cap = cv2.VideoCapture(video_path)
            try:
                if not cap.isOpened():
                    raise ValueError(f"Error opening video file at {video_path}.")

                # Seek to the requested frame, then decode it.
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
                ret, frame = cap.read()
            finally:
                # Release the capture on every path, including the error ones.
                cap.release()

            if not ret:
                raise ValueError(f"Could not read frame {frame_id}.")

            # Convert the frame (BGR format) to JPEG, then to a base64 string.
            encoded, buffer = cv2.imencode('.jpg', frame)
            if not encoded:
                raise ValueError(f"Could not encode frame {frame_id} as JPEG.")
            return base64.b64encode(buffer).decode('utf-8')

    fet = FrameExtractTool()

    # Sample the video: one base64 JPEG per requested frame index.
    frame_array = [fet._run(i, vid_path) for i in frame_ids]

    def getResponse(base64_image):
        """Ask the vision model for details in one frame relevant to the question."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            # Must be an f-string: otherwise the model is sent
                            # the literal text "{question}" instead of the question.
                            "text": f"Describe this image and give details which may help answer this question. If there are no details that may help, return a single period as output. Question - {question}",
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"},
                        },
                    ],
                }
            ],
            max_tokens=200,
        )

        return response.choices[0].message.content

    # One request per sampled frame.
    print('Begin generating responses')
    responses = []
    for el in tqdm(frame_array):
        responses.append(getResponse(el))

    # One description per line; a missing (None) reply contributes an empty line
    # instead of raising TypeError during concatenation.
    finalresponse = ''.join(f"{el or ''}\n" for el in responses)

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "developer", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": f"Answer this multiple choice question (with only the correct option as output ex. A or B or C) - {question} based on the text information - {finalresponse}",
            }
        ]
    )

    return completion.choices[0].message.content, responses

In [91]:
# Inputs for the run below: the clip to analyse and its multiple-choice question.
# The commented-out pair is the alternative clip 00001 with its question.
# vid_path = "D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00001.mp4"
# question = "Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass."
question = (
    "Where can ego legally park on this street? "
    "A. No parking anywhere. "
    "B. next to right curb. "
    "C. anywhere. "
    "D. next to left curb."
)
# NOTE: absolute, machine-specific path; adjust to the local dataset location.
vid_path = "D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00002.mp4"

In [92]:
# Run the pipeline on the selected clip. solve() returns a pair: `ans` is the
# content of the final completion and `vals` is the list of per-frame responses.
# The recorded run below took about 2m40s for 10 frames.
ans, vals = solve(question, vid_path)

Begin generating responses


100%|██████████| 10/10 [02:40<00:00, 16.07s/it]


In [1]:
# Display the final answer. `ans` is only bound by the cell that calls solve(),
# so that cell must have been run in the current kernel; otherwise this
# expression raises NameError (as in the recorded output below).
ans

NameError: name 'ans' is not defined