# 2 LLM 'Multiple Drafts Model'


In [60]:
from crewai import Agent, Task, Crew

In [72]:
import os
from getpass import getpass

# Credentials must never be written into the notebook: anything typed here is
# saved with the file and shared with it. Use the key already present in the
# environment, and only prompt (without echoing) when it is missing.
# NOTE(review): the key that was previously committed on this line should be
# treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Model selection for this run, passed through the environment.
os.environ["OPENAI_MODEL_NAME"] = "gpt-4o-mini"

## Creating Agents

In [73]:
# Manager agent: receives the user question and the video path through the
# {Question} and {Video} placeholders, and is allowed to delegate work.
# NOTE(review): the prompt text is kept verbatim (including the "retrive"
# spelling). It mentions a "Memory Retrieval Tool"; no such tool is defined in
# the visible part of this notebook - confirm it exists.
manager = Agent(
    role="Manager",
    goal=(
        "Responsible for processing user queries about a video. "
        "It checks the gathered memory to see if an answer can be formed "
        "directly or decides which frame to process"
    ),
    backstory=(
        "As a Manager Agent, your task is to handle user queries related to a video. "
        "First, check the stored memory to determine if the query can be answered based on previously extracted information. "
        "If additional insights are required, decide which specific frame (between 0 and 179) should be analyzed to obtain relevant details. "
        "You can retrive a maximum of 20 frames. "
        "Request the Image Understanding Agent to analyze the chosen frame. "
        "If necessary, retrieve stored insights from the Memory Retrieval Tool before constructing a final response. "
        "This is the question - {Question}. "
        "This is the video - {Video}."
    ),
    verbose=True,
    allow_delegation=True,
)

In [74]:
# Image-understanding agent: describes a single frame on behalf of the manager.
# It receives the user question through the {Question} placeholder and is not
# allowed to delegate.
imgAgent = Agent(
    role="Image Understanding Agent",
    goal=(
        "Responsible for analyzing a specified frame in the video and "
        "extracting key information that may help answer a query"
    ),
    backstory=(
        "As an Image Understanding Agent, your task is to analyze a specific video frame as requested by the Manager Agent. "
        "Identify and extract relevant details, including objects, actions, traffic signs, pedestrians, and vehicles, based on the given query. "
        "Once the analysis is complete, store the extracted information in memory for future reference and communicate your findings back to the Manager Agent. "
        "This is the question - {Question}"
    ),
    verbose=True,
    allow_delegation=False,
)

In [75]:
from crewai_tools import BaseTool
import cv2
import base64
import matplotlib.pyplot as plt

class FrameExtractTool(BaseTool):
    """Tool that reads one frame from a video file and returns it as base64 JPEG text."""

    name: str = "Frame Extraction Tool"
    description: str = (
        "This tool extracts a specific frame from a given video file based on the frame ID "
        "and returns the frame as a base64-encoded image. This can be useful for extracting "
        "individual frames from a video for further analysis, display, or processing."
    )

    def _run(self, frame_id: int, video_path: str) -> str:
        """
        Extract a single frame from a video and return it as a base64-encoded JPEG.

        Args:
            frame_id (int): Zero-based index of the frame to extract.
            video_path (str): Path to the video file.

        Returns:
            str: The frame, JPEG-compressed and base64-encoded as UTF-8 text.

        Raises:
            ValueError: If ``frame_id`` is negative, the video cannot be opened,
                the frame cannot be read, or the frame cannot be JPEG-encoded.

        NOTE(review): the returned string holds the full-resolution frame as
        text. Presumably this payload is what pushed the recorded run past the
        model's token limit - confirm, and consider downscaling before encoding.
        """
        if frame_id < 0:
            raise ValueError(f"frame_id must be non-negative, got {frame_id}.")

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError(f"Error opening video file at {video_path}.")

            # Seek to the requested frame, then decode it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
            ret, frame = cap.read()
        finally:
            # Always free the capture handle, including on the error paths.
            cap.release()

        if not ret:
            raise ValueError(f"Could not read frame {frame_id}.")

        # The frame is a BGR array; compress it to JPEG before base64-encoding.
        encoded_ok, buffer = cv2.imencode('.jpg', frame)
        if not encoded_ok:
            raise ValueError(f"Could not encode frame {frame_id} as JPEG.")

        return base64.b64encode(buffer).decode('utf-8')


# Single shared instance that is handed to the task definitions.
fet = FrameExtractTool()

## Creating Tasks


In [76]:
# Task for the manager agent. It is the only task that is given the frame
# extraction tool. The prompt text is kept verbatim, including its spelling and
# the unterminated quote at the end of expected_output.
manageTask = Task(
    agent=manager,
    tools=[fet],
    description=(
        "You are responsible for handling user queries related to a video by checking previously stored memory. "
        "If enough information exists, answer the query directly. "
        "If more details are required, determine which frame (0-179) to analyze for further insights. "
        "You can only retrive maximum of 20 frames. "
        "Retrive the frame and request the Image Understanding Agent to analyze the frame and store its findings in memory. "
        "This is the question - {Question}. "
        "This is the video - {Video}."
    ),
    expected_output=(
        "If memory has sufficient information: 'Answer to the query based on existing knowledge.' "
        "If additional analysis is required: 'Identified frame number (0-179) for further analysis.' "
        "'Request sent to Image Understanding Agent to analyze the chosen frame.' "
        "If stored insights need to be retrieved: 'Request sent to Memory Retrieval Tool for relevant insights"
    ),
)

In [77]:
# Task for the image-understanding agent: describe one frame in the context of
# the user question. No tools are attached to this task.
imgTask = Task(
    agent=imgAgent,
    description=(
        "You are responsible for analyzing a specific video frame and extracting relevant information to answer a user query. "
        "Given a frame number and query context, identify and describe key details such as objects, actions, traffic signs, pedestrians, and vehicles. "
        "Store the extracted insights in memory for future use and send your findings to the Manager Agent. "
        "This is the question - {Question}"
    ),
    expected_output=(
        "Extracted insights from the specified frame, including objects, scene details, and actions. "
        "Stored memory update for future reference. "
        "Response sent to Manager Agent with extracted details."
    ),
)

## Creating the Crew

In [78]:
# Bundle both agents and both tasks into one crew. The task list keeps the
# manager task first and the image task second; memory is switched on and
# verbosity is set to level 2.
crew = Crew(
    tasks=[manageTask, imgTask],
    agents=[manager, imgAgent],
    memory=True,
    verbose=2,
)



## Running the Crew

**Note**: LLMs can provide different outputs for the same input.

In [79]:
# Values substituted for the {Question} and {Video} placeholders in the agent
# and task prompts. The question text is kept verbatim.
inputs = dict(
    Question=(
        "Was ego doing a legal maneuver if its goal is to turn right at the intersection? "
        "A. It's legal as the lane is empty. "
        "B. It's illegal as the right turn lane is bloacked by construction. "
        "C. It's illegal as ego was cutting in other vehicles that were waiting. "
        "D. It's legal but the lane ahead is way too narrow for ego to pass."
    ),
    Video="D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00001.mp4",
)

# Run the crew with these inputs; the outcome is kept for display further down.
result = crew.kickoff(inputs=inputs)

[1m[95m [DEBUG]: == Working Agent: Manager[00m
[1m[95m [INFO]: == Starting Task: You are responsible for handling user queries related to a video by checking previously stored memory. If enough information exists, answer the query directly. If more details are required, determine which frame (0-179) to analyze for further insights. You can only retrive maximum of 20 frames. Retrive the frame and request the Image Understanding Agent to analyze the frame and store its findings in memory. This is the question - Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.. This is the video - D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00001.mp4.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4o-mini in organization org-mpGUm2qVvjgIXltJxiYpFrU4 on tokens per min (TPM): Limit 60000, Requested 285493. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

- Display the final result as Markdown.

In [22]:
from IPython.display import Markdown
# Markdown() only accepts text; str() keeps this working whether kickoff()
# returned a plain string or a result object with a string representation.
Markdown(str(result))

After giving 20 apples to my sister and 10 apples to my brother, I am left with 70 apples. So, I have 70 apples left.