In [3]:
from langchain_groq import ChatGroq

In [4]:
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_groq import ChatGroq
store = {}  # memory is maintained outside the chain

def get_session_history(session_id: str) -> InMemoryChatMessageHistory:
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=api_key, max_tokens=100)
# llm = ChatGroq(groq_api_key=api_key, model_name="llama-3.2-11b-vision-preview")

In [5]:
import base64
import cv2

class FrameExtractTool():
    name: str = "Frame Extraction Tool"
    description: str = (
        "This tool extracts a specific frame from a given video file based on the frame ID "
        "and returns the frame as a base64-encoded image. This can be useful for extracting "
        "individual frames from a video for further analysis, display, or processing."
    )

    def _run(self, frame_id: int, video_path: str) -> str:
        """
        Extracts a specific frame from the video and returns the frame as a base64-encoded image.

        Args:
            frame_id (int): The index of the frame to be extracted.
            video_path (str): The path to the video file.

        Returns:
            str: A base64-encoded string representing the extracted frame.
        """
        # Open the video file
        cap = cv2.VideoCapture(video_path)
        
        # Check if the video opened successfully
        if not cap.isOpened():
            raise ValueError(f"Error opening video file at {video_path}.")
        
        # Set the video capture position to the frame ID
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
        
        # Read the frame
        ret, frame = cap.read()
        
        if not ret:
            raise ValueError(f"Could not read frame {frame_id}.")
        
        # Release the video capture object
        cap.release()
        # plt.imshow(frame)
        # Convert the frame (BGR format) to JPEG and encode it as base64
        _, buffer = cv2.imencode('.jpg', frame)  # Convert to JPEG format
        base64_image = base64.b64encode(buffer).decode('utf-8')  # Convert to base64 string
        
        return base64_image
    
fet=FrameExtractTool()
# vid_path = "D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00001.mp4"
# question = "Where can ego legally park on this street? A. No parking anywhere. B. next to right curb. C. anywhere. D. next to left curb."

In [6]:
# first_prompt=f"""You are an intelligent assistant designed to help a fully autonomous vehicle make decisions based on video frame inputs and multiple-choice questions. For the given video frames, you need to analyse the current situation, and then based on the multiple choice question, select the best course of action from the multiple-choice options.

# I will provide you successive image frames turn by turn. After you analyze all the frames, u have to come up with the correct answer to the multiple choice question.

# Multiple Choice question: {question}
# As a strategy, generate a checklist of requirements you need to observe in the images to answer the question confidently, for ex. keeping track of the number of cars entering the frame or the signboards encountered.
# Then for each turn where I provide the images maintain a summary of the observations you made in the checklist and try to approach an answer.
# Try to keep it as brief as possible and try not to lose any information. Use phrases in place of sentences if necessary.
# """

# final_prompt=f"""Now using the knowledge you have gained answer the multiple choice question: {question} with the most probable answer. If you cant see a signboard or any information seems unclear assume it affects our question and make decisions accordingly. Keep in mind only to output the exact option letter and nothing else"""

In [7]:
# first_message = HumanMessage(
#         content=[
#             {"type": "text", "text": first_prompt}
#         ],
#     )
# final_message = HumanMessage(
#         content=[
#             {"type": "text", "text": final_prompt}
#         ],
#     )

In [8]:
def get_i_frame(i, vid_path):
    image_dat=fet._run(i, vid_path)
    message = HumanMessage(
        role="user",
        content=[
            {"type": "text", "text": "Now I'm gonna provide next image in sequence. Answer the question based on the images provided."},
            {
                "type": "image_url",
                "image_url": {"url":f"data:image/jpeg;base64,{image_dat}"},
                
            },
        ]
    )
    # print(message.keys())
    return message

In [9]:
def run_chain_once(chain, message, config):
    response = chain.invoke(
        message,
        config=config,
    )
    return response
# chain.invoke(
#     first_prompt,
#     # message,
#     config={"configurable": {"session_id": "1"}},
# )  # session_id determines thread
# run_chain_once(chain, first_message, config)
# run_chain_once(chain, get_i_frame(-3), config)
# run_chain_once(chain, final_message, config)

In [10]:
from tqdm import tqdm
def solveqn(vid_path, question):
    frames_to_check=list(range(0, 180, 36))

    first_prompt=f"""You are an intelligent assistant designed to help a fully autonomous vehicle make decisions based on video frame inputs and multiple-choice questions. For the given video frames, you need to analyse the current situation, and then based on the multiple choice question, select the best course of action from the multiple-choice options.

    I will provide you successive image frames turn by turn. After you analyze all the frames, u have to come up with the correct answer to the multiple choice question.

    Multiple Choice question: {question}
    As a strategy, generate a checklist of requirements you need to observe in the images to answer the question confidently, for ex. keeping track of the number of cars entering the frame or the signboards encountered.
    Then for each turn where I provide the images maintain a summary of the observations you made in the checklist and try to approach an answer.
    Try to keep it as brief as possible and try not to lose any information. Use phrases in place of sentences if necessary.
    """

    final_prompt=f"""Now using the knowledge you have gained answer the multiple choice question: {question} with the most probable answer. If you cant see a signboard or any information seems unclear assume it affects our question and make decisions accordingly. Keep in mind only to output the exact option letter and nothing else"""

    first_message = HumanMessage(
        content=[
            {"type": "text", "text": first_prompt}
        ],
    )
    final_message = HumanMessage(
        content=[
            {"type": "text", "text": final_prompt}
        ],
    )

    chain = RunnableWithMessageHistory(llm, get_session_history)
    config={"configurable": {"session_id": "1"}}
    run_chain_once(chain, first_message, config)
    for i in tqdm(frames_to_check):
        # image_dat=fet._run(i, vid_path)
        
        run_chain_once(chain, get_i_frame(0, vid_path=vid_path), config)

    return run_chain_once(chain, final_message, config)


In [None]:
# solveqn("D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00002.mp4", "Where can ego legally park on this street? A. No parking anywhere. B. next to right curb. C. anywhere. D. next to left curb.")
# solveqn("D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00050.mp4", "When the light turns green, can ego traverse straight through the intersection and why? A. Yes, this is a legal maneuver. B. No, there is construction ahead. C. No, there is a sign that says yield to pedestrians. D. No, there is a sign that says all traffic must turn.")
solveqn("D:/tree/tesla/tesla-real-world-video-q-a/videos/videos/00049.mp4", 

100%|██████████| 5/5 [00:45<00:00,  9.18s/it]


AIMessage(content='D', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 2, 'prompt_tokens': 5892, 'total_tokens': 5894, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 5632}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_13eed4fce1', 'finish_reason': 'stop', 'logprobs': None}, id='run-077a69f5-0471-42ef-909a-135a3ce6617b-0', usage_metadata={'input_tokens': 5892, 'output_tokens': 2, 'total_tokens': 5894, 'input_token_details': {'audio': 0, 'cache_read': 5632}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [22]:
import pandas as pd

# Load the dataset
df = pd.read_csv("questions.csv")  # Replace with the actual path to your CSV
start=41
end=50
# Ensure id is treated as an integer to sort in descending order
df["id"] = df["id"].astype(int)
df = df.sort_values(by="id")
df=df[start:end]

In [None]:

# Define the folder path where videos are stored
folder_path = "./videos/videos"  # Replace with actual folder path

# Output storage
output_data = []

# Iterate over dataset from highest id to lowest
for _, row in df.iterrows():
    print(row)
    filename = f"{folder_path}/{row['id']:05d}.mp4"
    question = row["question"]
    output_object = solveqn(filename, question)
    print(output_object.content, row['id'])
    output_data.append([row['id'], output_object.content])

# Save output to CSV
output_df = pd.DataFrame(output_data, columns=["id", "output_content"])
output_df.to_csv("output_results.csv", index=False)


id                                                         42
question    Which traffic light is relevant to ego's curre...
Name: 41, dtype: object


100%|██████████| 5/5 [02:46<00:00, 33.27s/it]


B 42
id                                                         43
question    What type of building is the road sign indicat...
Name: 42, dtype: object


100%|██████████| 5/5 [03:17<00:00, 39.58s/it]


A 43
id                                                         44
question    Why is it appropriate for ego to remain stoppe...
Name: 43, dtype: object


100%|██████████| 5/5 [03:29<00:00, 41.88s/it]


A 44
id                                                         45
question    Which lane is blocked by construction? A. Left...
Name: 44, dtype: object


100%|██████████| 5/5 [04:26<00:00, 53.32s/it]


A 45
id                                                         46
question    What is closest to the average speed of the tr...
Name: 45, dtype: object


100%|██████████| 5/5 [05:01<00:00, 60.33s/it]


C 46
id                                                         47
question    Why is ego slowing down? A. pedestrian on cros...
Name: 46, dtype: object


100%|██████████| 5/5 [05:41<00:00, 68.21s/it]


D 47
id                                                         48
question    What can ego do next while the light is still ...
Name: 47, dtype: object


  0%|          | 0/5 [00:00<?, ?it/s]