In [None]:
from google import genai
import pandas as pd
import random
import time
from IPython.display import Markdown
import time
from tqdm import trange
import cv2  
import base64
from openai import OpenAI

SLEEP_TIME = 0.5

GEMINI_API_KEY="***REMOVED***"
client_google = genai.Client(api_key=GEMINI_API_KEY)
client = OpenAI(api_key='***REMOVED***')

In [9]:

video_names = []
# 
for i in range(1, 51):
    video_names.append(f"tesla-real-world-video-q-a/videos/videos/{i:05d}.mp4")
    
print(video_names[-1])

tesla-real-world-video-q-a/videos/videos/00050.mp4


In [10]:
questions = pd.read_csv('tesla-real-world-video-q-a/questions.csv')
questions['answers'] = None
questions['freq_map'] = None
questions['entropy'] = None
questions.head()

Unnamed: 0,id,question,answers,freq_map,entropy
0,1,Was ego doing a legal maneuver if its goal is ...,,,
1,2,Where can ego legally park on this street? A. ...,,,
2,3,What is the best description of the maneuver e...,,,
3,4,Why is ego stopped? A. Judah. B. Traffic Light...,,,
4,5,"What is the blinker state of the oncoming car,...",,,


In [None]:
def extract_answer(text, depth = 5):
    if depth == 0:
        return "NOT FOUND"
    
    temp = client_google.models.generate_content(
        model='gemini-2.0-flash', contents=f'''Parse the correct letter choice from the text and include the final answer surrounded by <answer></answer> XML tags. 
        
        ****  ONLY INCLUDE A SINGLE LETTER WITHIN THE ANSWER TAGS ****
        
        Text to Parse:\n{text}
        '''
    ).text
    
    # print(temp)
    
    try:
        cur = temp.split('<answer>')[1].split('</answer>')[0].strip()
        assert(len(cur) == 1)
        assert(cur in ['A', 'B', 'C', 'D'])
        
        if depth != 5:
            print("BAD TEXT", text)
            print("PARSED", cur)
        
        return cur
    
    except Exception as e:
        time.sleep(SLEEP_TIME)
        print("Not found in", temp)
        return extract_answer(text, depth - 1)
    
    
print(extract_answer('''Here's the breakdown:

* **Ego's Goal:** Turn right.
* **The Lane:** Marked as a right-turn lane by the roundabout sign.

The correct answer is (B). The construction barrels clearly block the right-turn lane, making it illegal for ego to use it.  Ego should wait behind the other vehicles.'''))

B


In [13]:
def parse_question(question_str):
    Q = question_str.split('A.')[0]
    
    A_str = question_str.split(Q)[1]
    
    A_str += '&.'
    
    mp = {}
    
    temp = ['A', 'B', 'C', 'D', '&']
    
    for idx in range(len(temp)-1):
        cur_choice = temp[idx]
        nxt_choice = temp[idx+1]
        
        if nxt_choice + '.' not in A_str:
            break
        
        mp[cur_choice] = A_str.split(cur_choice + '.')[1].split(nxt_choice + '.')[0]
    
    return Q, mp

print(parse_question(questions['question'][0]))

('Was ego doing a legal maneuver if its goal is to turn right at the intersection? ', {'A': " It's legal as the lane is empty. ", 'B': " It's illegal as the right turn lane is bloacked by construction. ", 'C': " It's illegal as ego was cutting in other vehicles that were waiting. ", 'D': " It's legal but the lane ahead is way too narrow for ego to pass."})


In [16]:
def query_model(video_path, prompt):
    video = cv2.VideoCapture(video_path)
    
    base64Frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                prompt,
                *map(lambda x: {"image": x, "resize": 768}, base64Frames[0::50]),
            ],
            'detail': 'high',
        },
    ]
    params = {
        "model": "gpt-4o",
        "messages": PROMPT_MESSAGES,
    }

    result = client.chat.completions.create(**params)
    return result.choices[0].message.content

response = query_model(video_names[0], '''Is it a legal maneuver to turn right at the intersection here?? 

Options:

(A) It's legal as the lane is empty. 
(B) It's illegal as the right turn lane is bloacked by construction. 
(C) It's illegal as ego was cutting in other vehicles that were waiting. 
(D) It's legal but the lane ahead is way too narrow for ego to pass.
''')
print(response)

(B) It's illegal as the right turn lane is blocked by construction.


In [17]:
import math

def calculate_entropy(freq_map):
    total = sum(freq_map.values())
    entropy = -sum((count / total) * math.log2(count / total) for count in freq_map.values() if count > 0)
    return entropy

# Example usage:
frequency_map = {'D': 1, 'A': 3, 'C': 1}
entropy_value = calculate_entropy(frequency_map)
print("Entropy:", entropy_value)

Entropy: 1.3709505944546687


In [None]:
import random

num_shuffles = 1

def answer_question(video_path, question, choices):
    """
    Shuffles answer options while keeping a mapping from fixed labels (A, B, C, D)
    to the original answer keys. Queries the model multiple times, extracts the answer
    with extract_answer(), and uses majority voting over the original options.
    
    Args:
        video_path (str): Path to the video file.
        question (str): The question to ask.
        choices (dict): Dictionary mapping original letters (e.g., 'A', 'B', 'C', 'D')
                        to option text.
                        
    Returns:
        tuple: (majority_vote, vote_counts) where:
            - majority_vote: The original option letter with the most votes.
            - vote_counts: A dictionary counting votes for each original option.
    """
    votes = {}
    fixed_labels = ['A', 'B', 'C', 'D']
    
    for _ in range(num_shuffles):
        # Shuffle the original choices (list of tuples: (original_letter, option_text))
        shuffled_choices = list(choices.items())
        random.shuffle(shuffled_choices)
        
        # Build a mapping from fixed label to the original letter
        mapping = {}  # new label -> original letter
        prompt = question + '\n\nOptions:\n\n'
        for new_label, (orig_letter, option_text) in zip(fixed_labels, shuffled_choices):
            mapping[new_label] = orig_letter
            prompt += f"{new_label}. {option_text}\n"
        
        # Query the model and extract its answer using extract_answer()
        res = query_model(video_path=video_path, prompt=prompt)
        answer_new_label = extract_answer(res)
        
        # Convert the fixed label answer to the original letter
        if answer_new_label in mapping:
            orig_answer = mapping[answer_new_label]
            votes[orig_answer] = votes.get(orig_answer, 0) + 1
        else:
            votes["Invalid"] = votes.get("Invalid", 0) + 1
            
        time.sleep(SLEEP_TIME)
    
    # Majority voting among valid (original) options
    valid_votes = {k: v for k, v in votes.items() if k in choices}
    if valid_votes:
        majority_vote = max(valid_votes, key=valid_votes.get)
    else:
        majority_vote = "Invalid"
    
    return majority_vote, votes

# Example usage:
print(video_names[0][1:])
print('=' * 10)
print(questions.iloc[0]['question'])
print('=' * 10)
Q, C = parse_question(questions.iloc[0]['question'])
print("Parsed Question:", Q)
print("Parsed Choices:", C)

majority_answer, vote_map = answer_question(video_path=video_names[0], question=Q, choices=C)
print("Majority Answer (original index):", majority_answer)
print("Vote Map (original indices):", vote_map)

esla-real-world-video-q-a/videos/videos/00001.mp4
Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.
Parsed Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? 
Parsed Choices: {'A': " It's legal as the lane is empty. ", 'B': " It's illegal as the right turn lane is bloacked by construction. ", 'C': " It's illegal as ego was cutting in other vehicles that were waiting. ", 'D': " It's legal but the lane ahead is way too narrow for ego to pass."}
Majority Answer (original index): B
Vote Map (original indices): {'B': 1}


In [None]:
for question_number in trange(0, 50):
    if questions.iloc[question_number]['answers'] is not None:
        continue
    
    raw_video_path = video_names[question_number]
    raw_question = questions.iloc[question_number]['question']
        
    Q, C = parse_question(raw_question)
        
    answered = False
    
    while not answered:
        try: 
            model_ans, freq_mp = answer_question(video_path=raw_video_path, question=Q, choices=C)
            
            questions.at[question_number, 'reasoning'] = model_ans
            questions.at[question_number, 'entropy'] = calculate_entropy(freq_mp)
            questions.at[question_number, 'freq_map'] = freq_mp
            
            answered = True
            
        except Exception as e:
            print(f"Error on question {question_number}: {e}")
        
        print("Rate Limit? Sleeping...")
        time.sleep(3)
        
    time.sleep(SLEEP_TIME)
    
    

  0%|          | 0/50 [00:00<?, ?it/s]

Rate Limit? Sleeping...


  2%|▏         | 1/50 [00:21<17:32, 21.48s/it]

Rate Limit? Sleeping...


  4%|▍         | 2/50 [00:41<16:41, 20.87s/it]

Rate Limit? Sleeping...


  6%|▌         | 3/50 [01:02<16:15, 20.76s/it]

Rate Limit? Sleeping...


  8%|▊         | 4/50 [01:23<15:53, 20.74s/it]

Not found in LEFT is not a valid multiple-choice answer. I need the letter choice. Since no letter choice is provided, I cannot give a valid answer.

Not found in left

Rate Limit? Sleeping...


 10%|█         | 5/50 [01:47<16:32, 22.06s/it]

Rate Limit? Sleeping...


 12%|█▏        | 6/50 [02:08<15:48, 21.56s/it]

Rate Limit? Sleeping...


 14%|█▍        | 7/50 [02:29<15:21, 21.43s/it]

Rate Limit? Sleeping...


 16%|█▌        | 8/50 [02:52<15:20, 21.91s/it]

Rate Limit? Sleeping...


 18%|█▊        | 9/50 [03:14<14:59, 21.95s/it]

Rate Limit? Sleeping...


 20%|██        | 10/50 [03:37<14:47, 22.18s/it]

Rate Limit? Sleeping...


 22%|██▏       | 11/50 [03:58<14:18, 22.01s/it]

Rate Limit? Sleeping...


 24%|██▍       | 12/50 [04:20<13:52, 21.92s/it]

Rate Limit? Sleeping...


 26%|██▌       | 13/50 [04:42<13:34, 22.01s/it]

Rate Limit? Sleeping...


 28%|██▊       | 14/50 [05:04<13:12, 22.02s/it]

Rate Limit? Sleeping...


 30%|███       | 15/50 [05:26<12:45, 21.87s/it]

Rate Limit? Sleeping...


 32%|███▏      | 16/50 [05:48<12:30, 22.07s/it]

Rate Limit? Sleeping...


 34%|███▍      | 17/50 [06:10<12:01, 21.86s/it]

Rate Limit? Sleeping...


 36%|███▌      | 18/50 [06:32<11:41, 21.94s/it]

Rate Limit? Sleeping...


 38%|███▊      | 19/50 [06:53<11:09, 21.60s/it]

Not found in A

Not found in A

Not found in A

Not found in A

Not found in A

Rate Limit? Sleeping...


 40%|████      | 20/50 [07:20<11:41, 23.40s/it]

Rate Limit? Sleeping...


 42%|████▏     | 21/50 [07:41<11:00, 22.78s/it]

Not found in I cannot provide a letter choice as there was no question asked, only a statement. Therefore, I cannot parse an answer.

Rate Limit? Sleeping...


 44%|████▍     | 22/50 [08:06<10:53, 23.35s/it]

Not found in Due to the inability to determine a correct answer, I am unable to provide a letter choice.

Rate Limit? Sleeping...


 46%|████▌     | 23/50 [08:28<10:20, 22.97s/it]

Not found in a) yes
b) no

<answer>a</answer>
Not found in a) Yes
        b) No
        
        <answer>a</answer>
Not found in a) True
        b) False
        
        <answer>a</answer>
Rate Limit? Sleeping...


 48%|████▊     | 24/50 [08:54<10:22, 23.94s/it]

Rate Limit? Sleeping...


 50%|█████     | 25/50 [09:16<09:38, 23.14s/it]

Rate Limit? Sleeping...


 52%|█████▏    | 26/50 [09:38<09:10, 22.95s/it]

Rate Limit? Sleeping...


 54%|█████▍    | 27/50 [09:59<08:34, 22.37s/it]

Rate Limit? Sleeping...


 56%|█████▌    | 28/50 [10:20<08:02, 21.95s/it]

Rate Limit? Sleeping...


 58%|█████▊    | 29/50 [10:41<07:36, 21.74s/it]

Not found in Based on the images, the most likely course of action for the vehicle in front of ego is:

D. Go straight.

There are no visible indications or signals suggesting a turn, and the vehicle is positioned in the lane typically used for continuing straight.

Rate Limit? Sleeping...


 60%|██████    | 30/50 [11:06<07:29, 22.48s/it]

Rate Limit? Sleeping...


 62%|██████▏   | 31/50 [11:27<07:01, 22.18s/it]

Rate Limit? Sleeping...


 64%|██████▍   | 32/50 [11:49<06:37, 22.06s/it]

Rate Limit? Sleeping...


 66%|██████▌   | 33/50 [12:10<06:09, 21.76s/it]

Rate Limit? Sleeping...


 68%|██████▊   | 34/50 [12:32<05:50, 21.90s/it]

Not found in  The answer cannot be derived from the context.
        
         

Not found in  Given the information, there is no question to answer or multiple choices provided. Therefore, I cannot select a correct letter choice.
        
         If you provide the question with the multiple choices I can select the correct one.
        
Not found in  The answer to this question cannot be determined from the context provided.
        
        

Rate Limit? Sleeping...


 70%|███████   | 35/50 [13:12<06:48, 27.21s/it]

Rate Limit? Sleeping...


 72%|███████▏  | 36/50 [13:33<05:55, 25.36s/it]

Rate Limit? Sleeping...


 74%|███████▍  | 37/50 [13:56<05:20, 24.62s/it]

Rate Limit? Sleeping...


 76%|███████▌  | 38/50 [14:17<04:44, 23.69s/it]

Rate Limit? Sleeping...


 78%|███████▊  | 39/50 [14:40<04:16, 23.36s/it]

Rate Limit? Sleeping...


 80%|████████  | 40/50 [15:02<03:50, 23.03s/it]

Rate Limit? Sleeping...


 82%|████████▏ | 41/50 [15:27<03:31, 23.48s/it]

Rate Limit? Sleeping...


 84%|████████▍ | 42/50 [15:47<02:59, 22.42s/it]

Not found in  Airport is not a letter choice. The answer should be one of the options from A-E. There are no options listed. I am unable to provide an answer.

Rate Limit? Sleeping...


 86%|████████▌ | 43/50 [16:11<02:40, 22.98s/it]

Not found in Given the information, it's impossible to determine the correct answer.
        
        <answer>I</answer>
Not found in I cannot provide an answer without any options to choose from.

Not found in I cannot provide a valid answer as no options were provided in the prompt.
        
        
Not found in Given the lack of information, I cannot choose a correct answer.
        
        

Not found in I cannot provide an answer without the multiple choice options.

Rate Limit? Sleeping...


 88%|████████▊ | 44/50 [16:42<02:32, 25.34s/it]

Not found in A

Rate Limit? Sleeping...


 90%|█████████ | 45/50 [17:04<02:02, 24.43s/it]

Not found in I cannot provide an answer because the prompt does not contain a multiple-choice question with answer options.

Not found in I cannot provide an answer because there are no lettered choices in the prompt.

Not found in I am unable to provide an answer as there are no multiple choice options provided in the prompt.

Not found in I am unable to provide an answer because the prompt does not contain a question with answer choices.

Not found in I cannot provide an answer because no answer choices were given in the prompt.

Rate Limit? Sleeping...


 92%|█████████▏| 46/50 [17:34<01:44, 26.13s/it]

Rate Limit? Sleeping...


 94%|█████████▍| 47/50 [17:56<01:14, 24.92s/it]

Rate Limit? Sleeping...


 96%|█████████▌| 48/50 [18:18<00:47, 23.91s/it]

Not found in This statement does not provide a multiple-choice question or options to choose from. Therefore, I cannot parse a correct letter choice.

Not found in This statement does not provide enough information to answer a question. Therefore, I cannot provide a letter choice.

Not found in This statement is a simple observation and does not require choosing from multiple options. Therefore, I cannot provide a letter choice.

Not found in This statement is a simple observation and doesn't require a choice. Therefore, I cannot provide a letter choice.

Not found in This statement does not provide a question or choices to parse. Therefore, I cannot provide a letter choice as an answer.

Rate Limit? Sleeping...


 98%|█████████▊| 49/50 [18:46<00:25, 25.31s/it]

Rate Limit? Sleeping...


100%|██████████| 50/50 [19:10<00:00, 23.00s/it]


In [25]:
print(questions.head())

   id                                           question answers  freq_map  \
0   1  Was ego doing a legal maneuver if its goal is ...    None  {'B': 1}   
1   2  Where can ego legally park on this street? A. ...    None  {'B': 1}   
2   3  What is the best description of the maneuver e...    None  {'D': 1}   
3   4  Why is ego stopped? A. Judah. B. Traffic Light...    None  {'B': 1}   
4   5  What is the blinker state of the oncoming car,...    None  {'A': 1}   

  entropy reasoning  
0    -0.0         B  
1    -0.0         B  
2    -0.0         D  
3    -0.0         B  
4    -0.0         A  


# Parse Questions

In [31]:
letter_answers = []
for i in questions['reasoning']:
    letter_answers.append(i)
    
print(letter_answers)

temp = []
for i in letter_answers:
    if i not in ['A', 'B', 'C', 'D']:
        temp.append('C')
    else:
        temp.append(i)
        
letter_answers = temp
print(letter_answers)

['B', 'B', 'D', 'B', 'A', 'D', 'A', 'A', 'B', 'A', 'D', 'B', 'C', 'C', 'B', 'B', 'D', 'C', 'D', 'Invalid', 'B', 'B', 'B', 'A', 'A', 'C', 'A', 'A', 'B', 'B', 'C', 'B', 'C', 'D', 'D', 'A', 'D', 'B', 'B', 'B', 'D', 'B', 'A', 'Invalid', 'C', 'Invalid', 'D', 'D', 'Invalid', 'D']
['B', 'B', 'D', 'B', 'A', 'D', 'A', 'A', 'B', 'A', 'D', 'B', 'C', 'C', 'B', 'B', 'D', 'C', 'D', 'C', 'B', 'B', 'B', 'A', 'A', 'C', 'A', 'A', 'B', 'B', 'C', 'B', 'C', 'D', 'D', 'A', 'D', 'B', 'B', 'B', 'D', 'B', 'A', 'C', 'C', 'C', 'D', 'D', 'C', 'D']


In [32]:
dataframe_A, dataframe_B = [], []

idx = 1
while idx <= 251:
    dataframe_A.append(idx)
    
    if idx-1 < len(letter_answers):
        dataframe_B.append(letter_answers[idx-1])
    else:
        dataframe_B.append('A')
    idx += 1

In [33]:
answers = pd.DataFrame({'id': dataframe_A, 'answer': dataframe_B})


In [34]:
answers.to_csv('baseline_answers.csv', index=False)
answers.head()

Unnamed: 0,id,answer
0,1,B
1,2,B
2,3,D
3,4,B
4,5,A
