In [None]:
pip install torch install torchvision transformers timm fairscale imageio 'imageio[ffmpeg]' opencv-python

Collecting torch
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fi

In [1]:
import cv2
import imageio

def extract_frames(video_path, frame_rate=1):
    cap = cv2.VideoCapture(video_path)
    frame_list = []
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)

    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_list.append(frame_rgb)
        frame_count += 1
    cap.release()
    return frame_list

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
video_path = '/content/drive/MyDrive/tesla/tesla-real-world-video-q-a/videos/videos/00001.mp4'
frames = extract_frames(video_path, frame_rate=10)
print(f"Extracted {len(frames)} frames.")

Extracted 60 frames.


In [None]:
caption_frames = extract_frames(video_path, frame_rate=1)
print(f"Extracted {len(caption_frames)} frames.")

Extracted 5 frames.


In [23]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

# Load VideoBLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

def describe_frames(frames):
    descriptions = []
    for frame in frames:
        image = Image.fromarray(frame)
        inputs = processor(images=image, return_tensors="pt").to("cuda")
        description = model.generate(**inputs, max_length=50)
        text = processor.batch_decode(description, skip_special_tokens=True)[0]
        descriptions.append(text)
    return descriptions

descriptions = describe_frames(frames)

# Combine descriptions into a summary
video_description = " ".join(descriptions)
print("Detailed Video Description:", video_description)


Detailed Video Description: cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car drivin

In [24]:
video_description

'cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides cars are driving down a road with construction cones on both sides there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange cones there is a white car driving down the road with orange

In [None]:
api = 'API'

import openai

# Set up OpenAI client (replace with your API key)
client = openai.OpenAI(api_key=api)

# Function to ask OpenAI a question
def ask_openai(question, model="gpt-4o"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": question}]
    )
    return response.choices[0].message.content

In [26]:
clean_question = f'can you help to clean this description, get rid of the repeat sentence. make it consice? {video_description}'
clean_answer = ask_openai(clean_question)
print("AI:", clean_answer)

AI: Cars are driving down a road lined with construction cones, and a white car is among them. Many cars are parked on the side near a construction site.


In [27]:
import pandas as pd

tesla_questions = pd.read_csv('/content/drive/MyDrive/tesla/tesla-real-world-video-q-a/questions.csv')
tesla_questions = tesla_questions.drop(columns = 'id')

In [28]:
tesla_questions['question'][0]

"Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass."

In [30]:
tesla_question = tesla_questions['question'][0]

In [31]:
# Example usage
question = f"Given the following description i got from a videoblip processor: '{clean_answer}' based on the scenario illustrated in the above text, consider the following question and pick the best ansewer from the choices: '{tesla_question}'"
answer = ask_openai(question)
print("AI:", answer)

AI: Based on the scenario described, the best answer would be: B. It's illegal as the right turn lane is blocked by construction.


---

In [None]:
# integrated
import torch
from PIL import Image
import pandas as pd
import openai
from transformers import BlipProcessor, BlipForConditionalGeneration
import cv2
import os
import re

class VideoQAProcessor:
    def __init__(self, api_key, questions_path):
        # Initialize OpenAI
        self.client = openai.OpenAI(api_key=api_key)

        # Initialize BLIP
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

        # Load questions
        self.questions_df = pd.read_csv(questions_path)
        self.questions_df = self.questions_df.drop(columns='id')

    def extract_frames(self, video_path, frame_rate=5):
        cap = cv2.VideoCapture(video_path)
        frame_list = []
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps / frame_rate)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_list.append(frame_rgb)
            frame_count += 1
        cap.release()
        return frame_list

    def describe_frames(self, frames):
        descriptions = []
        for frame in frames:
            image = Image.fromarray(frame)
            inputs = self.processor(images=image, return_tensors="pt").to("cuda")
            description = self.model.generate(**inputs, max_length=50)
            text = self.processor.batch_decode(description, skip_special_tokens=True)[0]
            descriptions.append(text)
        return " ".join(descriptions)

    def clean_description(self, description):
        question = f"Can you help to clean this description, get rid of the repeat sentences and make it concise? {description}"
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": question}]
        )
        return response.choices[0].message.content

    def get_answer(self, description, question):
        prompt = f"Given the following description from a video: '{description}' based on the scenario illustrated in the above text, consider the following question and pick the best answer from the choices. Just return the single letter choice (A, B, C, D, or E): '{question}'"
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content.strip()

    def clean_answer(self, answer):
        # Extract just the letter answer using regex
        match = re.search(r'[A-E]', answer)
        return match.group(0) if match else answer

    def process_video(self, video_path):
        # Extract frames
        frames = self.extract_frames(video_path)

        # Get video description
        raw_description = self.describe_frames(frames)
        clean_description = self.clean_description(raw_description)
        return clean_description

    def input_to_chat(self, clean_description, video_id)
        # Get answer for the corresponding question
        question = self.questions_df.loc[int(video_id) - 1, 'question']
        answer = self.get_answer(clean_description, question)
        clean_answer = self.clean_answer(answer)

        return clean_answer

def process_all_videos(videos_dir, output_path, api_key, questions_path):
    # Initialize processor
    processor = VideoQAProcessor(api_key, questions_path)

    # Initialize results list
    results = []

    # Process each video
    sample = 51
    test = 252
    for i in range(1, sample):  # Process videos 00001 to 00050
        video_id = f"{i:05d}"  # Format to 5 digits with leading zeros
        video_path = os.path.join(videos_dir, f"{video_id}.mp4")

        if os.path.exists(video_path):
            print(f"Processing video {video_id}...")
            try:
                clean_description = processor.process_video(video_path)
                # answer = processor.input_to_chat(clean_description, video_id)
                # results.append({'id': video_id, 'answer': answer})
                results.append({'id': video_id, 'clean_description': clean_description})
                print(f"Video {video_id}: Answer = {answer}")
            except Exception as e:
                print(f"Error processing video {video_id}: {str(e)}")

    # Create and save DataFrame
    df = pd.DataFrame(results)
    # df.to_csv(output_path, index=False)

    return df

# Example usage
api_key = 'API'
videos_dir = "/content/drive/MyDrive/tesla/tesla-real-world-video-q-a/videos/videos"
questions_path = "/content/drive/MyDrive/tesla/tesla-real-world-video-q-a/questions.csv"
# output_path = "tesla_answers.csv"
output_path = "clean_descriptions.csv"


df = process_all_videos(videos_dir, output_path, api_key, questions_path)
print(df.to_string(index=False))

Processing video 00001...
Video 00001: Answer = B.
Processing video 00002...
Video 00002: Answer = A.
Processing video 00003...
Video 00003: Answer = D
Processing video 00004...
Video 00004: Answer = B. Traffic Light.
Processing video 00005...
Video 00005: Answer = D
Processing video 00006...
Video 00006: Answer = D.
Processing video 00007...
Video 00007: Answer = A
Processing video 00008...
Video 00008: Answer = A
Processing video 00009...
Video 00009: Answer = A.
Processing video 00010...
Video 00010: Answer = A. 2.
Processing video 00011...
Video 00011: Answer = C
Processing video 00012...
Video 00012: Answer = C
Processing video 00013...
Video 00013: Answer = E
Processing video 00014...
Video 00014: Answer = D
Processing video 00015...
Video 00015: Answer = B
Processing video 00016...
Video 00016: Answer = D
Processing video 00017...
Video 00017: Answer = D. Snow.
Processing video 00018...
Video 00018: Answer = D
Processing video 00019...
Video 00019: Answer = B. 5 and the right mo

In [37]:
def clean_answer(answer):
    # Extract just the letter answer using regex
    import re
    match = re.search(r'[A-E]', answer)
    return match.group(0) if match else answer

# Apply to your DataFrame
df['answer'] = df['answer'].apply(clean_answer)

---