In [None]:
print('Hello World')

In [3]:
pip install transformers torch torchvision openai

Collecting torchvision
  Downloading torchvision-0.21.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.1 kB)
Downloading torchvision-0.21.0-cp310-cp310-manylinux1_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m29.6 kB/s[0m eta [36m0:00:00[0m00:06[0m00:18[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.21.0
Note: you may need to restart the kernel to use updated packages.


In [88]:
import torch
import torchvision.transforms as transforms
import clip
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import openai
import cv2
import os
import csv

MY_KEY = "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"

# OpenAI API Key (replace with your actual key)
client = openai.OpenAI(api_key=MY_KEY)

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def detect_objects_with_clip(frame):
    """
    Uses CLIP to detect objects in multiple video frames.
    """
    detected_objects = []  # Store detected objects for each frame

    # Define relevant road objects
    object_classes = [
        "A pedestrian crossing the street",
        "A red traffic light",
        "A green traffic light",
        "A stop sign",
        "A yield sign",
        "Snow",
        "Mud",
        "Oil"
        "Railroads",
        "Airport sign",
        "A speed limit sign",
        "A one way sign pointing right"
        "A one way sign pointing left"
        "A do not enter sign"
        "A wrong way sign"
        "A cyclist on the road",
        "Traffic cone"
        "A parked vehicle",
        "A pedestrian waiting at the crosswalk",
        "A broken traffic light",
    ]

    # Tokenize all object classes once (to save processing time)
    text_inputs = clip.tokenize(object_classes).to(device)

    image_input = preprocess(frame).unsqueeze(0).to(device)  # Preprocess each frame

    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_inputs)

        # Compute similarity
        similarities = (image_features @ text_features.T).softmax(dim=-1)
        best_matches = similarities.topk(5)  # Get top 3 detected objects for each frame

    detected_objects = [object_classes[idx] for idx in best_matches.indices[0].tolist()]

    return detected_objects # Returns a list of object lists


def get_video_description(frame):
    """
    Generates a textual description from extracted video frames using BLIP-2.
    """
    if frame is None:
        return None  # Skip processing if frames couldn't be extracted

    inputs = processor(images=frame, return_tensors="pt").to(device)
    
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=50)
        video_description = processor.batch_decode(generated_ids, skip_special_tokens=True)

    
    return " ".join(video_description)

def generate_chatgpt_response(question, video_summary):
    """
    Uses the video description to query ChatGPT.
    """
    
    prompt = f"Given the information provided in each frame, give me the letter of your best answer to the following question with a one sentence justification: {question}\n\nVideo summary: {video_summary}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers questions based on detailed frame-by-frame video analysis with respect to a car-based ego."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

# Example Usage

def process_inputs(video_list, question_list):
    """
    Processes videos **one by one** and pairs each video with its corresponding question.
    """
    results = {}
    num_videos = min(len(video_list), len(question_list))  # Prevent index errors

    for i in range(num_videos):
        video_path = video_list[i]
        question = question_list[i]

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{num_videos}: {video_path}")
        print(f"Question: {question}")

        frames = extract_video_frames(video_path)
        frame_descriptions = []  # Store formatted descriptions for each frame

        if not frames:
            print(f"Skipping {video_path}: No frames extracted")
            continue

        for frame_idx, frame in enumerate(frames):
            detected_objects = detect_objects_with_clip(frame)
            video_description = get_video_description(frame)

            # Format output for this frame
            frame_info = f"Frame {frame_idx+1}:\n"
            frame_info += f" - Detected Objects: {', '.join(detected_objects) if detected_objects else 'None'}\n"
            frame_info += f" - Description: {video_description}\n"

            frame_descriptions.append(frame_info)

        # Convert frame descriptions into a single formatted text block
        formatted_frame_descriptions = "\n".join(frame_descriptions)
        print(formatted_frame_descriptions)

        # Send to ChatGPT for reasoning
        answer = generate_chatgpt_response(question, formatted_frame_descriptions)

        results[video_path] = {
            "question": question,
            "description": formatted_frame_descriptions,
            "answer": answer
        }

        print(f"Completed: {video_path}")
        print(f"Answer: {answer}")

    return results


# Load Videos
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row if it exists
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
video_results = process_inputs(video_files, questions)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.91it/s]



Processing 1/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.
Frame 1:
 - Detected Objects: A car merging into a lane, A parked vehicle, A construction barrier, An emergency vehicle, A pedestrian waiting at the crosswalk
 - Description: a car driving down a road with construction cones and traffic cones


Frame 2:
 - Detected Objects: A parked vehicle, A car merging into a lane, A construction barrier, An emergency vehicle, A pedestrian waiting at the crosswalk
 - Description: a car driving down a road with construction cones and a traffic light


Frame 3:
 - Detected Objects: A parked vehicle, An emergency vehicle, A car merging into a lane, A 

In [91]:
import torch
import torchvision.transforms as transforms
import clip
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import openai
import cv2
import os
import csv
import pandas as pd

MY_KEY = "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"

# OpenAI API Key (replace with your actual key)
client = openai.OpenAI(api_key=MY_KEY)

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def detect_objects_with_clip(frame):
    """
    Uses CLIP to detect objects in multiple video frames.
    """
    detected_objects = []  # Store detected objects for each frame

    # Define relevant road objects
    object_classes = [
        "A pedestrian crossing the street",
        "A red traffic light",
        "A green traffic light",
        "A stop sign",
        "A yield sign",
        "Snow",
        "Mud",
        "Oil",
        "Railroads",
        "Airport sign",
        "A speed limit sign",
        "A right one way sign",
        "A left one way sign",
        "A do not enter sign",
        "A wrong way sign",
        "A cyclist on the road",
        "A bike lane",
        "Traffic cone",
        "A parked vehicle",
        "A pedestrian waiting at the crosswalk",
        "A broken traffic light",
    ]

    # Tokenize all object classes once (to save processing time)
    text_inputs = clip.tokenize(object_classes).to(device)

    image_input = preprocess(frame).unsqueeze(0).to(device)  # Preprocess each frame

    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_inputs)

        # Compute similarity
        similarities = (image_features @ text_features.T).softmax(dim=-1)
        best_matches = similarities.topk(5)  # Get top 3 detected objects for each frame

    detected_objects = [object_classes[idx] for idx in best_matches.indices[0].tolist()]

    return detected_objects # Returns a list of object lists


def get_video_description(frame):
    """
    Generates a textual description from extracted video frames using BLIP-2.
    """
    if frame is None:
        return None  # Skip processing if frames couldn't be extracted

    inputs = processor(images=frame, return_tensors="pt").to(device)
    
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=50)
        video_description = processor.batch_decode(generated_ids, skip_special_tokens=True)

    
    return " ".join(video_description)

def generate_chatgpt_response(question, video_summary):
    """
    Uses the video description to query ChatGPT.
    """
    
    prompt = f"Given the information provided in each frame, give me only the letter of your best answer to the following question: {question}\n\nVideo summary: {video_summary}"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers questions based on detailed frame-by-frame video analysis with respect to a car-based ego."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

def save_results_to_csv(results, output_csv="/mnt/data/output.csv"):
    """
    Saves processed video results to a CSV file using Pandas.

    :param results: List of tuples (video_id, answer).
    :param output_csv: Path for the output CSV file.
    :return: File path of the saved CSV file.
    """
    # Convert results to a Pandas DataFrame
    df = pd.DataFrame(results, columns=["id", "answer"])

    # Save to CSV
    df.to_csv(output_csv, index=False)  # `index=False` prevents adding an extra index column

    print(f"Results saved to {output_csv}")
    return output_csv  # Return the file path

# Example Usage

def process_inputs(video_list, question_list):
    """
    Processes videos **one by one** and pairs each video with its corresponding question.
    """
    results = []
    num_videos = min(len(video_list), len(question_list))  # Prevent index errors

    for i in range(num_videos):
        video_path = video_list[i]
        question = question_list[i]

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{num_videos}: {video_path}")
        print(f"Question: {question}")

        frames = extract_video_frames(video_path)
        frame_descriptions = []  # Store formatted descriptions for each frame

        if not frames:
            print(f"Skipping {video_path}: No frames extracted")
            continue

        for frame_idx, frame in enumerate(frames):
            detected_objects = detect_objects_with_clip(frame)
            video_description = get_video_description(frame)

            # Format output for this frame
            frame_info = f"Frame {frame_idx+1}:\n"
            frame_info += f" - Detected Objects: {', '.join(detected_objects) if detected_objects else 'None'}\n"
            frame_info += f" - Description: {video_description}\n"

            frame_descriptions.append(frame_info)

        # Convert frame descriptions into a single formatted text block
        formatted_frame_descriptions = "\n".join(frame_descriptions)

        # Send to ChatGPT for reasoning
        answer = generate_chatgpt_response(question, formatted_frame_descriptions)
        answer = answer[0] if len(answer) >= 2 else answer

        # Extract filename without directory
        filename = os.path.basename(video_path)  # e.g., "video_0001.mp4"
        
        # Extract the numeric identifier (assumes format "video_0001.mp4")
        video_id = filename.split("_")[-1].split(".")[0]  # Extracts "0001"
        
        print((video_id, answer))
        results.append((video_id, answer))
    
    return results


# Load Videos
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row if it exists
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
video_results = process_inputs(video_files, questions)

csv_file_path = save_results_to_csv(video_results)

# Show the file path
csv_file_path

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.60it/s]



Processing 1/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.
('00001', 'B')

Processing 2/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00002.mp4
Question: Where can ego legally park on this street? A. No parking anywhere. B. next to right curb. C. anywhere. D. next to left curb.
('00002', 'A')

Processing 3/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00003.mp4
Question: What is the best description of the maneuver ego just did? A. Lane change to the left and then lane change to the right. B. Lane change to the right and then lane change to the left. C. Staying in a lane which curves to the left and then to the right. D. Staying in a 

[('00001', 'B'),
 ('00002', 'A'),
 ('00003', 'C'),
 ('00004', 'C'),
 ('00005', 'D'),
 ('00006', 'B'),
 ('00007', 'B'),
 ('00008', 'A'),
 ('00009', 'A'),
 ('00010', 'B'),
 ('00011', 'C'),
 ('00012', 'C'),
 ('00013', 'A'),
 ('00014', 'D'),
 ('00015', 'B'),
 ('00016', 'B'),
 ('00017', 'D'),
 ('00018', 'B'),
 ('00019', 'A'),
 ('00020', 'C'),
 ('00021', 'B'),
 ('00022', 'D'),
 ('00023', 'D'),
 ('00024', 'A'),
 ('00025', 'B'),
 ('00026', 'B'),
 ('00027', 'C'),
 ('00028', 'C'),
 ('00029', 'C'),
 ('00030', 'B'),
 ('00031', 'C'),
 ('00032', 'B'),
 ('00033', 'C'),
 ('00034', 'D'),
 ('00035', 'A'),
 ('00036', 'A'),
 ('00037', 'A'),
 ('00038', 'D'),
 ('00039', 'A'),
 ('00040', 'B'),
 ('00041', 'B'),
 ('00042', 'B'),
 ('00043', 'B'),
 ('00044', 'A'),
 ('00045', 'D'),
 ('00046', 'B'),
 ('00047', 'A'),
 ('00048', 'D'),
 ('00049', 'B'),
 ('00050', 'B')]

How It Works
1. Extracts 8 frames from a 5-second video.
2. Encodes frames using BLIP-2’s Vision Transformer (ViT).
3. Aggregates frame embeddings (mean pooling).
4. Sends embeddings + question to ChatGPT.
5. ChatGPT generates an answer based on the video context.

In [None]:
import torch
import torchvision.transforms as transforms
import clip
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import openai
import cv2
import os
import csv

class FrameAnalysis:
    """
    Stores detected objects and a textual description for a single video frame.
    """

    def __init__(self, detected_objects, description):
        """
        Initializes the FrameAnalysis object.

        :param detected_objects: List of objects detected in the frame.
        :param description: Textual description of the frame.
        """
        self.detected_objects = detected_objects  # List of detected objects (CLIP)
        self.description = description  # Frame description (BLIP-2)

    def __repr__(self):
        """
        Returns a string representation of the FrameAnalysis object.
        """
        return f"FrameAnalysis(Objects: {self.detected_objects}, Description: {self.description})"

MY_KEY = "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"

# OpenAI API Key (replace with your actual key)
client = openai.OpenAI(api_key=MY_KEY)

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def detect_objects_with_clip(frame):
    """
    Uses CLIP to detect objects in multiple video frames.
    """
    detected_objects = []  # Store detected objects for each frame

    # Define relevant road objects
    object_classes = [
        "A pedestrian crossing the street",
        "A red traffic light",
        "A green traffic light",
        "A stop sign",
        "A yield sign",
        "Snow",
        "Airplane sign",
        "A car merging into a lane",
        "A construction barrier",
        "A one way sign",
        "An accident scene",
        "A speed limit sign",
        "A cyclist on the road",
        "A parked vehicle",
        "A pedestrian waiting at the crosswalk",
        "A broken traffic light",
    ]

    # Tokenize all object classes once (to save processing time)
    text_inputs = clip.tokenize(object_classes).to(device)

    image_input = preprocess(frame).unsqueeze(0).to(device)  # Preprocess each frame

    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_inputs)

        # Compute similarity
        similarities = (image_features @ text_features.T).softmax(dim=-1)
        best_matches = similarities.topk(3)  # Get top 3 detected objects for each frame

    detected_objects = [object_classes[idx] for idx in best_matches.indices[0].tolist()]

    return detected_objects # Returns a list of object lists


def get_video_description(frame):
    """
    Generates a textual description from extracted video frames using BLIP-2.
    """
    if frame is None:
        return None  # Skip processing if frames couldn't be extracted

    inputs = processor(images=frame, return_tensors="pt").to(device)
    
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=50)
        video_description = processor.batch_decode(generated_ids, skip_special_tokens=True)

    
    return " ".join(video_description)

def generate_chatgpt_response(question, video_summary, detected_objects):
    """
    Uses the video description to query ChatGPT.
    """
    
    prompt = f"""
    The following objects were detected in the car's driving scene using CLIP:
    {', '.join(detected_objects)}

    Additionally, BLIP-2 generated this description of the scene:
    "{blip_description}"

    Based on this information, describe what is happening in the scene. 
    Consider potential interactions, road risks, and the safest actions for a self-driving car.
    """

    prompt = f"Given the information provided in each frame, give me the letter of your best answer to the following question with a one sentence justification: {question}\n\nVideo summary: {video_summary}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers questions based on detailed frame-by-frame video analysis."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

# Example Usage

def process_inputs(video_list, question_list):
    """
    Processes videos **one by one** and pairs each video with its corresponding question.
    """
    results = {}
    # Ensure we only loop through the shortest list length to prevent index errors
    num_videos = min(len(video_list), len(question_list))

    for i in range(num_videos):
        video_path = video_list[i]
        question = question_list[i]
        frame_discriptions = {}

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{num_videos}: {video_path}")
        print(f"Question: {question}")

        frames = extract_video_frames(video_path)
       
        for frame in frames:
            detected_objects = detect_objects_with_clip(frame)
            video_description = get_video_description(frame)
            print(video_description)
            frame_data = (detected_objects, video_description)
            print(frame_data)
            frame_discriptions.append(frame_data)

        if video_description is None:
            print(f"Skipping {video_path}: No frames extracted")
            continue


        answer = generate_chatgpt_response(question, frame_discriptions)

        results[video_path] = {
            "question": question,
            "description": video_description,
            "answer": answer
        }

        print(f"Completed: {video_path}")
        print(f"Answer: {answer}")

    return results

# Load Videos
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row if it exists
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
video_results = process_inputs(video_files, questions)

In [16]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import openai  # For ChatGPT API calls
import cv2
import os

# OpenAI API Key (replace with your own key)
openai.api_key = "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"

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")  # Or use 'blip2-flan-t5-xl' for T5-based
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def get_video_description(video_path):
    """
    Extracts a single video embedding by processing multiple frames with BLIP-2's ViT.
    """
    frames = extract_video_frames(video_path)
    inputs = processor(images=frames, return_tensors="pt").to(device)
    with torch.no_grad():
        video_embeds = model.vision_model(**inputs).last_hidden_state  # Extract ViT embeddings
        video_embeds = video_embeds.mean(dim=1)  # Mean pooling across frames
    
    # Pass the visual embeddings to the text generation model (BLIP-2 Decoder) to generate text
    generated_ids = model.generate(video_embeds, max_length=50)  # Generate description with a max token limit
    video_description = processor.decode(generated_ids[0], skip_special_tokens=True)
    return video_description  # Return generated textual summary (description) of the video

def generate_chatgpt_response(question, video_embeds):
    """
    Uses video embeddings as context and queries ChatGPT.
    """
    video_context = video_embeds.cpu().numpy().tolist()  # Convert tensor to list for API
    prompt = f"Answer this question based on the given video embeddings:\n\n{question}"

    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers video-related questions."},
            {"role": "user", "content": prompt},
            {"role": "user", "content": f"Video embedding: {video_context}"}
        ],
        max_tokens=100
    )
    return response["choices"][0]["message"]["content"]

# Example Usage
video_path = "/home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4"
if os.path.exists(video_path):
    print(f"File exists: {video_path}")
else:
    print(f"File does not exist: {video_path}")
question = "What is happening in this video?"

video_summary = get_video_description(video_path)
answer = generate_chatgpt_response(question, video_summary)

print(f"Q: {question}\nA: {answer}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.37it/s]


check
File exists: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4


ValueError: not enough values to unpack (expected 4, got 2)

In [62]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import openai
import cv2
import os
import csv



MY_KEY = "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"

# OpenAI API Key (replace with your actual key)
client = openai.OpenAI(api_key=MY_KEY)

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def get_video_description(video_path):
    """
    Generates a textual description from extracted video frames using BLIP-2.
    """
    frames = extract_video_frames(video_path)
    if frames is None:
        return None  # Skip processing if frames couldn't be extracted

    inputs = processor(images=frames, return_tensors="pt").to(device)
    
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=50)
        video_description = processor.batch_decode(generated_ids, skip_special_tokens=True)

    
    return " ".join(video_description)

def generate_chatgpt_response(question, video_summary):
    """
    Uses the video description to query ChatGPT.
    """
    prompt = f"Given the information provided in each frame, give me the letter of your best answer to the following question with a one sentence justification: {question}\n\nVideo summary: {video_summary}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers questions based on detailed frame-by-frame video analysis."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

# Example Usage

def process_inputs(video_list, question_list):
    """
    Processes videos **one by one** and pairs each video with its corresponding question.
    """
    results = {}

    # Ensure we only loop through the shortest list length to prevent index errors
    num_videos = min(len(video_list), len(question_list))

    for i in range(num_videos):
        video_path = video_list[i]
        question = question_list[i]

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{num_videos}: {video_path}")
        print(f"Question: {question}")

        video_description = get_video_description(video_path)
        print(f"Video Description: {video_description}")

        if video_description is None:
            print(f"Skipping {video_path}: No frames extracted")
            continue

        answer = generate_chatgpt_response(question, video_description)

        results[video_path] = {
            "question": question,
            "description": video_description,
            "answer": answer
        }

        print(f"Completed: {video_path}")
        print(f"Answer: {answer}")

    return results

# Load Videos
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row if it exists
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
video_results = process_inputs(video_files, questions)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.19it/s]



Processing 1/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.
Video Description: a car driving down a road with construction cones and traffic cones
 a car driving down a road with construction cones and a traffic light
 a car driving down a road with construction cones
 a car driving down a street with construction cones and a traffic light
 a view of a road with traffic and construction cones
 a street with a lot of cars and cones
 a street with a lot of cars and cones
 a view of a busy street with cars and trucks

Completed: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Answer: B. It's illegal as the right turn lane is blocked by co

KeyboardInterrupt: 

In [83]:
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import cv2
import os
import csv

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        print(f"Skipping {video_path}: Not enough frames ({total_frames})")
        return None

    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()
    frames = []

    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)
    
    cap.release()
    return frames if frames else None  # Returns a list of PIL images

def get_detailed_video_description(video_path):
    """
    Generates detailed descriptions for each frame by querying BLIP-2 in batch.
    """
    frames = extract_video_frames(video_path)
    if frames is None:
        return None

    frame_descriptions = []
    
    prompts = [
        "Question: What are the objects are in this image? Answer:",
        "Question: Where are cars driving? Answer:"
       # "Question: What are the shape of the signs? Answer:",
    ]

    for i, frame in enumerate(frames):
        # Create batch queries
        inputs = processor(
            images=[frame] * len(prompts),
            text=prompts,
            return_tensors="pt",
            padding=True  # Ensure correct token alignment
        ).to(device)


        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                temperature=1.0,
                repetition_penalty=1.2
            )

        # Decode all responses at once
        descriptions = processor.batch_decode(generated_ids, skip_special_tokens=True)
        print(descriptions)

        # Format output for this frame
        frame_info = f"Frame {i+1}:\n"
        for prompt, description in zip(prompts, descriptions):
            if description.strip() == "":
                description = "No meaningful response generated"  # Handle empty outputs
            frame_info += f" - {prompt}: {description}\n"

        frame_descriptions.append(frame_info)

    return "\n".join(frame_descriptions)  # Concatenate all frame details


def generate_chatgpt_response(question, detailed_description):
    """
    Uses the detailed frame descriptions to query ChatGPT for a more specific video answer.
    """
    prompt = f"The video contains the following scene details from multiple frames:\n{detailed_description}\n\nAnswer the following question about the video: {question}"

    response = client.chat.completions.create(
        model="",
        messages=[
            {"role": "system", "content": "You are a helpful AI that answers questions based on detailed frame-by-frame video analysis."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

def process_inputs(video_list, question_list):
    """
    Processes videos one by one with improved frame-specific descriptions.
    """
    results = {}

    num_videos = min(len(video_list), len(question_list))

    for i in range(num_videos):
        video_path = video_list[i]
        question = question_list[i]

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{num_videos}: {video_path}")
        print(f"Question: {question}")

        detailed_description = get_detailed_video_description(video_path)
        print(f"detailed description: {detailed_description}")

        if detailed_description is None:
            print(f"Skipping {video_path}: No frames extracted")
            continue

        answer = generate_chatgpt_response(question, detailed_description)

        results[video_path] = {
            "question": question,
            "detailed_description": detailed_description,
            "answer": answer
        }

        print(f"Completed: {video_path}")
        print(f"Video Details:\n{detailed_description}")
        print(f"Answer: {answer}")

    return results

# Load Videos
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row if it exists
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
video_results = process_inputs(video_files, questions)

# Save results (Optional)
import json
with open("video_results.json", "w") as f:
    json.dump(video_results, f, indent=4)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.93it/s]



Processing 1/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.
['Question: What are the objects are in this image? Answer: Traffic cones\n', 'Question: Where are cars driving? Answer:\n']
['Question: What are the objects are in this image? Answer: A car driving down a road\n', 'Question: Where are cars driving? Answer:\n']
['Question: What are the objects are in this image? Answer: Traffic cones\n', 'Question: Where are cars driving? Answer:\n']
['Question: What are the objects are in this image? Answer: Traffic cones\n', 'Question: Where are cars driving? Answer:\n']
['Question: What are the objects are in this image? Answer: Traffic cones\n', '

BadRequestError: Error code: 400 - {'error': {'message': 'you must provide a model parameter', 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:
video_path = "/home/ubuntu/TreeHacks2025/data/videos/videos/00002.mp4"

if os.path.exists(video_path):
    print(f"File exists: {video_path}")
    question = "Where can ego legally park on this street? A. No parking anywhere. B. next to right curb. C. anywhere. D. next to left curb."
    
    # Get video description
    video_summary = get_video_description(video_path)
    print(f"Video Summary: {video_summary}")

    # Generate answer using ChatGPT
    answer = generate_chatgpt_response(question, video_summary)
    
    print(f"Q: {question}\nA: {answer}")
else:
    print(f"File does not exist: {video_path}")


In [25]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from openai import OpenAI
import cv2
import os
from collections import Counter

MY_KEY = "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"

# OpenAI API Key (replace with your actual key)
client = openai.OpenAI(api_key=MY_KEY)

# Load BLIP-2 Processor & Model
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)

# Image Preprocessing Pipeline
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_video_frames(video_path, num_frames=8):
    """
    Extracts evenly spaced frames from a video file.
    """
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames < num_frames:
        raise ValueError(f"Error: Video has fewer frames ({total_frames}) than requested ({num_frames})")
    frame_indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()

    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
            frames.append(frame)  # Store PIL images instead of tensors
    
    cap.release()
    return frames  # Returns a list of PIL images

def get_detected_objects(video_path):
    """
    Uses BLIP-2 to generate a list of detected objects from video frames.
    """
    frames = extract_video_frames(video_path)
    
    detected_objects = Counter()
    
    for frame in frames:
        inputs = processor(images=frame, text="What objects are in the image?", return_tensors="pt").to(device)
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_length=50)
            description = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        
        # Extract object names and count occurrences
        for obj in description.split(", "):
            detected_objects[obj] += 1
    
    return detected_objects

def generate_chatgpt_response(detected_objects):
    """
    Uses detected objects as context and queries ChatGPT-4o for a video description.
    """
    object_summary = ", ".join([f"{count} {obj}" for obj, count in detected_objects.items()])
    prompt = f"The video contains the following objects: {object_summary}. Describe what is happening in the video."
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful AI that describes video scenes based on detected objects."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    return response.choices[0].message.content

# Example Usage
video_path = "/home/ubuntu/TreeHacks2025/data/videos/videos/00002.mp4"

if os.path.exists(video_path):
    print(f"File exists: {video_path}")
    
    # Detect objects in video frames
    detected_objects = get_detected_objects(video_path)
    print(f"Detected Objects: {dict(detected_objects)}")

    # Generate answer using ChatGPT
    video_description = generate_chatgpt_response(detected_objects)
    
    print(f"Video Description:\n{video_description}")
else:
    print(f"File does not exist: {video_path}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.78it/s]


File exists: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Detected Objects: {'What objects are in the image?\n': 8}
Video Description:
I'm afraid the information you've provided is a bit unclear. It mentions "8 What objects are in the image?" but doesn't specify what those objects are. If you can provide a list of the objects detected in the video, I can help you with a description of the scene and what's happening.


2. Fine-Tuning BLIP-2 for Video Question Answering
Objective: Teach BLIP-2 to process ego-centric video frames and answer driving-related questions.

In [None]:
{
  "video_id": "example_video.mp4",
  "frames": ["frame_1.jpg", "frame_2.jpg", ..., "frame_8.jpg"],  
  "question": "Is the car allowed to turn right at this intersection?",  
  "answer": "No, because there is a no-right-turn sign."  
}

In [None]:
import torch
import json
from transformers import Blip2Processor, Blip2ForConditionalGeneration, TrainingArguments, Trainer
from torch.utils.data import Dataset

# Load BLIP-2 Model & Processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(model_name).to(device)

# Load Your Dataset
class DrivingVideoQADataset(Dataset):
    def __init__(self, json_file, processor):
        with open(json_file, "r") as f:
            self.data = json.load(f)
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        frames = [Image.open(frame_path).convert("RGB") for frame_path in sample["frames"]]
        inputs = self.processor(images=frames, text=sample["question"], return_tensors="pt", padding=True)
        inputs["labels"] = self.processor.tokenizer(sample["answer"], return_tensors="pt")["input_ids"]
        return {k: v.squeeze(0) for k, v in inputs.items()}  # Remove batch dim

# Initialize Dataset & DataLoader
dataset = DrivingVideoQADataset("driving_video_qa.json", processor)

# Training Configuration
training_args = TrainingArguments(
    output_dir="./blip2_driving",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir="./logs",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,
)

# Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)
trainer.train()