In [14]:
import time
from inference_sdk import InferenceHTTPClient
from my_vars import ROBO, MY_KEY

# Roboflow API Setup
API_KEY = ROBO  # Roboflow key, imported from the local my_vars module rather than hard-coded here
MODEL_ID = "lisa-bjgh5/2"  # Model identifier passed to CLIENT.infer (a YOLOv8 LISA model, per the original note)
# Single HTTP client for the hosted detection endpoint; detect_objects_in_frame reuses it for every frame
CLIENT = InferenceHTTPClient(api_url="https://detect.roboflow.com", api_key=API_KEY)

def detect_objects_in_frame(frame):
    """Run the hosted detection model on one frame.

    Args:
        frame: The image handed unchanged to ``CLIENT.infer``.

    Returns:
        A list of dicts, one per prediction, each holding ``label``,
        ``confidence`` and an integer ``bbox`` (``x``, ``y``, ``width``,
        ``height``). Any exception is reported with ``print`` and results in
        an empty list, so callers never see a failure from this function.
    """
    try:
        # The frame goes to the client as-is; nothing is written to disk first.
        response = CLIENT.infer(frame, model_id=MODEL_ID)
        return [
            {
                "label": prediction["class"],
                "confidence": prediction["confidence"],
                # Box values are truncated to ints, in the same key order as before.
                "bbox": {
                    key: int(prediction[key])
                    for key in ("x", "y", "width", "height")
                },
            }
            for prediction in response.get("predictions", [])
        ]
    except Exception as e:
        # Best-effort: a failed request must not abort the surrounding loop.
        print(f"Error during object detection: {e}")
        return []


def process_video_frames(video_path):
    """Run object detection on the first, middle and last frame of a video.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.

    Returns:
        A list with one entry per sampled frame that produced detections; each
        entry is the list returned by ``detect_objects_in_frame``. Frames that
        cannot be read or that yield no detections are omitted, and an empty
        list is returned when the video reports no frames.
    """
    # cv2 is only imported by a later cell; importing it here keeps this cell
    # runnable on a fresh kernel.
    import cv2

    detections = []
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            print(f"Error: Could not read any frames from {video_path}")
            return detections

        # Beginning, middle, end. dict.fromkeys keeps order while dropping the
        # duplicates that appear for clips shorter than three frames, so the
        # same frame is never sent to the API twice.
        frame_indices = list(dict.fromkeys([0, total_frames // 2, total_frames - 1]))

        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                print(f"Error: Could not read frame {frame_idx}")
                continue

            # detect_objects_in_frame takes the frame only; passing the index
            # as a second argument raised TypeError.
            frame_detections = detect_objects_in_frame(frame)
            if frame_detections:
                detections.append(frame_detections)

            time.sleep(0.2)  # Avoid API rate limits
    finally:
        # Release the capture even if a read or detection call raises.
        cap.release()

    return detections

# Example usage (left disabled: it would call the Roboflow endpoint through
# process_video_frames, and VIDEO1 is not defined anywhere in the visible cells)
#detections = process_video_frames(VIDEO1)
#print(detections)

# NOTE(review): this message is printed unconditionally. With the example above
# commented out, the cell only defines functions and runs no inference.
print("✅ Inference complete! Detection results returned.")

✅ Inference complete! Detection results returned.


In [18]:
import torch
import clip
import cv2
import os
import csv
import numpy as np
import pandas as pd
import concurrent.futures
import openai
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from ultralytics import YOLO


# OpenAI API Key
# Secrets must never be written into the notebook: anyone who can read the file
# (or its version history) can use them. The key is read from the environment
# instead. NOTE: the key that used to be hard-coded here is exposed and must be
# revoked in the OpenAI dashboard.
MY_KEY = os.environ.get("OPENAI_API_KEY")
if not MY_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = openai.OpenAI(api_key=MY_KEY)

# Load Models
# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# BLIP-2 (used for frame descriptions): processor and model share one checkpoint.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
model = model.to(device)

# clip.load returns the model together with its image preprocessing transform.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

yolo_model = YOLO("yolov8n.pt")  # Using YOLOv8 for car detection

# Thresholds (all consumed by analyze_camera_movement)
# Minimum change of a side region's mean car x-centre between two frames before
# a left/right movement is reported (box coordinates, presumably pixels).
MIN_SHIFT_THRESHOLD = 20
# Factor by which the mean box area of front cars must grow to report forward
# motion (1.1 = more than a 10% increase).
FORWARD_THRESHOLD = 1.1
# Minimum shift of the front region's mean x-centre; when the shift is smaller
# (or front cars are missing) a detected movement is reset to "Unknown".
UNKNOWN_THRESHOLD = 10

def extract_video_frames(video_path, num_frames=8):
    """Sample evenly spaced frames from a video and analyse each one.

    Every frame that can be read is converted to an RGB PIL image, sent to
    ``detect_objects_in_frame`` and ``detect_cars_in_frame`` and, from the
    second readable frame onward, compared with the previous readable frame
    through ``analyze_camera_movement``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of evenly spaced frame indices to sample.

    Returns:
        A tuple ``(frames, movement_summary, detections)``:

        * ``frames``: PIL images of the frames that were read.
        * ``movement_summary``: ``(frame_index, movement, car_info)`` tuples,
          one for each readable frame after the first.
        * ``detections``: one list per entry of ``frames`` (possibly empty), so
          ``detections[i]`` always belongs to ``frames[i]``.

        All three lists are empty when the video reports no frames.
    """
    frames = []
    movement_summary = []
    detections = []

    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # Without this guard linspace would produce negative frame indices.
            print(f"Error: Could not read any frames from {video_path}")
            return frames, movement_summary, detections

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_center = frame_width // 2

        frame_indices = np.linspace(0, total_frames - 1, num_frames).astype(int).tolist()
        prev_frame_data = None

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                print(f"Error: Could not read frame {idx}")
                continue

            # Convert to PIL format for CLIP & BLIP-2
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frames.append(pil_frame)

            # Append even when nothing was detected: skipping empty results
            # shifted every later entry onto the wrong frame.
            detections.append(detect_objects_in_frame(frame))

            # Run YOLO on this frame
            curr_frame_data = detect_cars_in_frame(frame, idx, frame_center)

            # Analyze movement between frames
            if prev_frame_data is not None:
                movement, car_info = analyze_camera_movement(prev_frame_data, curr_frame_data)
                movement_summary.append((idx, movement, car_info))

            prev_frame_data = curr_frame_data
    finally:
        # Release the capture even if one of the model calls raises.
        cap.release()

    return frames, movement_summary, detections

def get_video_descriptions(frames):
    """Describe each frame in text with the BLIP-2 model.

    Args:
        frames: Images passed together as one batch to the BLIP-2 processor.

    Returns:
        The decoded descriptions, one per frame. For an empty ``frames`` the
        placeholder list is repeated ``len(frames)`` times, i.e. the result is
        an empty list.
    """
    if frames:
        batch = processor(images=frames, return_tensors="pt", padding=True).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            token_ids = model.generate(**batch, max_length=50)
            captions = processor.batch_decode(token_ids, skip_special_tokens=True)

        return captions

    return ["No description available"] * len(frames)


def detect_cars_in_frame(frame, frame_num, frame_center):
    """Find cars in a frame with YOLO and bucket them by horizontal position.

    Args:
        frame: Image passed to ``yolo_model``.
        frame_num: Not used by this function; kept for the existing call signature.
        frame_center: Horizontal centre of the frame, used to derive the
            left/right boundaries.

    Returns:
        A dict with the keys ``"left"``, ``"right"`` and ``"front"``. Each value
        is a list of ``(box_center_x, area)`` tuples for boxes labelled ``"car"``.
    """
    # Boxes centred left of 75% / right of 125% of the centre count as side cars.
    left_limit = frame_center * 0.75
    right_limit = frame_center * 1.25

    car_positions = {"left": [], "right": [], "front": []}

    for result in yolo_model(frame):
        for box in result.boxes:
            if result.names[int(box.cls[0])] != "car":
                continue

            x1, y1, x2, y2 = (int(value) for value in box.xyxy[0])
            center_x = (x1 + x2) // 2
            box_area = (x2 - x1) * (y2 - y1)

            if center_x < left_limit:
                region = "left"
            elif center_x > right_limit:
                region = "right"
            else:
                region = "front"
            car_positions[region].append((center_x, box_area))

    return car_positions

def _mean_or_none(values):
    """Return the mean of ``values``, or None when ``values`` is empty."""
    return np.mean(values) if values else None


def analyze_camera_movement(prev_data, curr_data):
    """Estimate camera movement from car positions in two consecutive samples.

    Args:
        prev_data: Car positions of the earlier frame: a dict with the keys
            ``"left"``, ``"right"`` and ``"front"``, each a list of
            ``(x_center, area)`` tuples.
        curr_data: Same structure for the later frame.

    Returns:
        A tuple ``(movement, car_summary)``. ``movement`` is one of
        ``"Moving Left"``, ``"Moving Right"``, ``"Moving Forward"`` or
        ``"Unknown"``. ``car_summary`` counts the cars of ``curr_data`` per
        region as ``"Close"`` (box area above 50000) or ``"Far"``. When either
        input is empty the result is ``("Unknown", {})``.
    """
    if not prev_data or not curr_data:
        return "Unknown", {}

    close_area = 50000  # box area above which a car counts as "Close"
    movement = "Unknown"

    # Average X positions of the cars in each region (None when the region is empty)
    avg_prev_left = _mean_or_none([x for x, _ in prev_data["left"]])
    avg_curr_left = _mean_or_none([x for x, _ in curr_data["left"]])

    avg_prev_right = _mean_or_none([x for x, _ in prev_data["right"]])
    avg_curr_right = _mean_or_none([x for x, _ in curr_data["right"]])

    avg_prev_center = _mean_or_none([x for x, _ in prev_data["front"]])
    avg_curr_center = _mean_or_none([x for x, _ in curr_data["front"]])

    # Left/right movement detection. The averages are compared against None
    # explicitly: a truthiness test would treat a valid mean of 0.0 as "no cars".
    if (
        avg_prev_left is not None
        and avg_curr_left is not None
        and avg_curr_left > avg_prev_left + MIN_SHIFT_THRESHOLD
    ):
        movement = "Moving Right"
    elif (
        avg_prev_right is not None
        and avg_curr_right is not None
        and avg_curr_right < avg_prev_right - MIN_SHIFT_THRESHOLD
    ):
        movement = "Moving Left"

    # Forward detection (More lenient)
    prev_avg_area = _mean_or_none([area for _, area in prev_data["front"]])
    curr_avg_area = _mean_or_none([area for _, area in curr_data["front"]])

    # Both areas must be positive; a zero area carries no size information.
    if prev_avg_area is not None and curr_avg_area is not None and prev_avg_area > 0 and curr_avg_area > 0:
        if curr_avg_area > prev_avg_area * FORWARD_THRESHOLD:  # Only 10% increase to detect forward
            movement = "Moving Forward"

    # Strengthen "Unknown" detection: a detected movement is kept only when the
    # front cars' mean centre shifted by at least UNKNOWN_THRESHOLD.
    # NOTE(review): this also resets "Moving Forward" when front cars grow
    # without shifting sideways; confirm that is intended.
    if movement in ["Moving Left", "Moving Right", "Moving Forward"]:
        if avg_curr_center is None or avg_prev_center is None or abs(avg_curr_center - avg_prev_center) < UNKNOWN_THRESHOLD:
            movement = "Unknown"

    # Distance estimation: box area is used as a proxy for how close a car is.
    car_summary = {
        label: {
            "Close": sum(1 for _, area in curr_data[region] if area > close_area),
            "Far": sum(1 for _, area in curr_data[region] if area <= close_area),
        }
        for label, region in (("Left", "left"), ("Right", "right"), ("Front", "front"))
    }

    return movement, car_summary

def detect_objects_with_clip(frames):
    """Rank a fixed list of scene descriptions against each frame with CLIP.

    Args:
        frames: Images accepted by the CLIP ``preprocess`` transform.

    Returns:
        One list per frame holding the (up to) five descriptions with the
        highest cosine similarity to that frame, best first. An empty
        ``frames`` yields an empty list.
    """
    # torch.stack raises on an empty list, so handle "no frames" up front.
    if not frames:
        return []

    object_classes = [
        "A pedestrian crossing the street", "A red traffic light", "A green traffic light",
        "A stop sign", "A yield sign", "Snow", "Mud", "Oil", "Railroads", "Airport sign",
        "A speed limit sign", "A one way sign pointing right", "A one way sign pointing left",
        "A do not enter sign", "A wrong way sign", "A cyclist on the road", "Traffic cone",
        "A parked vehicle", "A pedestrian waiting at the crosswalk", "A broken traffic light"
    ]
    # topk fails when k exceeds the number of candidates; stay within bounds
    # should the class list ever be shortened.
    top_k = min(5, len(object_classes))

    # Tokenize text once
    text_inputs = clip.tokenize(object_classes).to(device)

    # Process all frames in batch
    image_inputs = torch.stack([preprocess(frame) for frame in frames]).to(device)

    with torch.no_grad():
        # Encode image and text features
        image_features = clip_model.encode_image(image_inputs)
        text_features = clip_model.encode_text(text_inputs)

        # Normalize features so the dot product below is a cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Raw cosine similarities (no softmax), one row per frame
        similarities = image_features @ text_features.T

    # Extract the top-ranked descriptions per frame
    detected_objects = [
        [object_classes[idx] for idx in similarities[i].topk(top_k).indices.tolist()]
        for i in range(len(frames))
    ]

    return detected_objects


def generate_chatgpt_responses(questions, video_summaries):
    """
    Builds one prompt per (question, summary) pair and sends them in parallel.

    Each prompt is passed to ``chatgpt_request`` on a thread pool; the answers
    are returned as a list in the same order as the inputs. Pairing uses
    ``zip``, so extra items in the longer list are ignored.
    """
    prompts = []
    for question, summary in zip(questions, video_summaries):
        prompts.append(
            f"Given the information provided in each frame, give me the letter of your best answer to the following question with a one-sentence justification: {question}\n\nVideo summary: {summary}"
        )

    # executor.map keeps the input order, so answers line up with questions.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return list(pool.map(chatgpt_request, prompts))

def chatgpt_request(prompt, model="gpt-4o-mini", max_tokens=100):
    """
    Sends a single chat completion request and returns the reply text.

    Args:
        prompt: Text sent as the user message.
        model: Name of the OpenAI chat model to query. The default is
            ``"gpt-4o-mini"``, the model this notebook has always used.
        max_tokens: Upper bound on the length of the generated answer.

    Returns:
        The content of the first choice's message.

    Errors raised by the OpenAI client (for example rate-limit errors) are not
    caught here and propagate to the caller.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful AI that analyzes driving videos for a self-driving car."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )
    return response.choices[0].message.content


def _format_detection(detection):
    """Render one LISA detection as text for the prompt."""
    if isinstance(detection, dict):
        label = detection.get("label", "unknown")
        confidence = detection.get("confidence")
        if isinstance(confidence, (int, float)):
            return f"{label} ({confidence:.2f})"
        return str(label)
    return str(detection)


def process_inputs(video_list, question_list):
    """Answer one question per video from per-frame model outputs.

    For every video that exists on disk, frames are sampled with
    ``extract_video_frames``, described with CLIP and BLIP-2 in batch, combined
    with the per-frame detections and movement data into a text summary, and
    sent with the question to ``generate_chatgpt_responses``.

    Args:
        video_list: Paths of the videos to process.
        question_list: Questions, where ``question_list[i]`` belongs to
            ``video_list[i]``.

    Returns:
        A list of ``(video_id, answer)`` tuples for the videos that were
        processed. ``video_id`` is the file name without extension, after the
        last underscore. Videos that are missing or yield no readable frame
        are skipped.
    """
    results = []

    for i, video_path in enumerate(video_list):
        question = question_list[i]

        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        print(f"\nProcessing {i+1}/{len(video_list)}: {video_path}")
        print(f"Question: {question}")

        # Step 1: Extract Frames (All models will use these)
        frames, movement_data, detections = extract_video_frames(video_path)
        if not frames:
            # The batch models below cannot run on an empty frame list.
            print(f"Skipping {video_path}: No frames could be read")
            continue

        # Step 2: Batch Process CLIP and BLIP-2
        detected_objects = detect_objects_with_clip(frames)  # CLIP runs in batch
        video_descriptions = get_video_descriptions(frames)  # BLIP-2 runs in batch

        # Step 3: Combine the per-frame results
        frame_descriptions = []
        for idx in range(len(frames)):
            # movement_data has no entry for the first frame: entry k-1 compares
            # frame k-1 with frame k and summarizes the cars seen in frame k.
            if 1 <= idx <= len(movement_data):
                _, movement, car_summary = movement_data[idx - 1]
            else:
                movement, car_summary = "Unknown", {}
            frame_detections = detections[idx] if idx < len(detections) else []

            # Detections are dicts, so they must be rendered as text before
            # joining; str.join on the raw dicts raised TypeError.
            lisa_text = ', '.join(_format_detection(d) for d in frame_detections) if frame_detections else 'None'

            frame_info = (
                f"Frame {idx+1}: \n"
                f" - Detected Objects (CLIP): {', '.join(detected_objects[idx]) if detected_objects[idx] else 'None'}\n"
                f" - Description (BLIP-2): {video_descriptions[idx]}\n"
                f" - LISA Detections: {lisa_text}\n"
                f" - Camera Movement: {movement}\n"
                f" - Car Summary: {car_summary}\n"
            )
            frame_descriptions.append(frame_info)

        # Step 4: Format All Frame Data for ChatGPT
        formatted_description = "\n".join(frame_descriptions)
        print("------------------------------")
        print(formatted_description)
        print("------------------------------")
        answer = generate_chatgpt_responses([question], [formatted_description])[0]
        print(answer)

        video_id = os.path.basename(video_path).split("_")[-1].split(".")[0]
        results.append((video_id, answer))

    return results


def save_results_to_csv(results, output_csv="/mnt/data/output.csv"):
    """Write ``(id, answer)`` rows to a CSV file and return the file path.

    Args:
        results: Rows with two values each, stored under the columns ``id``
            and ``answer``.
        output_csv: Destination path of the CSV file.

    Returns:
        ``output_csv``, unchanged.
    """
    # index=False keeps the pandas row index out of the file.
    pd.DataFrame(results, columns=["id", "answer"]).to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")
    return output_csv

# Load video & question files
VIDEO_DIR = "/home/ubuntu/TreeHacks2025/data/videos/videos"
QUESTIONS_CSV = "/home/ubuntu/TreeHacks2025/data/questions.csv"

# Sorted so the i-th video is paired with the i-th question by process_inputs.
video_files = sorted(
    os.path.join(VIDEO_DIR, f) for f in os.listdir(VIDEO_DIR) if f.endswith(".mp4")
)

# The context manager closes the file handle, which the bare open() call never
# did. newline="" is what the csv module expects for files it reads.
with open(QUESTIONS_CSV, encoding="utf-8", newline="") as questions_file:
    # Second column of every row; [1:] drops the first row (presumably the header).
    questions = [row[1] for row in csv.reader(questions_file)][1:]

video_results = process_inputs(video_files, questions)
csv_file = save_results_to_csv(video_results)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.30it/s]



Processing 1/50: /home/ubuntu/TreeHacks2025/data/videos/videos/00001.mp4
Question: Was ego doing a legal maneuver if its goal is to turn right at the intersection? A. It's legal as the lane is empty. B. It's illegal as the right turn lane is bloacked by construction. C. It's illegal as ego was cutting in other vehicles that were waiting. D. It's legal but the lane ahead is way too narrow for ego to pass.

0: 416x640 4 cars, 3.4ms
Speed: 1.8ms preprocess, 3.4ms inference, 0.8ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 3 cars, 3.7ms
Speed: 1.9ms preprocess, 3.7ms inference, 0.9ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 5 cars, 3.9ms
Speed: 2.0ms preprocess, 3.9ms inference, 0.9ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 8 cars, 3.9ms
Speed: 1.9ms preprocess, 3.9ms inference, 0.9ms postprocess per image at shape (1, 3, 416, 640)

0: 416x640 7 cars, 3.9ms
Speed: 1.9ms preprocess, 3.9ms inference, 0.9ms postprocess per image at shap

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-nXjAAEsFlOj5vQcul8hP8U5F on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}