In [46]:
import time

import cv2
from inference_sdk import InferenceHTTPClient

from my_vars import ROBO

# Roboflow API Setup
API_KEY = ROBO  # API key comes from the local my_vars module instead of being written in the notebook
MODEL_ID = "lisa-bjgh5/2"  # ID of the YOLOv8 LISA model; passed as model_id to CLIENT.infer
CLIENT = InferenceHTTPClient(api_url="https://detect.roboflow.com", api_key=API_KEY)  # shared client used by detect_objects_in_frame

def detect_objects_in_frame(frame):
    """Run the hosted model on one frame and return its predictions.

    Args:
        frame: Image handed directly to ``CLIENT.infer`` (nothing is written
            to disk first).

    Returns:
        list[dict]: One dict per prediction with the keys ``label``,
        ``confidence`` and ``bbox`` (a dict of integer ``x``, ``y``,
        ``width`` and ``height``). An empty list is returned when there are
        no predictions or when any exception is raised; the exception is
        printed, not propagated.
    """
    def to_record(prediction):
        # Convert one raw prediction into this notebook's detection format.
        return {
            "label": prediction["class"],
            "confidence": prediction["confidence"],
            "bbox": {key: int(prediction[key]) for key in ("x", "y", "width", "height")},
        }

    try:
        response = CLIENT.infer(frame, model_id=MODEL_ID)
        return [to_record(prediction) for prediction in response.get("predictions", [])]
    except Exception as e:
        print(f"Error during object detection: {e}")
        return []


def process_video_frames(video_path):
    """Sample the first, middle and last frame of a video and run object detection.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        list[list[dict]]: The output of ``detect_objects_in_frame`` for each
        sampled frame that produced at least one detection, in frame order.
        Frames that cannot be read, or that yield no detections, contribute
        nothing. An empty list is returned when the video reports no frames.
    """
    cap = cv2.VideoCapture(video_path)
    detections = []

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            print(f"Error: Could not read any frames from {video_path}")
            return detections

        # Beginning, middle, end. dict.fromkeys drops repeated indices (very
        # short clips) while keeping order, so no frame is sent to the API twice.
        frame_indices = list(dict.fromkeys([0, total_frames // 2, total_frames - 1]))

        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                print(f"Error: Could not read frame {frame_idx}")
                continue

            # detect_objects_in_frame takes the frame only; passing the index
            # as a second argument raised a TypeError.
            frame_detections = detect_objects_in_frame(frame)
            if frame_detections:
                detections.append(frame_detections)

            time.sleep(0.2)  # Avoid API rate limits
    finally:
        # Release the capture even if reading or detection raises.
        cap.release()

    return detections

# Example usage (VIDEO1 is not defined in this cell; set it to a video path first):
# detections = process_video_frames(VIDEO1)
# print(detections)

# This cell only defines helpers, so do not claim that inference has run.
print("✅ Detection helpers defined. Call process_video_frames(video_path) to run inference.")

ImportError: cannot import name 'COW' from 'my_vars' (/home/ubuntu/TreeHacks2025/my_vars.py)

In [47]:
import torch
import clip
import cv2
import os
import csv
import numpy as np
import pandas as pd
import concurrent.futures
import openai
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from ultralytics import YOLO

# OpenAI API Key
# SECURITY: a literal secret key used to be written here. Any key that has been
# saved in a notebook must be treated as leaked and revoked. The key is now
# read from the OPENAI_API_KEY environment variable.
MY_KEY = os.environ.get("OPENAI_API_KEY")
if not MY_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
client = openai.OpenAI(api_key=MY_KEY)

# Load Models
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when torch reports one
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")  # BLIP-2 pre/post-processing, used by get_video_descriptions
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(device)  # BLIP-2 generator, used by get_video_descriptions
clip_model, preprocess = clip.load("ViT-B/32", device=device)  # NOTE(review): not referenced in the visible part of this notebook — confirm it is still needed
yolo_model = YOLO("yolov8n.pt")  # Using YOLOv8 for car detection

# Thresholds (all read by analyze_camera_movement)
MIN_SHIFT_THRESHOLD = 20  # minimum change in a side region's mean box-centre x before left/right movement is reported
FORWARD_THRESHOLD = 1.1  # front-region mean box area must exceed the previous mean times this factor to report forward movement
UNKNOWN_THRESHOLD = 10   # a reported movement is reset to "Unknown" when the front-region mean x changes by less than this

def extract_video_frames(video_path, num_frames=8):
    """Sample evenly spaced frames from a video and analyse each one.

    Every sampled frame is converted to an RGB PIL image, sent to
    ``detect_objects_in_frame`` and to ``detect_cars_in_frame``; consecutive
    frames are compared with ``analyze_camera_movement``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: How many evenly spaced frame indices to sample (default 8).

    Returns:
        tuple: ``(frames, movement_summary, detections)`` where

        * ``frames`` is a list of PIL images, one per frame that was read.
        * ``movement_summary`` is a list of ``(frame_index, movement,
          car_info)`` tuples, one per pair of consecutive read frames. It has
          one entry fewer than ``frames``; entry ``k`` describes the
          transition into ``frames[k + 1]``.
        * ``detections`` is a list parallel to ``frames``; item ``i`` is the
          (possibly empty) detection list for ``frames[i]``.

        All three lists are empty when the video reports no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    movement_summary = []
    detections = []

    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # Without this guard linspace(0, -1, n) would produce negative indices.
            print(f"Error: Could not read any frames from {video_path}")
            return frames, movement_summary, detections

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_center = frame_width // 2

        frame_indices = np.linspace(0, total_frames - 1, num_frames).astype(int).tolist()
        prev_frame_data = None

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                print(f"Error: Could not read frame {idx}")
                continue

            # Convert to PIL format for CLIP & BLIP-2
            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frames.append(pil_frame)

            # Always append, even an empty list, so detections[i] stays aligned
            # with frames[i]. Skipping empty results shifted every later entry
            # onto the wrong frame.
            detections.append(detect_objects_in_frame(frame))

            # Run YOLO on this frame
            curr_frame_data = detect_cars_in_frame(frame, idx, frame_center)

            # Analyze movement between frames
            if prev_frame_data is not None:
                movement, car_info = analyze_camera_movement(prev_frame_data, curr_frame_data)
                movement_summary.append((idx, movement, car_info))

            prev_frame_data = curr_frame_data
    finally:
        # Release the capture even if a model call raises.
        cap.release()

    return frames, movement_summary, detections

def get_video_descriptions(frames):
    """Generates one textual description per frame using BLIP-2.

    Args:
        frames: List of images accepted by the BLIP-2 ``processor``
            (``extract_video_frames`` supplies PIL images).

    Returns:
        list[str]: One decoded, whitespace-stripped caption per input frame,
        in input order. An empty list when ``frames`` is empty or ``None``.
    """
    if not frames:
        # The previous placeholder `[...] * len(frames)` was always empty here
        # (and raised on None); return the empty result explicitly.
        return []

    inputs = processor(images=frames, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=50)

    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # Decoded captions may carry surrounding whitespace; strip it so the text
    # embeds cleanly in the prompt built later.
    return [text.strip() for text in decoded]


def detect_cars_in_frame(frame, frame_num, frame_center):
    """Run YOLO on a frame and bucket every "car" box by horizontal position.

    Args:
        frame: Image passed to ``yolo_model``.
        frame_num: Index of the frame; accepted for the caller's convenience
            but not used here.
        frame_center: X coordinate of the frame's vertical centre line.

    Returns:
        dict: ``{"left": [...], "right": [...], "front": [...]}`` where each
        item is ``(box_center_x, box_area)``. A box is "left" when its centre
        is below 0.75 * frame_center, "right" when above 1.25 * frame_center,
        and "front" otherwise.
    """
    car_positions = {"left": [], "right": [], "front": []}

    for result in yolo_model(frame):
        for box in result.boxes:
            x1, y1, x2, y2 = (int(value) for value in box.xyxy[0])
            if result.names[int(box.cls[0])] != "car":
                continue  # only cars are tracked

            center_x = (x1 + x2) // 2
            if center_x < frame_center * 0.75:
                region = "left"
            elif center_x > frame_center * 1.25:
                region = "right"
            else:
                region = "front"

            car_positions[region].append((center_x, (x2 - x1) * (y2 - y1)))

    return car_positions

def analyze_camera_movement(prev_data, curr_data, close_area_threshold=50000):
    """Estimates camera movement from car positions in two consecutive frames.

    Args:
        prev_data: Car positions of the earlier frame, a dict with the keys
            ``"left"``, ``"right"`` and ``"front"``, each a list of
            ``(center_x, area)`` tuples (the format returned by
            ``detect_cars_in_frame``).
        curr_data: Car positions of the later frame, same format.
        close_area_threshold: Boxes with an area strictly greater than this
            count as "Close", all others as "Far". Defaults to 50000.

    Returns:
        tuple: ``(movement, car_summary)``. ``movement`` is one of
        ``"Moving Left"``, ``"Moving Right"``, ``"Moving Forward"`` or
        ``"Unknown"``. ``car_summary`` maps ``"Left"``/``"Right"``/``"Front"``
        to ``{"Close": int, "Far": int}`` counts for ``curr_data``. When
        either input is empty or ``None`` the result is ``("Unknown", {})``.
    """
    if not prev_data or not curr_data:
        return "Unknown", {}

    def mean_of(data, region, field):
        # Mean of one tuple field (0 = centre x, 1 = area) over a region;
        # None when the region holds no cars.
        entries = data[region]
        return np.mean([entry[field] for entry in entries]) if entries else None

    prev_left_x = mean_of(prev_data, "left", 0)
    curr_left_x = mean_of(curr_data, "left", 0)
    prev_right_x = mean_of(prev_data, "right", 0)
    curr_right_x = mean_of(curr_data, "right", 0)
    prev_front_x = mean_of(prev_data, "front", 0)
    curr_front_x = mean_of(curr_data, "front", 0)
    prev_front_area = mean_of(prev_data, "front", 1)
    curr_front_area = mean_of(curr_data, "front", 1)

    movement = "Unknown"

    # Left/right movement detection. Compare against None explicitly: a mean
    # of 0.0 is a valid value and must not be treated as "no cars".
    if (prev_left_x is not None and curr_left_x is not None
            and curr_left_x > prev_left_x + MIN_SHIFT_THRESHOLD):
        movement = "Moving Right"
    elif (prev_right_x is not None and curr_right_x is not None
            and curr_right_x < prev_right_x - MIN_SHIFT_THRESHOLD):
        movement = "Moving Left"

    # Forward detection: front cars appear larger; this overrides left/right.
    if (prev_front_area is not None and curr_front_area is not None
            and curr_front_area > prev_front_area * FORWARD_THRESHOLD):
        movement = "Moving Forward"

    # Strengthen "Unknown" detection: discard the estimate unless the front
    # cars' mean x moved by at least UNKNOWN_THRESHOLD in both frames' data.
    # NOTE(review): this also resets "Moving Forward" when front cars stay
    # laterally centred — confirm that is intended.
    if movement != "Unknown":
        if (curr_front_x is None or prev_front_x is None
                or abs(curr_front_x - prev_front_x) < UNKNOWN_THRESHOLD):
            movement = "Unknown"

    # Distance estimation: box area is used as a proxy for closeness.
    car_summary = {
        label: {
            "Close": sum(1 for _, area in curr_data[region] if area > close_area_threshold),
            "Far": sum(1 for _, area in curr_data[region] if area <= close_area_threshold),
        }
        for label, region in (("Left", "left"), ("Right", "right"), ("Front", "front"))
    }

    return movement, car_summary


def generate_chatgpt_responses(questions, video_summaries):
    """
    Builds one prompt per (question, summary) pair and sends them in parallel.

    :param questions: Iterable of question strings.
    :param video_summaries: Iterable of per-video summary strings, paired with
        ``questions`` by position (extra items on either side are ignored).
    :return: List of replies from ``chatgpt_request``, in input order.
    """
    def build_prompt(question, summary):
        # The prompt wording is fixed; only the question and summary vary.
        return f"""
        You will receive information from 8 frames of a 5-second video. The details include:

        - **Description (BLIP-2):** A broader summary of the video’s contents.
        - **LISA Detections:** Any detected street signs, along with confidence scores and bounding boxes. 
        - A "None" detection does **not** mean no signs exist.
        - LISA may confuse similar-looking signs at low confidence.
        - **Camera Movement:** Directional movement of the vehicle (ego motion). This can be unreliable.
        - **Car Summary:** The number of cars on the left, right, and front, and whether they are close or far.

        Given this information, provide **only the letter of your best answer** to the following question with no other text: 
        {question}

        **Video Summary:**
        {summary}
        """

    prompts = []
    for question, summary in zip(questions, video_summaries):
        prompts.append(build_prompt(question, summary))

    # One worker call per prompt; map() keeps the replies in prompt order.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return list(pool.map(chatgpt_request, prompts))

def chatgpt_request(prompt):
    """
    Sends the prompt to the first chat model that answers.

    Models are tried in this order: gpt-4o, gpt-4o-mini, gpt-3.5-turbo,
    gpt-3.5-turbo-0125. Any ``openai.OpenAIError`` is printed and the next
    model is tried.

    :param prompt: User message text.
    :return: Text of the first successful reply, or an error string when every
        model fails.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI that analyzes driving videos for a self-driving car."},
        {"role": "user", "content": prompt},
    ]

    # Named model_name so the module-level BLIP-2 `model` is not shadowed.
    for model_name in ("gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo", "gpt-3.5-turbo-0125"):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=conversation,
                max_tokens=100,
            )
            return completion.choices[0].message.content
        except openai.OpenAIError as e:
            print(f"⚠️ {model_name} unavailable, trying next model... (Error: {e})")

    return "❌ Error: Both GPT-4o and GPT-3.5-Turbo are unavailable."


def _extract_choice(raw_answer, default="C"):
    """Pull a multiple-choice letter (A-D) out of a model reply, else return ``default``."""
    valid = ("A", "B", "C", "D")
    wrappers = "*_`\"'()[]{}.,:;!"
    text = (raw_answer or "").strip()

    # Common case: the reply starts with the letter ("B", "B.", "B) ...").
    if text[:1] in valid:
        return text[0]

    # Otherwise accept a stand-alone capital letter, e.g. "**B**" or "Answer: B".
    for token in text.split():
        candidate = token.strip(wrappers)
        if candidate in valid:
            return candidate

    # Finally accept a reply that is just one lowercase letter, e.g. "(b)".
    candidate = text.strip(wrappers).upper()
    return candidate if candidate in valid else default


def process_inputs(video_list, question_list):
    """Answers one multiple-choice question per video.

    For every (video, question) pair the video is sampled with
    ``extract_video_frames``, captioned with ``get_video_descriptions``, and a
    per-frame text summary is sent to ``generate_chatgpt_responses``.

    Args:
        video_list: List of video file paths.
        question_list: List of question strings, paired with ``video_list`` by
            position. If the lengths differ a warning is printed and only the
            common prefix is processed.

    Returns:
        list[tuple[str, str]]: ``(video_id, answer)`` per processed video.
        ``video_id`` is the part of the file name after the last ``_`` and
        before the first ``.``. ``answer`` is one of ``"A"``-``"D"`` and falls
        back to ``"C"`` when no letter can be found in the reply. Videos whose
        file does not exist are skipped.
    """
    if len(video_list) != len(question_list):
        print(
            f"Warning: {len(video_list)} videos but {len(question_list)} questions; "
            f"only the first {min(len(video_list), len(question_list))} pairs are processed"
        )

    results = []

    for video_path, question in zip(video_list, question_list):
        if not os.path.exists(video_path):
            print(f"Skipping {video_path}: File not found")
            continue

        # Step 1: Extract Frames (All models will use these)
        frames, movement_data, detections = extract_video_frames(video_path)

        # Step 2: Batch Process BLIP-2
        video_descriptions = get_video_descriptions(frames)

        # Step 3: Build one text block per frame
        frame_descriptions = []
        for idx in range(len(frames)):
            # movement_data has no entry for the first frame: entry idx - 1
            # describes the transition into frame idx. Indexing it with idx
            # attached each movement/car summary to the previous frame.
            if 1 <= idx <= len(movement_data):
                _, movement, car_summary = movement_data[idx - 1]
            else:
                movement, car_summary = "Unknown", {}

            frame_detections = detections[idx] if idx < len(detections) else []
            description = (
                video_descriptions[idx] if idx < len(video_descriptions) else "No description available"
            )

            frame_info = (
                f"Frame {idx+1}: \n"
                f" - Description (BLIP-2): {description}\n"
                f" - LISA Detections: {', '.join([str(detection) for detection in frame_detections]) if frame_detections else 'None'}\n"
                f" - Camera Movement: {movement}\n"
                f" - Car Summary: {car_summary}\n"
            )
            frame_descriptions.append(frame_info)

        # Step 4: Format All Frame Data for ChatGPT
        formatted_description = "\n".join(frame_descriptions)
        raw_answer = generate_chatgpt_responses([question], [formatted_description])[0]
        answer = _extract_choice(raw_answer)

        video_id = os.path.basename(video_path).split("_")[-1].split(".")[0]
        results.append((video_id, answer))
        # One progress line per video instead of re-printing the whole list.
        print(f"{video_id}: {answer} (raw reply: {raw_answer!r})")

    return results


def save_results_to_csv(results, output_csv="/mnt/data/output.csv"):
    """
    Saves processed video results to a CSV file using Pandas.

    The file has the header ``id,answer`` and no index column. The parent
    directory of ``output_csv`` is created when it does not exist yet.

    :param results: List of tuples (video_id, answer).
    :param output_csv: Path for the output CSV file.
    :return: File path of the saved CSV file.
    """
    # Convert results to a Pandas DataFrame
    df = pd.DataFrame(results, columns=["id", "answer"])

    # to_csv does not create missing directories, so make sure the target
    # directory exists (dirname is "" for a bare file name).
    parent_dir = os.path.dirname(output_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save to CSV
    df.to_csv(output_csv, index=False)  # `index=False` prevents adding an extra index column

    print(f"Results saved to {output_csv}")
    return output_csv  # Return the file path

# Load Videos
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
video_directory = "/home/ubuntu/TreeHacks2025/data/videos/videos"
# Every .mp4 in the directory, sorted by full path (plain string order) so the list is deterministic.
video_files = sorted(
    [os.path.join(video_directory, f) for f in os.listdir(video_directory) if f.endswith(".mp4")]
)

# Load Questions from CSV
question_file = "/home/ubuntu/TreeHacks2025/data/questions.csv"
questions = []

with open(question_file, mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Unconditionally skips the first row (assumed to be a header — TODO confirm)
    for row in reader:
        if row:  # Ensure row is not empty
            questions.append(row[1])  # Assuming questions are in the second column

# Run sequential processing
# NOTE(review): process_inputs pairs video_files[i] with questions[i] purely by position,
# so the CSV row order must match the sorted file order — confirm against the data.
video_results = process_inputs(video_files, questions)

# Written to save_results_to_csv's default output path.
csv_file_path = save_results_to_csv(video_results)

# Show the file path
csv_file_path


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.30it/s]



0: 416x640 4 cars, 3.4ms
Speed: 1.8ms preprocess, 3.4ms inference, 0.8ms postprocess per image at shape (1, 3, 416, 640)


KeyboardInterrupt: 