In [1]:
import os
import pandas as pd
import numpy as np
import glob
import base64
import anthropic
from typing import List, Dict, Tuple
import time
import ast
import re
import logging
from datetime import datetime

# Model identifier for the run. Besides being sent to the API as `model`, this
# string is also used as the log-file name prefix and as the name of the results
# column in the CSV, so switching models writes to a separate column and log.
# Previously used models are kept below for reference.
# MODEL_NAME = "claude-3-5-sonnet-20240620"
# MODEL_NAME = "claude-3-5-sonnet-20241022"
MODEL_NAME = "claude-3-7-sonnet-20250219"

# Configure logging
def setup_logging(log_dir):
    """Configure root logging to a fresh timestamped file plus the console.

    Args:
        log_dir: Directory that receives the log file; created if missing.

    Returns:
        The root logger.

    Note:
        ``logging.basicConfig`` silently does nothing when the root logger
        already has handlers, which is the case whenever this function is
        called a second time in the same kernel. ``force=True`` removes and
        closes the existing root handlers first, so every call really starts
        writing to its own new log file (and console output is not duplicated).
    """
    os.makedirs(log_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"{MODEL_NAME}_qa_log_{timestamp}.txt")

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),  # Also print to console
        ],
        force=True,  # replace handlers left over from an earlier call
    )

    return logging.getLogger()

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def get_frames_for_range(video_name, frame_range, frames_dir, num_frames=5, logger=None):
    """Return paths of up to ``num_frames`` evenly spaced frames in a range.

    ``frame_range`` is a ``"start-end"`` string (both ends inclusive). Frames
    are looked up as ``<frames_dir>/<video_name>/<6-digit index>.png``; any
    selected frame that does not exist on disk is reported (through ``logger``
    when given, otherwise via ``print``) and left out of the result.
    """
    first, last = (int(part) for part in frame_range.split('-'))
    candidates = list(range(first, last + 1))

    if len(candidates) > num_frames:
        # Sample positions from first to last inclusive, truncated to ints
        positions = np.linspace(0, len(candidates) - 1, num_frames, dtype=int)
        chosen = [candidates[pos] for pos in positions]
    else:
        # Short range: use every frame
        chosen = candidates

    found = []
    for number in chosen:
        candidate_path = os.path.join(frames_dir, video_name, f"{number:06d}.png")
        if os.path.exists(candidate_path):
            found.append(candidate_path)
            continue
        if logger:
            logger.warning(f"Frame {number:06d} not found for video {video_name}")
        else:
            print(f"Warning: Frame {number:06d} not found for video {video_name}")

    return found

def generate_answers(images: List[str], questions: List[str], client, logger) -> List[str]:
    """Ask Claude the given Yes/No questions about a sequence of frames.

    Args:
        images: Paths of the PNG frames to attach, in temporal order.
        questions: Questions to answer about the frames.
        client: Anthropic client exposing ``messages.create``.
        logger: Logger used for progress and error reporting.

    Returns:
        A list of answer strings parsed from the model reply. A reply that is
        itself a valid Python list of strings is returned as-is (its length is
        NOT checked here, so callers must compare it with ``len(questions)``).
        Otherwise the first ``[...]`` span in the reply is split on commas and
        accepted only when it yields exactly one item per question. Returns
        ``[]`` when no usable reply was obtained after three attempts.
    """
    image_contents = [encode_image(img) for img in images]

    # The prompt states the real number of attached frames: fewer than five are
    # sent when the range is short or some frame files are missing.
    prompt = f"""
    Here is a sequence of {len(image_contents)} frames extracted from a video. The frames are consecutive frames from a video and thus contain temporal information. The camera is positioned at the center in first-person view. 
    Your task is to:
    1. Analyze the images provided and answer the given question.
    2. Answer each question accurately based on the images.
    3. Provide concise and clear answers.
    Questions:
    {questions}
    Please provide your answers in a list format, where each answer corresponds to the question in the same order. The answer should be either Yes or No.
    Your response should be a Python list of strings, each string being an answer. For example:
    ["Answer to question 1", "Answer to question 2", "Answer to question 3"]
    The example of value would be: ["Yes", "No", "Yes"]
    Respond with only the Python list of answers, no additional text.
    """

    logger.info(f"Sending prompt to Claude with {len(questions)} questions: {questions}")

    # The request payload is identical for every attempt, so build it once.
    message_content = [{"type": "text", "text": prompt}]
    message_content.extend(
        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img}}
        for img in image_contents
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            message = client.messages.create(
                model=MODEL_NAME,
                max_tokens=2000,
                temperature=0,
                messages=[{"role": "user", "content": message_content}],
            )
        except anthropic.APIError as e:
            logger.error(f"Anthropic API Error: {str(e)}")
            time.sleep(5)  # Longer wait for API errors
            continue

        # Join every text block instead of assuming the first block is text;
        # an empty or non-text reply then falls through to the retry path
        # rather than raising IndexError/AttributeError.
        content = "".join(
            block.text
            for block in message.content
            if getattr(block, "type", None) == "text"
        )
        logger.info(f"Claude's raw response: {content}")

        # First try to read the reply as a Python literal. Only the errors
        # literal_eval is documented to raise are caught, so KeyboardInterrupt
        # and genuine bugs are no longer swallowed.
        try:
            parsed = ast.literal_eval(content.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None

        if isinstance(parsed, list) and all(isinstance(a, str) for a in parsed):
            logger.info(f"Successfully parsed answers: {parsed}")
            return parsed

        # Fallback: pull the first [...] span out of surrounding text
        # (e.g. a fenced code block) and split it on commas.
        match = re.search(r'\[(.*?)\]', content, re.DOTALL)
        if match:
            answers = [item.strip().strip('"\'') for item in match.group(1).split(',')]
            if len(answers) == len(questions):
                logger.info(f"Extracted answers using regex: {answers}")
                return answers

        logger.warning(f"Retry {attempt}: Could not parse Claude's response.")
        time.sleep(2)  # Wait a bit before retrying

    # If we've exhausted retries, return empty list
    logger.error("Failed to get valid response after maximum retries")
    return []

def convert_yes_no_to_binary(answers, logger):
    """Convert 'Yes'/'No' answers to 1/0.

    An answer maps to 1 when it contains "yes" as a whole word
    (case-insensitive); every other answer maps to 0. Matching on word
    boundaries keeps words that merely contain those letters, such as "eyes"
    or "yesterday", from being counted as a Yes.

    Args:
        answers: Iterable of answer strings.
        logger: Logger that receives the converted list.

    Returns:
        A list of ints (1 or 0), one per answer, in the same order.
    """
    binary_answers = [
        1 if re.search(r'\byes\b', answer.strip().lower()) else 0
        for answer in answers
    ]

    logger.info(f"Converted answers to binary: {binary_answers}")
    return binary_answers

def process_questions_csv(csv_path, frames_dir, output_csv_path, log_dir):
    """Answer every question in a CSV with Claude and store the results.

    Rows are grouped by ('Video', 'Frame'); each group's questions are sent in
    one request together with frames sampled from that frame range. Answers
    are written as "1"/"0" strings into a column named after ``MODEL_NAME``,
    and the whole table is saved to ``output_csv_path`` after every group so
    that progress survives an interruption.

    Args:
        csv_path: Input CSV with 'Video', 'Frame' and 'Questions' columns.
        frames_dir: Root directory holding one sub-directory of frames per video.
        output_csv_path: Destination CSV (rewritten after each processed group).
        log_dir: Directory for the run's log file.

    Groups whose 'Video' or 'Frame' value is missing, that have no frames on
    disk, or for which no valid set of answers was obtained are skipped and
    keep their existing column value.
    """
    # Set up logging
    logger = setup_logging(log_dir)
    logger.info(f"Starting processing of questions from {csv_path}")

    # Read the CSV file
    df = pd.read_csv(csv_path)
    logger.info(f"Loaded CSV with {len(df)} rows")

    # Initialize Anthropic client
    client = anthropic.Anthropic()

    # Add Claude column if it doesn't exist
    if MODEL_NAME not in df.columns:
        df[MODEL_NAME] = ""
        logger.info(f"Added '{MODEL_NAME}' column to DataFrame")
    else:
        # A column reloaded from a previous run is numeric; make it object
        # dtype so the "1"/"0" strings written below are a valid assignment.
        df[MODEL_NAME] = df[MODEL_NAME].astype(object)

    # Group by Video and Frame (dropna=False keeps groups with missing keys)
    groups = df.groupby(['Video', 'Frame'], dropna=False)
    logger.info(f"Found {len(groups)} unique video/frame groups")

    for (vid, frame_range), group in groups:
        # Check for missing keys BEFORE formatting: int(NaN) raises, and a
        # formatted string can never be NaN.
        if pd.isna(vid) or pd.isna(frame_range):
            logger.warning(f"Skipping group with missing video/frame: {vid}, {frame_range}")
            continue
        video = f"{int(vid):04d}"

        logger.info(f"Processing video {video}, frames {frame_range}")

        # Get the questions for this frame range
        questions = group['Questions'].tolist()
        logger.info(f"Questions: {questions}")

        # Get frame paths
        frame_paths = get_frames_for_range(video, frame_range, frames_dir, logger=logger)

        if not frame_paths:
            logger.error(f"No frames found for video {video}, frame range {frame_range}")
            continue

        logger.info(f"Using {len(frame_paths)} frames: {[os.path.basename(p) for p in frame_paths]}")

        # Generate answers
        answers = generate_answers(frame_paths, questions, client, logger)

        if not answers or len(answers) != len(questions):
            logger.error(f"Error: Did not get valid answers for video {video}, frame range {frame_range}")
            continue

        # Convert to binary
        binary_answers = convert_yes_no_to_binary(answers, logger)

        # Update the DataFrame
        idx = group.index
        df.loc[idx, MODEL_NAME] = [str(int(b_a)) for b_a in binary_answers]

        # Save after each group to preserve progress
        df.to_csv(output_csv_path, index=False)
        logger.info(f"Saved progress to {output_csv_path}")

        logger.info(f"Completed {video} {frame_range}. Answers: {answers}")
        logger.info(f"Binary: {binary_answers}")

        # Wait a bit to avoid rate limits
        time.sleep(1)

    # Final save: zero-pad video ids, leaving missing values untouched so a
    # single incomplete row cannot abort the save.
    df['Video'] = [vd if pd.isna(vd) else f"{int(vd):04d}" for vd in df['Video']]
    df.to_csv(output_csv_path, index=False)
    logger.info(f"All processing complete. Results saved to {output_csv_path}")


In [3]:
import argparse
    
# Command-line parsing is disabled in the notebook; the paths are set directly
# below instead. The original argparse setup is kept for reference.
# parser = argparse.ArgumentParser(description="Process video frame questions with Claude")
# parser.add_argument("--csv", required=True, help="Path to input CSV file with questions")
# parser.add_argument("--frames_dir", required=True, help="Directory containing video frames")
# parser.add_argument("--output", required=True, help="Path to output CSV file")

# NOTE(review): `csv` and `output` point to the same file, so the input CSV is
# overwritten in place each time progress is saved. Keep a backup, or use a
# separate output path, if the original file must be preserved.
csv = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kitti_questions/all_que_fn_model_ans.csv"
# Root directory of the frames; each video's frames are read from a sub-directory named after the video.
frames_dir = "/data/datasets/KITTI/STEP/testing/image_02"
output = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kitti_questions/all_que_fn_model_ans.csv"
# Directory that receives one timestamped log file per run.
log_dir = "/home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kb_inference/log_dir"
# args = parser.parse_args()

# Runs the full question-answering pass over every video/frame group in the CSV.
process_questions_csv(csv, frames_dir, output, log_dir)

2025-02-28 03:01:40,240 - INFO - Starting processing of questions from /home/ibk5106/projects/projects/LogicRAG/tools/kb_framework/kitti_questions/all_que_fn_model_ans.csv
2025-02-28 03:01:40,244 - INFO - Loaded CSV with 101 rows
2025-02-28 03:01:40,293 - INFO - Added 'claude-3-7-sonnet-20250219' column to DataFrame
2025-02-28 03:01:40,295 - INFO - Found 19 unique video/frame groups
2025-02-28 03:01:40,296 - INFO - Processing video 0000, frames 000060-000069
2025-02-28 03:01:40,297 - INFO - Questions: ['Can you spot a pedestrian at the right in the last few frames?', 'Is there a White Car on the right of the frame?', 'Does the distance between the pedestrian and the White Car remains constant?', 'Is there a Black Car on the left the scene?', 'Does the Black Car disappear from the scene?', 'Is there a Black Car on the left of the scene?', 'Is there a White Car on the left of the scene?']
2025-02-28 03:01:40,297 - INFO - Using 5 frames: ['000060.png', '000062.png', '000064.png', '000066.