# Behavioral Video Annotator and Analysis using Google Gemini VLM
Recommended: open this notebook in Google Colab and run it there.

# Connect Google Drive with Colab

In [None]:
# Mount Google Drive at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Dependencies

In [None]:
# Install the Gemini and Box SDKs (IPython shell magics; -q quiets pip output).
!pip install -q google-generativeai
!pip install -q boxsdk
import google.generativeai as genai

# Enter your Gemini and Box client API keys here.
# NOTE(review): the values below are placeholders — avoid saving real keys
# in a notebook that is shared or committed.
genai.configure(api_key="YOUR_GEMINI_API_KEY")
BOX_CLIENT_ID = "YOUR_BOX_CLIENT_ID"
BOX_CLIENT_SECRET = "YOUR_BOX_CLIENT_SECRET"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/141.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m133.1/141.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.3/141.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import json
import os
import time
import requests
from pathlib import Path

# Box SDK for OAuth
from boxsdk import OAuth2, Client
from boxsdk.exception import BoxAPIException

# Gemini VLM Prompt

In [None]:
# Edit this prompt as needed for your specific annotation task
def create_hri_annotation_prompt():
    """Build the instruction text sent to Gemini together with a video.

    Returns:
        The full annotation prompt as a single string. It lists the required
        fields, the allowed category values, an example annotation, and asks
        for a JSON array of interactions.
    """
    annotation_prompt = """
You are an expert ethnographic researcher analyzing trash barrel robot interactions in NYC public spaces.

ANALYZE this video comprehensively, paying special attention to:
1. VERBAL CONTENT - What people say about/to the robot
2. EMOTIONAL REACTIONS - Tone, body language, facial expressions
3. SOCIAL DYNAMICS - How robot affects group interactions
4. BEHAVIORAL DETAILS - Subtle actions and reactions

For each interaction, provide:

REQUIRED FIELDS:
- start_time: "MM:SS" format
- end_time: "MM:SS" format
- interaction_type: [approaching, avoiding, photographing, pointing, talking, helping, throwing_trash, looking, ignoring, crowd_formation]
- detailed_observations: Rich description including dialogue, body language, tone, context
- dialogue_captured: Direct quotes if people speak (use "..." if unclear)
- emotional_reaction: [very_positive, positive, neutral, negative, very_negative, mixed, unclear]
- confidence_score: 0.0-1.0

OBSERVATION GUIDELINES:
- Include exact quotes when possible: "Get out of here robot!"
- Describe tone: enthusiastic, hesitant, annoyed, curious, playful, dismissive
- Note body language: leaning in, stepping back, gesturing, facial expressions
- Capture group dynamics: who influences whom, social contagion effects
- Include context: environmental factors, time of day, crowd density

EMOTIONAL REACTION CRITERIA:
- very_positive: excitement, delight, eager engagement ("This is so cool!")
- positive: curiosity, amusement, willing participation
- neutral: matter-of-fact interaction, functional use
- negative: annoyance, avoidance, complaints ("Why is this here?")
- very_negative: anger, hostility, aggressive behavior
- mixed: conflicted reactions, changes during interaction
- unclear: insufficient evidence to determine sentiment

EXAMPLE ANNOTATION:
{
  "start_time": "02:15",
  "end_time": "02:45",
  "interaction_type": "talking",
  "detailed_observations": "Young woman approaches robot hesitantly, says 'What is this thing?' in curious but slightly nervous tone. Takes out phone to photograph. Friend joins and says 'It's like a Roomba for trash!' Both laugh. Woman throws empty coffee cup in robot, says 'Thanks little guy!' in playful tone while patting robot's side. Both walk away looking back and smiling.",
  "dialogue_captured": "What is this thing? ... It's like a Roomba for trash! ... Thanks little guy!",
  "emotional_reaction": "positive",
  "confidence_score": 0.9
}

SPECIAL ATTENTION TO:
- Sarcasm or negative comments with positive tone (mark as "mixed")
- Children's reactions vs. adult reactions
- People explaining robot to others
- Changes in reaction during interaction
- Group influence on individual reactions

Return JSON array of all interactions. Focus on quality over quantity - better to have fewer, richly detailed annotations than many superficial ones.
"""
    return annotation_prompt

In [None]:
def _extract_annotation_list(response_text):
    """Return the first JSON array of objects embedded in *response_text*.

    The text is scanned from each '[' in turn, so brackets that appear in
    surrounding prose or in a Markdown code fence do not break parsing.

    Returns:
        The decoded list (possibly empty), or None when no JSON array whose
        items are all objects can be decoded.
    """
    decoder = json.JSONDecoder()
    position = response_text.find('[')
    while position != -1:
        try:
            candidate, _ = decoder.raw_decode(response_text, position)
        except json.JSONDecodeError:
            candidate = None
        # Only accept arrays of objects: callers use dict-style .get() access.
        if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
            return candidate
        position = response_text.find('[', position + 1)
    return None


def analyze_video_with_gemini(video_file, model_name="gemini-1.5-flash"):
    """Analyze an entire uploaded video with a Gemini model.

    Args:
        video_file: File handle passed to ``generate_content`` together with
            the annotation prompt.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        A list of annotation dicts parsed from the model's reply, or an empty
        list when the request fails or no JSON array of objects is found.
    """
    model = genai.GenerativeModel(model_name)
    prompt = create_hri_annotation_prompt()

    print("Analyzing video with Gemini...")

    try:
        # generate content with video + prompt
        response = model.generate_content([
            video_file,
            prompt
        ])
        response_text = response.text
    except Exception as e:
        # Best-effort boundary: any request/response failure yields no annotations.
        print(f"Error analyzing video: {e}")
        return []

    print("Raw response:", response_text[:200] + "..." if len(response_text) > 200 else response_text)

    annotations = _extract_annotation_list(response_text)
    if annotations is None:
        print("No JSON found in response")
        return []
    return annotations

# Research Format Dataframe

In [None]:
# Edit this function if your research format differs
def create_annotation_dataframe(annotations, video_name):
    """Turn a list of annotation dicts into the research-format DataFrame.

    Args:
        annotations: Iterable of dicts as returned by the Gemini analysis.
        video_name: Value written into the first column of every row.

    Returns:
        A DataFrame with one row per annotation, or an empty DataFrame when
        *annotations* is empty or None.
    """
    if not annotations:
        return pd.DataFrame()

    def build_row(entry):
        # Key order here defines the column order of the resulting frame.
        return {
            'tcn_layers': video_name,
            'Start Time': entry.get('start_time', ''),
            'End Time': entry.get('end_time', ''),
            'Event': entry.get('interaction_type', ''),
            'Observations': entry.get('detailed_observations', ''),
            'Dialogue Captured': entry.get('dialogue_captured', ''),
            'Emotional Reaction': entry.get('emotional_reaction', ''),
            # The two action columns are left blank for later manual coding.
            'Initiation Action': '',
            'Ending Action': '',
            'Confidence': entry.get('confidence_score', 0.0),
        }

    return pd.DataFrame([build_row(entry) for entry in annotations])


# Gets Video Duration and Splits Into Chunks

In [None]:
import subprocess

def get_video_duration_minutes(video_path):
    """Return the video's duration in minutes, using ffprobe when possible.

    Args:
        video_path: Path to the video file.

    Returns:
        Duration in minutes as a float. When ffprobe is missing, fails, or
        returns output without a usable ``format.duration`` value, a rough
        estimate derived from the file size is returned instead.
    """
    cmd = [
        'ffprobe', '-v', 'quiet', '-print_format', 'json',
        '-show_format', str(video_path)
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        info = json.loads(result.stdout)
        duration_seconds = float(info['format']['duration'])
    except (OSError, subprocess.CalledProcessError, ValueError, KeyError, TypeError):
        # Only the expected probe failures are handled here (missing binary,
        # non-zero exit, malformed or incomplete JSON); KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed.
        # Fallback: estimate from file size (rough approximation)
        file_size_gb = os.path.getsize(video_path) / (1024**3)
        estimated_minutes = file_size_gb * 2.2  # Rough estimate: ~2.2 min per GB
        print(f"⚠️ Could not get exact duration, estimating ~{estimated_minutes:.0f} minutes")
        return estimated_minutes
    return duration_seconds / 60  # Convert to minutes

def check_file_size_and_decide_processing(video_path):
    """Decide whether a video must be chunked and pick a chunk duration.

    Args:
        video_path: Path to the downloaded video file.

    Returns:
        A tuple ``(needs_chunking, file_size_gb, chunk_minutes)``.
        ``chunk_minutes`` is None when the file is small enough to be
        processed directly, otherwise a whole number of minutes (at least 1).
    """
    file_size_bytes = os.path.getsize(video_path)
    file_size_gb = file_size_bytes / (1024**3)

    print(f"📁 Video file size: {file_size_gb:.2f} GB")

    # Gemini limit is 2GB
    if file_size_bytes <= 1900000000:  # 1.9GB buffer
        return False, file_size_gb, None

    # Get actual video duration
    video_duration_minutes = get_video_duration_minutes(video_path)
    print(f"⏱️ Video duration: {video_duration_minutes:.1f} minutes")

    # Calculate chunks needed to stay under 1.8GB each
    target_chunk_size_gb = 1.8
    min_chunks_needed = int(file_size_gb / target_chunk_size_gb) + 1

    # Calculate optimal chunk duration. The floor is one minute: a higher
    # floor can force chunks longer than the size budget allows for
    # high-bitrate footage, and such oversized chunks are unusable.
    optimal_chunk_minutes = max(1, int(video_duration_minutes / min_chunks_needed))

    print(f"📊 Need {min_chunks_needed} chunks of {optimal_chunk_minutes} minutes each")

    return True, file_size_gb, optimal_chunk_minutes

def split_video_into_chunks(video_path, chunk_duration_minutes):
    """Split a video into stream-copied segments and return the usable ones.

    Segments are written to ``<video stem>_chunks`` next to the source video
    as ``chunk_000.mp4``, ``chunk_001.mp4``, ...

    Args:
        video_path: Path to the source video.
        chunk_duration_minutes: Target length of each segment in minutes.

    Returns:
        Sorted list of chunk paths that are under 1.9 GB. Oversized chunks
        are reported and left out, so positions in the returned list do not
        necessarily match the segment numbers in the file names. An empty
        list is returned when ffmpeg fails.
    """

    print(f"✂️ Splitting into {chunk_duration_minutes}-minute chunks...")

    # Install ffmpeg if needed
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError):
        # Missing binary or a failing probe; other exceptions propagate.
        print("📦 Installing FFmpeg...")
        subprocess.run(['apt', 'update', '-qq'], check=True)
        subprocess.run(['apt', 'install', '-y', 'ffmpeg'], check=True)

    # Create chunks directory
    video_path = Path(video_path)
    chunks_dir = video_path.parent / f"{video_path.stem}_chunks"
    chunks_dir.mkdir(exist_ok=True)

    # Remove chunks left over from an earlier run; otherwise a run that
    # produces fewer segments would pick up stale files in the glob below.
    for stale_chunk in chunks_dir.glob("chunk_*.mp4"):
        try:
            stale_chunk.unlink()
        except OSError as cleanup_error:
            print(f"⚠️ Could not remove old chunk {stale_chunk.name}: {cleanup_error}")

    # Split video
    chunk_duration_seconds = chunk_duration_minutes * 60
    output_pattern = str(chunks_dir / "chunk_%03d.mp4")

    cmd = [
        'ffmpeg', '-y',  # overwrite flag placed before the output, as is conventional
        '-i', str(video_path),
        '-c', 'copy',
        '-map', '0',
        '-segment_time', str(chunk_duration_seconds),
        '-f', 'segment',
        '-reset_timestamps', '1',
        output_pattern,
    ]

    print("🔄 Splitting video...")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"❌ Video splitting failed: {result.stderr}")
        return []

    # Get created chunks and verify sizes
    chunk_files = sorted(chunks_dir.glob("chunk_*.mp4"))

    print(f"✅ Created {len(chunk_files)} chunks:")
    valid_chunks = []

    for i, chunk in enumerate(chunk_files):
        size_gb = os.path.getsize(chunk) / (1024**3)
        size_mb = size_gb * 1024

        if size_gb < 1.9:  # Under 1.9GB
            print(f"   ✅ Chunk {i+1}: {size_mb:.0f} MB")
            valid_chunks.append(chunk)
        else:
            print(f"   ⚠️  Chunk {i+1}: {size_mb:.0f} MB (too large, skipping)")

    return valid_chunks

In [None]:
def _offset_timestamp(timestamp, offset_seconds):
    """Shift a 'MM:SS' (or 'HH:MM:SS') timestamp by *offset_seconds*.

    Returns the shifted time as 'M:SS' with minutes counted from the start of
    the original video. A value that cannot be parsed is returned unchanged,
    so one malformed timestamp does not invalidate its annotation.
    """
    try:
        fields = [int(part) for part in str(timestamp).strip().split(':')]
    except ValueError:
        return timestamp

    if len(fields) == 1:
        minutes, seconds = fields[0], 0
    elif len(fields) == 2:
        minutes, seconds = fields
    elif len(fields) == 3:
        minutes, seconds = fields[0] * 60 + fields[1], fields[2]
    else:
        return timestamp

    shifted_minutes, shifted_seconds = divmod(minutes * 60 + seconds + offset_seconds, 60)
    return f"{shifted_minutes}:{shifted_seconds:02d}"


def process_chunks_and_combine(chunk_files, original_video_name, chunk_duration_minutes=8):
    """Analyze each chunk with Gemini and merge the annotations.

    Timestamps reported for a chunk are relative to that chunk; they are
    shifted by ``index * chunk_duration_minutes`` so they refer to the
    original video. Each annotation also gets a 1-based ``chunk_number``.

    Args:
        chunk_files: Paths of the chunk files, in playback order.
        original_video_name: Prefix used for the uploaded files' display names.
        chunk_duration_minutes: Length of each chunk in minutes; must match
            the duration that was used when splitting.

    Returns:
        A list with the annotations of all chunks that were processed
        successfully. Chunks that fail are reported and skipped.
    """

    print(f"\n🚀 Processing {len(chunk_files)} chunks with Gemini...")

    all_annotations = []

    for i, chunk_path in enumerate(chunk_files):
        print(f"\n--- Processing Chunk {i+1}/{len(chunk_files)} ---")

        uploaded_file = None
        try:
            # Upload chunk to Gemini
            chunk_name = f"{original_video_name}_chunk_{i+1}"
            uploaded_file = genai.upload_file(path=str(chunk_path), display_name=chunk_name)

            # Wait for processing
            print("⏳ Processing with Gemini...")
            while uploaded_file.state.name == "PROCESSING":
                print(".", end="")
                time.sleep(5)
                uploaded_file = genai.get_file(uploaded_file.name)

            if uploaded_file.state.name == "FAILED":
                print(f"\n❌ Chunk {i+1} processing failed")
                continue

            print(f"\n✅ Gemini processing complete!")

            # Keep only dict entries; anything else cannot be annotated below.
            annotations = [
                ann for ann in analyze_video_with_gemini(uploaded_file)
                if isinstance(ann, dict)
            ]

            # Offset of this chunk within the original video, in whole seconds
            # (also correct for fractional chunk durations).
            chunk_offset_seconds = int(round(i * chunk_duration_minutes * 60))

            for ann in annotations:
                ann['start_time'] = _offset_timestamp(
                    ann.get('start_time', '0:00'), chunk_offset_seconds)
                ann['end_time'] = _offset_timestamp(
                    ann.get('end_time', '0:00'), chunk_offset_seconds)
                ann['chunk_number'] = i + 1

            all_annotations.extend(annotations)
            print(f"✅ Chunk {i+1}: Found {len(annotations)} interactions")

        except Exception as e:
            # Best-effort per chunk: report and move on to the next one.
            print(f"❌ Chunk {i+1} failed: {e}")
            continue

        finally:
            # Release the uploaded chunk so repeated runs do not fill up the
            # Gemini file storage.
            if uploaded_file is not None:
                try:
                    genai.delete_file(uploaded_file.name)
                except Exception as cleanup_error:
                    print(f"⚠️ Could not delete uploaded chunk {i+1}: {cleanup_error}")

    return all_annotations

# Box Auth Integration

In [None]:
class BoxVideoDownloader:
    """Authenticate against Box via OAuth 2.0 and download files by ID."""

    # Redirect page used when no other redirect URL is given.
    DEFAULT_REDIRECT_URL = 'https://irl.tech.cornell.edu/auto-ethnography-vlm/box-oauth-redirect.html'

    def __init__(self, client_id, client_secret, redirect_url=None):
        """Initialize Box OAuth client

        Args:
            client_id: Box application client ID.
            client_secret: Box application client secret.
            redirect_url: OAuth redirect URL; defaults to DEFAULT_REDIRECT_URL.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.redirect_url = redirect_url or self.DEFAULT_REDIRECT_URL
        self.box_client = None  # set only after a successful authentication

    @staticmethod
    def _extract_auth_code(redirect_url, expected_state=None):
        """Pull the OAuth authorization code out of a pasted redirect URL.

        Args:
            redirect_url: The URL (or bare ``code=...`` string) the user pasted.
            expected_state: CSRF token issued with the authorization URL. When
                the pasted URL carries a ``state`` value it must match.

        Returns:
            The URL-decoded authorization code.

        Raises:
            ValueError: If no ``code`` parameter is present or the ``state``
                value does not match *expected_state*.
        """
        from urllib.parse import parse_qs, urlsplit

        parts = urlsplit(redirect_url)
        # Query string first; fragment and bare "code=..." input as fallbacks.
        for component in (parts.query, parts.fragment, parts.path):
            params = parse_qs(component)
            if 'code' in params:
                break
        else:
            raise ValueError("No authorization code found in URL")

        returned_state = params.get('state', [None])[0]
        if (expected_state is not None and returned_state is not None
                and returned_state != expected_state):
            raise ValueError("State token in URL does not match this authorization request")

        return params['code'][0]

    def authenticate_with_box(self):
        """Complete OAuth flow - opens browser for user login

        Returns:
            True when authentication succeeded and ``box_client`` is ready,
            False otherwise.
        """

        print("🔐 Starting Box OAuth 2.0 authentication...")

        # Create OAuth2 object
        oauth = OAuth2(
            client_id=self.client_id,
            client_secret=self.client_secret
        )

        # Get authorization URL
        auth_url, csrf_token = oauth.get_authorization_url(self.redirect_url)

        print(f"\n🌐 PLEASE VISIT THIS URL TO AUTHORIZE:")
        print(f"{auth_url}")
        print("\nAfter you log in, you'll be redirected to a URL.")
        print("Copy the ENTIRE URL from your browser and paste it below:")

        # Get authorization code from user
        pasted_url = input("\nPaste the redirect URL here: ").strip()

        try:
            # Extract authorization code from URL (and check the CSRF token)
            auth_code = self._extract_auth_code(pasted_url, csrf_token)

            # Exchange code for access token
            oauth.authenticate(auth_code)

            # Create Box client and test the connection before keeping it
            client = Client(oauth)
            user = client.user().get()
        except Exception as e:
            print(f"❌ Authentication failed: {e}")
            return False

        self.box_client = client
        print(f"\n✅ Successfully authenticated as: {user.name}")
        return True

    def download_file_by_id(self, file_id, local_path):
        """Download Box file using file ID

        Args:
            file_id: Box file ID.
            local_path: Destination path on the local disk.

        Returns:
            *local_path* on success, None when not authenticated or when the
            download fails.
        """

        if not self.box_client:
            print("❌ Not authenticated with Box")
            return None

        started_writing = False
        try:
            # Get file object
            file_obj = self.box_client.file(file_id).get()

            print(f"📥 Downloading: {file_obj.name}")
            print(f"📁 File size: {file_obj.size / (1024**3):.2f} GB")

            # Download file
            with open(local_path, 'wb') as local_file:
                started_writing = True
                file_obj.download_to(local_file)

            # Verify download
            downloaded_size = os.path.getsize(local_path)
            print(f"✅ Download complete: {downloaded_size / (1024**3):.2f} GB")

            return local_path

        except Exception as e:
            print(f"❌ Download failed: {e}")
            if started_writing:
                # Do not leave a truncated video behind.
                try:
                    os.remove(local_path)
                except OSError:
                    pass  # best-effort cleanup; the failure is already reported
            return None

In [None]:
# List and delete all files in your Gemini storage
# Uncomment and run this block to clean up your Gemini cloud storage if needed (for example, when Gemini reports that your storage is full).

# files = genai.list_files()
# for file in files:
#     print(f"Deleting: {file.display_name}")
#     genai.delete_file(file.name)
# print("✅ All files cleaned up!")

Deleting: box_video_1939554923607_chunk_9
Deleting: box_video_1939554923607_chunk_3
Deleting: box_video_1939554923607_chunk_2
Deleting: box_video_1939554923607_chunk_1
Deleting: box_video_1939549048509_chunk_13
Deleting: box_video_1939549048509_chunk_12
Deleting: box_video_1939549048509_chunk_11
Deleting: box_video_1939549048509_chunk_10
Deleting: box_video_1939549048509_chunk_9
Deleting: box_video_1939549048509_chunk_8
Deleting: box_video_1939549048509_chunk_7
Deleting: box_video_1939549048509_chunk_6
Deleting: box_video_1939549048509_chunk_5
Deleting: box_video_1939549048509_chunk_4
Deleting: box_video_1939549048509_chunk_3
Deleting: box_video_1939549048509_chunk_2
Deleting: box_video_1939549048509_chunk_1
✅ All files cleaned up!


In [None]:
def complete_box_to_annotations_pipeline(box_file_id, output_excel_path):
    """
    COMPLETE PIPELINE: Box OAuth → Download → Chunk if needed → Gemini → Excel

    Args:
        box_file_id: Box file ID (from the URL when viewing file in Box)
        output_excel_path: Where to save final Excel annotations

    Returns:
        A tuple ``(df, annotations)`` with the annotation DataFrame and the
        raw annotation list, or ``(None, None)`` when any step fails or no
        interactions are found.
    """

    print("🎬 COMPLETE BOX-TO-ANNOTATIONS PIPELINE")
    print("="*60)

    # Step 1: Authenticate with Box
    print("STEP 1: Box Authentication")
    downloader = BoxVideoDownloader(BOX_CLIENT_ID, BOX_CLIENT_SECRET)

    if not downloader.authenticate_with_box():
        print("❌ Box authentication failed")
        return None, None

    # Step 2: Download video from Box
    print(f"\nSTEP 2: Download Video from Box")
    temp_video_path = f"/content/downloaded_video.mp4"

    downloaded_path = downloader.download_file_by_id(box_file_id, temp_video_path)

    if not downloaded_path:
        print("❌ Video download failed")
        return None, None

    chunk_files = []
    try:
        # Step 3: Check if chunking is needed
        print(f"\nSTEP 3: Check File Size and Processing Method")
        needs_chunking, file_size_gb, chunk_duration = check_file_size_and_decide_processing(downloaded_path)

        video_name = f"box_video_{box_file_id}"

        if needs_chunking:
            print(f"📁 File is {file_size_gb:.2f} GB - using chunking method")

            # Step 4a: Split into chunks
            print(f"\nSTEP 4: Split Video into Chunks")
            chunk_files = split_video_into_chunks(downloaded_path, chunk_duration)

            if not chunk_files:
                print("❌ Video chunking failed")
                return None, None

            # Step 5a: Process chunks and combine. The chunk duration used for
            # splitting is passed on so timestamps are shifted by the real
            # chunk length rather than the function's default.
            print(f"\nSTEP 5: Process Chunks with Gemini")
            all_annotations = process_chunks_and_combine(chunk_files, video_name, chunk_duration)

        else:
            print(f"✅ File is {file_size_gb:.2f} GB - processing directly")

            # Step 4b: Process directly with Gemini
            print(f"\nSTEP 4: Process Video with Gemini")
            uploaded_file = genai.upload_file(path=downloaded_path, display_name=video_name)

            # Wait for processing
            while uploaded_file.state.name == "PROCESSING":
                print(".", end="")
                time.sleep(5)
                uploaded_file = genai.get_file(uploaded_file.name)

            if uploaded_file.state.name == "FAILED":
                print("\n❌ Gemini failed to process the video")
                return None, None

            # Analyze with your existing function
            all_annotations = analyze_video_with_gemini(uploaded_file)

        # Step 6: Create Excel output
        print(f"\nSTEP 6: Create Excel Annotations")

        if all_annotations:
            # Create DataFrame with your existing function
            df = create_annotation_dataframe(all_annotations, video_name)

            # Add chunk info if chunked
            if needs_chunking:
                chunk_numbers = [ann.get('chunk_number', 1) for ann in all_annotations]
                df['Chunk_Number'] = chunk_numbers

            # Save to Excel
            df.to_excel(output_excel_path, index=False)

            print(f"✅ SUCCESS!")
            print(f"📊 Total interactions found: {len(all_annotations)}")
            print(f"💾 Results saved to: {output_excel_path}")

            # Show sample results
            print(f"\n📋 SAMPLE INTERACTIONS:")
            for i, ann in enumerate(all_annotations[:3]):
                print(f"{i+1}. {ann.get('start_time')}-{ann.get('end_time')}: {ann.get('interaction_type')}")
                print(f"   Emotion: {ann.get('emotional_reaction')} (confidence: {ann.get('confidence_score', 0):.2f})")
                if ann.get('dialogue_captured'):
                    print(f"   Quote: \"{ann.get('dialogue_captured', '')[:50]}...\"")
                print()
        else:
            print("❌ No interactions found")
            df, all_annotations = None, None

        return df, all_annotations

    finally:
        # Step 7: Clean up temporary files. Runs on every exit path after the
        # download so a failed run does not leave large files behind.
        print(f"\nSTEP 7: Cleanup")
        try:
            os.remove(downloaded_path)
            print("🗑️ Temporary video file cleaned up")
        except OSError as cleanup_error:
            print(f"⚠️ Could not remove temporary video file: {cleanup_error}")

        for chunk_path in chunk_files:
            try:
                os.remove(chunk_path)
            except OSError:
                pass  # best-effort: a leftover chunk is not fatal

In [None]:

# Usage banner: explains where to find the Box file ID.
print(
    "🚀 READY TO PROCESS BOX VIDEOS!",
    "=" * 60,
    "To get your Box file ID:",
    "1. Go to your video in Box web interface",
    "2. Look at the URL - it will be like: https://cornell.box.com/file/123456789",
    "3. The number at the end (123456789) is your file ID",
    "",
    sep="\n",
)

# CONFIGURE YOUR VALUES HERE:
BOX_FILE_ID = "1939554923607"  # Replace with actual file ID
OUTPUT_PATH = "/content/drive/MyDrive/HRI-Annotation/astoria-park-715-landfill-014.xlsx"  # Where to save Excel file in your Google Drive

# Runs the full pipeline as soon as this cell executes; comment the line out
# if you only want to load the definitions.
df, annotations = complete_box_to_annotations_pipeline(BOX_FILE_ID, OUTPUT_PATH)

🚀 READY TO PROCESS BOX VIDEOS!
To get your Box file ID:
1. Go to your video in Box web interface
2. Look at the URL - it will be like: https://cornell.box.com/file/123456789
3. The number at the end (123456789) is your file ID

🎬 COMPLETE BOX-TO-ANNOTATIONS PIPELINE
STEP 1: Box Authentication
🔐 Starting Box OAuth 2.0 authentication...

🌐 PLEASE VISIT THIS URL TO AUTHORIZE:
https://account.box.com/api/oauth2/authorize?state=box_csrf_token_Kri2GrJIuQzxJWmx&response_type=code&client_id=4w0jjxf4r496wt5vhyg10rkw243w0m61&redirect_uri=https%3A%2F%2Firl.tech.cornell.edu%2Fauto-ethnography-vlm%2Fbox-oauth-redirect.html

After you log in, you'll be redirected to a URL.
Copy the ENTIRE URL from your browser and paste it below:

✅ Successfully authenticated as: Audrey Tjokro

STEP 2: Download Video from Box
📥 Downloading: VID_20230924_153911_00_014.mp4
📁 File size: 13.56 GB
✅ Download complete: 13.56 GB

STEP 3: Check File Size and Processing Method
📁 Video file size: 13.56 GB
⏱️ Video duration: 2

ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 71879.07ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 91165.80ms
ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 66725.38ms


Raw response: ```json
[
  {
    "start_time": "00:00",
    "end_time": "00:17",
    "interaction_type": "talking",
    "detailed_observations": "A woman with dark hair and glasses approaches the robot, holding a la...
✅ Chunk 1: Found 5 interactions

--- Processing Chunk 2/9 ---
⏳ Processing with Gemini...
.....................
✅ Gemini processing complete!
Analyzing video with Gemini...
Raw response: ```json
[
  {
    "start_time": "00:00",
    "end_time": "00:08",
    "interaction_type": "approaching",
    "detailed_observations": "A man and woman walk past the robot, pushing a wagon with a child...
✅ Chunk 2: Found 6 interactions

--- Processing Chunk 3/9 ---
⏳ Processing with Gemini...
..................
✅ Gemini processing complete!
Analyzing video with Gemini...
Raw response: ```json
[
  {
    "start_time": "00:00",
    "end_time": "00:02",
    "interaction_type": "approaching",
    "detailed_observations": "A woman with a child in a tricycle approaches the robot. She appe...
✅ C

After installing the `boxsdk`, you can run the cell with the imports again.