In [1]:
# Mount Google Drive so later cells can read clips from, and write results to,
# paths under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install -q google-generativeai
import google.generativeai as genai

genai.configure(api_key="AIzaSyDfRpbW9jf5hY9UpkPkvy_mhWJA2rVBpvs")

In [3]:
import pandas as pd
import json
import time
from pathlib import Path

def upload_video_to_gemini(video_path, poll_interval=2, timeout=600):
    """Upload a video to Gemini and block until it has finished processing.

    Args:
        video_path: Path of the local video file to upload. Its file name is
            used as the display name of the uploaded file.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum number of seconds to wait for the file to leave the
            PROCESSING state. Pass ``None`` to wait indefinitely.

    Returns:
        The file object as last returned by ``genai.upload_file`` or
        ``genai.get_file``, once it is no longer in the PROCESSING state.

    Raises:
        TimeoutError: If the file is still PROCESSING after ``timeout`` seconds.
        ValueError: If the file ends up in the FAILED state.
    """
    print(f"Uploading {video_path}...")

    # Upload the video file
    uploaded_file = genai.upload_file(
        path=video_path,
        display_name=Path(video_path).name
    )

    print("Waiting for video processing...")
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    # Poll until processing completes, but never spin forever on a stuck file.
    while uploaded_file.state.name == "PROCESSING":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Video processing did not finish within {timeout} seconds: {uploaded_file.name}"
            )
        # Flush so the progress dots appear while waiting, not all at once.
        print(".", end="", flush=True)
        time.sleep(poll_interval)
        uploaded_file = genai.get_file(uploaded_file.name)

    if uploaded_file.state.name == "FAILED":
        raise ValueError(f"Video processing failed: {uploaded_file.state}")

    print("\nVideo processed successfully!")
    return uploaded_file


In [4]:
def create_hri_annotation_prompt():
    """Build the instruction text sent to the model alongside the video.

    The prompt asks for a chronological JSON array of interactions, each with
    start/end timestamps, an interaction type from a fixed list, a description,
    a person count and a confidence score, and for an empty array when nothing
    is found.

    Returns:
        The prompt as a single multi-line string.
    """
    prompt_text = """
You are an expert HRI researcher analyzing trash barrel robot interactions in NYC.

ANALYZE this entire video and identify ALL human-robot interactions chronologically.

For each interaction, provide:
- start_time: "MM:SS" format when interaction begins
- end_time: "MM:SS" format when interaction ends
- interaction_type: choose from [approaching, avoiding, photographing, pointing, talking, helping, throwing_trash, looking, crowd_formation, no_clear_interaction]
- detailed_description: objective description of what happens
- person_count: number of people involved
- confidence_score: 0.0-1.0 based on clarity of interaction

IMPORTANT GUIDELINES:
- Include ALL visible interactions, even brief ones (2-3 seconds)
- Look for people taking photos, pointing, approaching robot
- Note conversations about the robot (use audio cues)
- Track if same people interact multiple times
- Be precise with timestamps

Return as JSON array:
[
  {
    "start_time": "00:15",
    "end_time": "00:23",
    "interaction_type": "photographing",
    "detailed_description": "Woman in blue jacket takes photo of robot with phone",
    "person_count": 1,
    "confidence_score": 0.9
  }
]

If no interactions found, return empty array: []
"""
    return prompt_text

In [5]:
def _extract_annotation_list(response_text):
    """Return the first JSON array of objects embedded in ``response_text``, or ``None``."""
    decoder = json.JSONDecoder()
    position = response_text.find('[')
    while position != -1:
        try:
            # raw_decode parses one JSON value starting at `position` and
            # ignores whatever follows it (closing code fence, trailing prose).
            candidate, _ = decoder.raw_decode(response_text, position)
        except json.JSONDecodeError:
            candidate = None
        # Downstream code calls .get() on every entry, so accept only a list
        # whose items are all JSON objects (an empty list qualifies).
        if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
            return candidate
        # This '[' was prose or an unrelated array; try the next bracket.
        position = response_text.find('[', position + 1)
    return None


def analyze_video_with_gemini(video_file, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to annotate a video and parse its JSON answer.

    Args:
        video_file: The uploaded file object to pass to the model together
            with the annotation prompt.
        model_name: Name of the Gemini model to instantiate.

    Returns:
        A list of annotation dicts parsed from the model's reply. An empty
        list is returned when the reply contains no JSON array of objects, or
        when the request or parsing raises; this is best-effort by design and
        the reason is printed in each case.
    """
    model = genai.GenerativeModel(model_name)
    prompt = create_hri_annotation_prompt()

    print("Analyzing video with Gemini...")

    try:
        # generate content with video + prompt
        response = model.generate_content([
            video_file,
            prompt
        ])

        response_text = response.text
        # Show at most the first 200 characters so the cell output stays short.
        preview = response_text[:200] + "..." if len(response_text) > 200 else response_text
        print("Raw response:", preview)

        # The reply may wrap the array in a code fence or surround it with
        # prose, so scan for the first decodable array instead of slicing
        # from the first '[' to the last ']'.
        annotations = _extract_annotation_list(response_text)
        if annotations is None:
            print("No JSON found in response")
            return []
        return annotations

    except Exception as e:
        # Deliberately broad: a failed request must not abort the notebook run.
        print(f"Error analyzing video: {e}")
        return []

In [6]:
def create_annotation_dataframe(annotations, video_name):
    """Tabulate model annotations, one row per interaction.

    Args:
        annotations: Sequence of annotation dicts; missing keys fall back to
            an empty string, a person count of 1 and a confidence of 0.0.
        video_name: Value stored in the 'tcn_layers' column of every row.

    Returns:
        A DataFrame with the research-format columns, or a completely empty
        DataFrame when ``annotations`` is empty or ``None``.
    """
    if annotations:
        rows = [
            {
                'tcn_layers': video_name,
                'Start Time': entry.get('start_time', ''),
                'End Time': entry.get('end_time', ''),
                'Event': entry.get('interaction_type', ''),
                'Observations': entry.get('detailed_description', ''),
                # The two action columns are left blank here; they can be
                # filled in manually or by follow-up prompts.
                'Initiation Action': '',
                'Ending Action': '',
                'Person Count': entry.get('person_count', 1),
                'Confidence': entry.get('confidence_score', 0.0),
            }
            for entry in annotations
        ]
        return pd.DataFrame(rows)

    # Nothing to tabulate.
    return pd.DataFrame()


In [7]:
def process_video(video_path, output_path):
    """Upload a video, annotate it with Gemini and save the result to Excel.

    Args:
        video_path: Path of the video file to analyze.
        output_path: Path of the Excel file to write the annotations to.

    Returns:
        A ``(df, annotations)`` tuple: the annotation DataFrame and the list
        of annotation dicts it was built from.
    """

    # upload video to Gemini
    video_file = upload_video_to_gemini(video_path)

    # analyze with Gemini VLM
    annotations = analyze_video_with_gemini(video_file)

    print(f"Found {len(annotations)} interactions")

    # convert to DataFrame
    video_name = Path(video_path).name
    df = create_annotation_dataframe(annotations, video_name)

    # save to Excel
    df.to_excel(output_path, index=False)
    print(f"Annotations saved to {output_path}")

    # display results
    print("\nDetected Interactions:")
    for number, ann in enumerate(annotations, start=1):
        # The model may omit a field. Use .get() with an empty default, as the
        # DataFrame conversion does, so the summary cannot raise KeyError
        # after the file has already been written.
        start = ann.get('start_time', '')
        end = ann.get('end_time', '')
        kind = ann.get('interaction_type', '')
        description = ann.get('detailed_description', '')
        print(f"{number}. {start}-{end}: {kind} - {description}")

    return df, annotations

# Run the pipeline on the sample clip stored on the mounted Drive.
video_path = "/content/drive/MyDrive/HRI-Annotation/sample_clips/sample_clip.mp4"
output_path = "/content/drive/MyDrive/HRI-Annotation/sample_annotations_output.xlsx"

# Upload, annotate and export in one call; keep both the table and the raw annotations.
df, annotations = process_video(video_path=video_path, output_path=output_path)

# Show the resulting table beneath a heading.
print("\nFinal DataFrame:", df, sep="\n")

Uploading /content/drive/MyDrive/HRI-Annotation/sample_clips/sample_clip.mp4...
Waiting for video processing...
........................
Video processed successfully!
Analyzing video with Gemini...


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 35774.30ms


Raw response: ```json
[
  {
    "start_time": "00:24",
    "end_time": "00:26",
    "interaction_type": "throwing_trash",
    "detailed_description": "A person throws a cup into the trash barrel.",
    "person_coun...
Found 10 interactions
Annotations saved to /content/drive/MyDrive/HRI-Annotation/annotations_output.xlsx

Detected Interactions:
1. 00:24-00:26: throwing_trash - A person throws a cup into the trash barrel.
2. 00:55-00:57: throwing_trash - A girl throws a juice box into the trash barrel.
3. 01:19-01:29: talking - Two people sitting at a table talk to each other about taking photos of the robot. One person takes a picture.
4. 02:20-02:31: talking - Two people sitting at a table talk about taking a picture of each other with the robot in the background. One person takes a picture.
5. 02:48-02:51: approaching - A woman and a child approach the trash barrel.
6. 03:04-03:07: talking - A woman walks towards the robot, talking to a person sitting next to the robot, before walkin