In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from PIL import Image
from IPython.display import Markdown, clear_output, display, Video
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoModel, AutoImageProcessor

# Checkpoint identifier shared by the model and the processor loaded below.
model_path = "DAMO-NLP-SG/VideoLLaMA3-7B"
# Load the causal-LM weights. CUDA_VISIBLE_DEVICES was set to "0" at the top of
# this cell, so only that GPU is visible to the process.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,  # NOTE(review): presumably runs modeling code shipped with the checkpoint — only use with a trusted source
    device_map="auto",  # NOTE(review): presumably leaves device placement to the loader — confirm
    torch_dtype=torch.bfloat16,  # later cells cast pixel_values to the same dtype
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2"  # NOTE(review): presumably needs the flash-attn package installed — confirm
)
# Processor from the same checkpoint; later cells call it to build model inputs
# from a conversation and to decode generated ids via batch_decode.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

In [2]:
import cv2
import matplotlib.pyplot as plt
import numpy as np

def sample_frames_from_video(video_path, num_frames=64):
    """Sample up to ``num_frames`` RGB frames spread evenly over a video.

    Frame ``i`` of the sample is taken at index ``(i * total) // count`` where
    ``total`` is the frame count reported by OpenCV and
    ``count = min(num_frames, total)``. Compared with a fixed floored step this
    (a) never returns the same frame repeatedly when the video is shorter than
    ``num_frames`` and (b) covers the whole clip instead of only its beginning
    when ``total`` is not a multiple of ``num_frames``. When ``total`` is an
    exact multiple of ``num_frames`` the sampled indices are unchanged.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to return.

    Returns:
        A list of frames converted from BGR to RGB. The list is empty when the
        video cannot be opened, reports no frames, or ``num_frames <= 0``; it
        is shorter than requested when the video has fewer frames or a read
        fails part-way through.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Never ask for more frames than the container reports.
        count = min(num_frames, total_frames)
        if count <= 0:
            return frames

        for i in range(count):
            # Integer arithmetic keeps the index inside [0, total_frames).
            frame_index = (i * total_frames) // count
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)

            ret, frame = cap.read()
            if not ret:
                # Decoding failed; keep what was collected so far.
                break

            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if seeking or colour conversion raises.
        cap.release()
    return frames

def display_frames_grid(frames, grid_size=(8, 8)):
    """Show frames in a ``rows x cols`` matplotlib grid with axes hidden.

    Args:
        frames: Sequence of images accepted by ``Axes.imshow``. Frames beyond
            ``rows * cols`` are not drawn; unused cells are left blank.
        grid_size: ``(rows, cols)`` of the subplot grid.
    """
    rows, cols = grid_size
    # squeeze=False always yields a 2-D array of Axes, so ``axes.flat`` also
    # works for a 1x1 grid (where subplots would otherwise return a bare Axes).
    _, axes = plt.subplots(rows, cols, figsize=(16, 10), squeeze=False)

    for i, ax in enumerate(axes.flat):
        if i < len(frames):
            ax.imshow(frames[i])
        # Hide ticks and borders for filled and empty cells alike.
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# video_path = '/ssd3/cheng/tabletennis/video1.mp4'
# display(Video(video_path, width=576, height=1024, embed=True))

## Default

In [None]:

"""
conversation = [
    {        
        "role": "user",
        "content": [
            {
                "type": "video", 
                "video": {"video_path": video_path, "fps": 1, "max_frames": 180}
            },
            {
                "type": "text", 
                "text": "Describe the landing point of a ping pong match "
            },
        ]
    }
]


# Single-turn conversation
inputs = processor(conversation=conversation, return_tensors="pt")
inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=256)
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
display(Markdown(response))
"""

## Add limitation

In [None]:
"""
import json

def generate_caption(video_path, video_height, video_width):
    # You can display the video for debugging if needed
    # display(Video(video_path, width=video_width, height=video_height, embed=True))
    
    conversation = [
        {        
            "role": "user",
            "content": [
                {
                    "type": "video", 
                    "video": {"video_path": video_path, "fps": 1, "max_frames": 180}
                },
                {
                    "type": "text", 
                    "text": "Describe the video in detail."
                },
            ]
        }
    ]

    # Single-turn conversation
    inputs = processor(conversation=conversation, return_tensors="pt")
    inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

    output_ids = model.generate(**inputs, max_new_tokens=256)
    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    
    return response

# Load JSON data
input_file = "/ssd3/chunlin/smp_video_2025/test_data_combined/merged_data.json"  # Update with your actual JSON file path
with open(input_file, "r") as f:
    json_data = f.readlines()

# Parse each line as a separate JSON object
video_data = []
for line in json_data:
    if line.strip():  # Skip empty lines
        video_data.append(json.loads(line.strip()))

# Initialize list to store caption results
caption_results = []

# Process each video
for i, data in enumerate(video_data):
    print(f"Processing video {i+1}/{len(video_data)}: {data['vid']}")
    
    # Extract necessary information
    pid = data["pid"]
    uid = data["uid"]
    vid = data["vid"]
    video_path = data["video_path"]
    video_height = data["video_height"]
    video_width = data["video_width"]
    
    try:
        # Generate caption
        caption = generate_caption(video_path, video_height, video_width)
        
        # Create result object
        result = {
            "pid": pid,
            "uid": uid,
            "vid": vid,
            "caption": caption
        }
        
        # Add to results
        caption_results.append(result)
        
        # Print progress
        print(f"Caption generated for {vid}")
    except Exception as e:
        print(f"Error processing video {vid}: {str(e)}")

# Save results to caption.json
output_file = "caption.json"
with open(output_file, "w") as f:
    for result in caption_results:
        f.write(json.dumps(result) + "\n")

print(f"Captions saved to {output_file}")
"""

## Main

#### prompt
Q1, Q2: "Describe the video in detail." (Q2 uses the limitation)  

Q3: "Based on the content of this TikTok video, determine its category from the following list: Dance, Comedy, Lip Sync, Tutorial, Beauty & Fashion, Fitness, Food & Drink, Pets & Animals, Vlogging, Challenges, Memes, Technology, Travel, Motivation & Inspiration, Art & Creativity, Sports, Music, Social Issues, Unboxing, Pranks, and Others. Provide a short explanation for your classification."  

Q4: "Provide a short description for this video."

Q5: "Analyze the video and provide detailed information about the subtitles or captions, including:
- Whether subtitles are present or not.
- The language(s) used in the subtitles.
- The style of subtitles (e.g., font size, color, position, animation).
- How subtitles contribute to the viewer's understanding and engagement.
- Any noticeable timing or synchronization features with the spoken content.
Please respond with concise bullet points."

Q6: "Based on the content, style, and context of the video, identify the most likely target audience by describing:
- Age group(s) that would be interested.
- Interests or hobbies relevant to the video.
- Geographic or cultural background if identifiable.
- Reasons or clues from the video that support your audience inference.
Please provide your answer as brief bullet points."

In [None]:
import json
import time
import random


# Prompt sent when the caller does not supply one (the Q5 subtitle analysis).
# The pieces are concatenated without separators, matching the original
# single-line prompt exactly.
DEFAULT_CAPTION_PROMPT = (
    "Analyze the video and provide detailed information about the subtitles or captions, including:"
    "- Whether subtitles are present or not."
    "- The language(s) used in the subtitles."
    "- The style of subtitles (e.g., font size, color, position, animation)."
    "- How subtitles contribute to the viewer's understanding and engagement."
    "- Any noticeable timing or synchronization features with the spoken content."
    "Please respond with concise bullet points."
)


def _is_prompt_echo(response, prompt, min_extra_chars=22):
    """Return True when ``response`` contains ``prompt`` and adds almost nothing to it."""
    prompt = prompt.strip()
    return bool(prompt) and prompt in response and len(response) - len(prompt) < min_extra_chars


def generate_caption(video_path, retry_count=3, prompt=DEFAULT_CAPTION_PROMPT,
                     max_frames=180, max_new_tokens=256):
    """Ask the globally loaded model a question about one video.

    The echo check compares the response against the prompt that was actually
    sent; previously it looked for a different, hard-coded prompt and therefore
    could never trigger a retry.

    Args:
        video_path: Path of the video file on disk.
        retry_count: Maximum number of attempts.
        prompt: Text instruction placed after the video in the conversation.
        max_frames: ``max_frames`` value passed in the video entry (sampled at fps 1).
        max_new_tokens: Generation length limit passed to ``model.generate``.

    Returns:
        The decoded, stripped response, or a string starting with ``"ERROR:"``
        when the file is missing, the model only echoes the prompt, or every
        attempt raises. Exceptions from processing/generation are not
        propagated; they are reported through that string.
    """
    # Check if file exists
    if not os.path.exists(video_path):
        return f"ERROR: Video file not found: {video_path}"

    for attempt in range(retry_count):
        is_last_attempt = attempt >= retry_count - 1
        try:
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "video",
                            "video": {"video_path": video_path, "fps": 1, "max_frames": max_frames}
                        },
                        {
                            "type": "text",
                            "text": prompt
                        },
                    ]
                }
            ]

            # Free up cached GPU memory before processing
            torch.cuda.empty_cache()

            # Build model inputs; tensors go to the GPU, other values pass through.
            inputs = processor(conversation=conversation, return_tensors="pt")
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            if "pixel_values" in inputs:
                # Match the dtype the model was loaded with.
                inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

            # Generate and decode
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
            response = processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            if not _is_prompt_echo(response, prompt):
                return response

            # The model essentially repeated the prompt back.
            if is_last_attempt:
                return "ERROR: Model only returned the prompt after multiple attempts"
            print(f"Attempt {attempt+1}: Got prompt repetition, retrying...")
            time.sleep(random.uniform(3, 6))  # Random delay between retries

        except Exception as e:
            if is_last_attempt:
                return f"ERROR: Failed after {retry_count} attempts. Last error: {str(e)}"
            print(f"Attempt {attempt+1} failed with error: {str(e)}, retrying...")
            time.sleep(random.uniform(2, 5))  # Random delay between retries

    # Only reached when retry_count <= 0.
    return "ERROR: Failed to generate caption after multiple attempts"


# Input / output locations. The files are read and written as JSON Lines:
# one JSON object per line.
input_file = "../processed_data/train_cleaned_data.json"  # Update with your actual JSON file path
output_file = "caption_test_try.json"
checkpoint_file = "caption_checkpoint.json"  # To save progress periodically

# Errors that mean "this line is not a usable record": invalid JSON
# (json.JSONDecodeError is a ValueError), a missing "vid" key, or a value that
# is not a mapping / not hashable. Anything else (e.g. KeyboardInterrupt) is
# deliberately allowed to propagate.
_BAD_RECORD_ERRORS = (ValueError, KeyError, TypeError)

# Check if there's a checkpoint file to resume from
processed_videos = set()
if os.path.exists(checkpoint_file):
    skipped_checkpoint_lines = 0
    with open(checkpoint_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                processed_videos.add(json.loads(line)["vid"])
            except _BAD_RECORD_ERRORS:
                # e.g. a line truncated by an interrupted run
                skipped_checkpoint_lines += 1
    print(f"Resuming from checkpoint, {len(processed_videos)} videos already processed")
    if skipped_checkpoint_lines:
        print(f"Warning: skipped {skipped_checkpoint_lines} unreadable checkpoint lines")

# Parse each input line as a separate JSON object, keeping only videos that
# are not already in the checkpoint.
video_data = []
skipped_input_lines = 0
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:  # Skip empty lines
            continue
        try:
            data = json.loads(line)
            if data["vid"] not in processed_videos:
                video_data.append(data)
        except _BAD_RECORD_ERRORS:
            skipped_input_lines += 1

if skipped_input_lines:
    print(f"Warning: skipped {skipped_input_lines} unreadable input lines")
print(f"Found {len(video_data)} videos to process")

def _append_json_line(path, record):
    """Append ``record`` to ``path`` as a single JSON-encoded line."""
    with open(path, "a") as sink:
        sink.write(json.dumps(record) + "\n")


# Caption every pending video. Each result is written to the output file and
# to the checkpoint right away so an interrupted run can resume; short random
# pauses separate the videos, with a longer pause after every 100th one.
total_pending = len(video_data)
for i, data in enumerate(video_data):
    done_so_far = i + 1
    try:
        pid, uid, vid = data["pid"], data["uid"], data["vid"]
        video_path = data["video_path"]

        print(f"Processing {done_so_far}/{total_pending}: {vid}")

        caption = generate_caption(video_path)
        result = {"pid": pid, "uid": uid, "vid": vid, "caption": caption}

        # Output first, then checkpoint.
        _append_json_line(output_file, result)
        _append_json_line(checkpoint_file, result)

        # Pause 1-3 seconds before the next video.
        delay = random.uniform(1, 3)
        print(f"Waiting {delay:.2f} seconds before next video...")
        time.sleep(delay)

        # Longer 15-30 second cooldown after every 100th video, then drop the GPU cache.
        if done_so_far % 100 == 0:
            print(f"Completed {done_so_far} videos. Taking a longer break...")
            time.sleep(random.uniform(15, 30))
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error processing video data: {str(e)}")
        # Record the failure in both files so the video still has an entry.
        error_result = {
            "pid": data.get("pid", "unknown"),
            "uid": data.get("uid", "unknown"),
            "vid": data.get("vid", "unknown"),
            "caption": f"ERROR: Processing failed with error: {str(e)}"
        }
        for target in (output_file, checkpoint_file):
            _append_json_line(target, error_result)

print(f"Caption generation completed. Results saved to {output_file}")