In [1]:
!pip install -q av transformers torchvision pillow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.7/38.7 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q -U bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import torch
import av
from PIL import Image
import numpy as np
from torchvision import transforms
import time
import cv2 # Add the cv2 import here

In [4]:
# Define paths
output_dir = "/kaggle/input/finetuning-qna-next/fine-tuned-llava-next-video"
offload_dir = "/kaggle/working/offload_dir"

# Create offload directory
os.makedirs(offload_dir, exist_ok=True)

# Verify directory contents
print("Contents of the directory:", os.listdir(output_dir))

# Load the processor
processor = LlavaNextVideoProcessor.from_pretrained(output_dir)

# Load the model with 4-bit quantization
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    output_dir,
    device_map="auto",
    offload_folder=offload_dir,
    load_in_4bit=True,  # Enable 4-bit quantization
    torch_dtype=torch.float16,  # Use mixed precision
)

Contents of the directory: ['adapter_model.safetensors', 'preprocessor_config.json', 'training_args.bin', 'adapter_config.json', 'README.md', 'tokenizer.json', 'tokenizer_config.json', 'chat_template.json', 'processor_config.json', 'special_tokens_map.json', 'tokenizer.model', 'added_tokens.json']


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [5]:
# Compile the model
model = torch.compile(model) # <-- Add this

# Inspect the device map
print("Device map:", model.hf_device_map)

Device map: {'': 0}


In [6]:
# Function to read video frames using cv2
def read_video_opencv(video_path, indices, target_size=(224, 224)):
    frames = []
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None

    for i in range(max(indices) + 1):
        ret, frame = cap.read()
        if not ret:
            break  # Or handle missing frames
        if i in indices:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, target_size)
            frames.append(frame)
    cap.release()
    return np.stack(frames)

In [7]:
# Function to format the conversation
def format_conversation(conversation):
    formatted_text = ""
    for turn in conversation:
        role = turn["role"]
        content = turn["content"]
        if role == "user":
            for item in content:
                if item["type"] == "text":
                    formatted_text += f"User: {item['text']}\n"
                elif item["type"] == "video":
                    formatted_text += "<video>\n"
        elif role == "assistant":
            formatted_text += f"Assistant: {content}\n"
    return formatted_text

In [8]:
# Function to perform model inference
def perform_inference(video_path, question):
    # Load a video and sample frames
    start_time = time.time()
    container = av.open(video_path)
    total_frames = container.streams.video[0].frames
    indices = np.linspace(0, total_frames - 1, num=2, dtype=int)  # Sample 2 frames
    container.close()
    video_frames = read_video_opencv(video_path, indices)
    print("Time taken for Video loading:", time.time() - start_time)

    # Prepare the conversation prompt
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "video"},
            ],
        },
    ]

    # Format the conversation
    formatted_text = format_conversation(conversation)

    # Tokenize the input
    inputs = processor(
        text=formatted_text,
        videos=video_frames,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Move inputs to the same device as the model
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # Generate predictions
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, num_beams=1, max_new_tokens=100)  # Greedy search and reduced output length

    print("Time Taken for Model Inference:", time.time() - start_time)
    # Decode the generated text
    generated_text = processor.decode(outputs[0], skip_special_tokens=True)
    print("Generated Text:", generated_text)

In [9]:
# Example usage for one video and one question
video_path = "/kaggle/input/videocap/processed_data/processed_data/Dhime/-Amazing Newari Dance _ Patan Darbar Square _ Dhime Baja--Amrita Shrestha--mhlCZTGyl1E_segment1_clip_1.mp4"
question = "What dance is being performed on the video?"

# Perform inference
perform_inference(video_path, question)

Time taken for Video loading: 1.2496576309204102




Time Taken for Model Inference: 10.963911056518555
Generated Text: User: What dance is being performed on the video?



The video shows a group of people performing a dance in front of a large crowd. The dance involves a lot of movement and energy, with the dancers moving around the stage and interacting with each other. The audience is seen cheering and clapping throughout the performance. The video captures the excitement and energy of the event, with the crowd's reactions adding to the overall atmosphere. The dancers are dressed in colorful costumes, and the stage is decor


In [10]:
video_path = "/kaggle/input/videocap/processed_data/processed_data/Bhairab Dance/Best Bhairab Dance 2022 _  Sandhikharka Arghakhanchi लाखे नाच 2079 _ Newari Culture--Nepal Vlogs--UC-bXEel_J6IgXuecrewPM7A--rOR8D8PsA5M_segment4_clip_1.mp4"
question = "Which culture does this video dance belongs to?"

perform_inference(video_path, question)

Time taken for Video loading: 0.6944847106933594
Time Taken for Model Inference: 9.984693050384521
Generated Text: User: Which culture does this video dance belongs to?



The video shows a group of people performing a traditional dance in front of a crowd. The dance involves a lot of movement and energy, with the dancers wearing colorful costumes and performing elaborate makeup. The dance is accompanied by music and appears to be a part of a larger cultural event or festival. The video does not provide any specific information about the cultural background of the dance, but it is clear that it is a traditional and culturally significant performance.


In [11]:
video_path = "/kaggle/input/videocap/processed_data/processed_data/Dhime/-DHIME DANCE AND MUSIC _ Culture Of Newar--Sam Audio Video--S4PcERz4Eio_segment2_clip_1.mp4"
question = "What are the instruments that are being playing in the video?"

perform_inference(video_path,question)

Time taken for Video loading: 0.8133163452148438
Time Taken for Model Inference: 5.645225763320923
Generated Text: User: What are the instruments that are being playing in the video?



The instruments being played in the video are the tabla and the harmonium. The tabla is a percussion instrument that is played with the hands, while the harmonium is a stringed instrument that is played with a bow.
