In [1]:
import os
# Placeholder path: replace it with the local StreamForest checkout before running.
# The relative paths used in later cells ("./work_dirs/video_demo/",
# "demo/video/Forrest_Gump.mp4") are resolved against this working directory.
os.chdir('/your_local_path_to/StreamForest')

In [2]:
import argparse
import torch

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import process_anyres_image,tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video_nopad

import json
import os
import math


from transformers import AutoConfig
from llava.video_utils import VIDEO_READER_FUNCS


def split_list(lst, n):
    """Split a list into at most ``n`` contiguous chunks of roughly equal size.

    Args:
        lst: Sequence to partition (anything supporting ``len`` and slicing).
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``. Every chunk except possibly the last holds
        ``ceil(len(lst) / n)`` items, so fewer than ``n`` chunks are returned
        when the items do not spread that far. An empty ``lst`` yields ``[]``.
    """
    if len(lst) == 0:
        # ceil(0 / n) is 0 and range() rejects a zero step, so answer directly.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division, not floor
    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return chunk number ``k`` (0-based) of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]


def parse_args(argv=None):
    """
    Build the demo's configuration namespace from argparse definitions.

    Args:
        argv: Optional list of command-line style tokens, e.g.
            ``["--max_num_frames", "64"]``. ``None`` (the default) parses an
            empty list, so only the defaults below apply and the notebook
            kernel's own ``sys.argv`` is never read.

    Returns:
        argparse.Namespace holding every option defined below.
    """
    def str2bool(value):
        # argparse's ``type=bool`` turns any non-empty string (even "False")
        # into True, so boolean options are parsed from their text instead.
        return str(value).lower() == 'true'

    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument("--output_dir", default="./work_dirs/video_demo/", help="Directory to save the model results JSON.")
    parser.add_argument("--output_name", default="pred", help="Name of the file for storing results JSON.")
    parser.add_argument("--model-path", type=str, default="/your_local_path_to/StreamForest/ckpt/StreamForest-Qwen2-7B_Siglip")
    parser.add_argument("--inference_device", type=str, default="cuda:0")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--conv-mode", type=str, default="qwen_2")
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--max_num_frames", type=int, default=4096)
    parser.add_argument("--load_8bit", type=str2bool, default=False)
    parser.add_argument("--force_sample", type=str2bool, default=False)
    parser.add_argument("--time_msg", type=str, default="")
    parser.add_argument("--llm_type", type=str, default="")
    parser.add_argument("--attn_implementation", type=str, default="flash_attention_2")
    # Same text-based parsing as the other boolean options (was ``type=bool``).
    parser.add_argument("--use_hd", type=str2bool, default=False)

    args = parser.parse_args(args=[] if argv is None else list(argv))
    return args

# Shared configuration namespace: the cells below read it and attach further
# attributes to it (e.g. add_time_instruction after the model is loaded).
args = parse_args()




[2025-09-26 01:08:10,972] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Extra keyword arguments expanded into the load_pretrained_model() call in the
# next cell. Empty by default, i.e. no additional arguments are passed.
llava_model_args = {
}

# Optional recipe (disabled): pass an "overwrite_config" entry that replaces the
# projector type. Uncomment the lines below and pick a projector to enable it.
# overwrite_config = {}
# mm_projector_type = None
# mm_projector_type = "tome196_memory_1k"
# if mm_projector_type is not None and mm_projector_type != "":
#     print("<<< warning >>> replace projector with: ", mm_projector_type)
#     overwrite_config["mm_projector_type"] = mm_projector_type
# llava_model_args["overwrite_config"] = overwrite_config

In [4]:
# Build the tokenizer, model and image processor from the checkpoint.
# The model name is the checkpoint-derived name with the optional llm_type suffix.
model_name = get_model_name_from_path(args.model_path) + args.llm_type
cfg_pretrained = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
tokenizer, model, image_processor, context_len = load_pretrained_model(
    args.model_path,
    args.model_base,
    model_name,
    load_8bit=args.load_8bit,
    multimodal=True,
    trust_remote_code=True,
    attn_implementation=args.attn_implementation,
    **llava_model_args,
)
model.to(torch.float16)

print("Model tensor type: ", model.dtype)

# Mirror optional checkpoint settings onto args; a missing or None value means False.
args.force_sample = getattr(model.config, "force_sample", None)
if args.force_sample is None:
    args.force_sample = False

args.add_time_instruction = getattr(model.config, "add_time_instruction", None)
if args.add_time_instruction is None:
    args.add_time_instruction = False

# Make sure the directory for the prediction file exists.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

Loaded LLaVA model: /your_local_path_to/StreamForest/ckpt/StreamForest-Qwen2-7B_Siglip


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type qwen2 to instantiate a model of type llava_qwen. This is not supported for all configurations of models and can yield errors.


Loading vision tower: /mnt/petrelfs/share/videointern/siglip/siglip-so400m-patch14-384
<<<mm_projector time_pos_embedding_window>>> :  512
pos_emb shape torch.Size([512, 1, 1, 1152])
<<< self.sim_weight_g:  0.4 >>>
<<< self.time_weight_a:  0.2 >>>
<<< self.merge_weight_b:  0.4 >>>




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model Class: LlavaQwenForCausalLM
Model tensor type:  torch.float16


In [5]:
def load_video(video_path, args, question_time=0):
    """Read frames from a video and build the matching time-description text.

    Args:
        video_path: Path of a video file, a directory (read with the 'img'
            reader), an ``s3://`` URI, or a single-element list/tuple wrapping
            one of those.
        args: Namespace providing ``max_num_frames`` and ``time_msg``.
        question_time: When greater than 0, the reader is restricted to the
            clip ``[0, question_time]`` (presumably seconds -- confirm against
            the reader implementation); otherwise no clip is passed.

    Returns:
        Tuple ``(frames, msg)``: the frames returned by the reader and the
        time message selected by ``args.time_msg`` ("" when it is None).
    """
    # Unwrap a single-element container first: os.path.isdir() and the
    # 's3://' test below both require a plain string.
    if not isinstance(video_path, str):
        assert len(video_path) == 1, video_path
        video_path = video_path[0]

    # Directories are read by the 'img' reader, everything else by 'decord'.
    video_read_type = 'img' if os.path.isdir(video_path) else 'decord'

    clip = [0, question_time] if question_time > 0 else None

    if 's3://' in video_path:
        # Imported lazily so local-only runs do not need petrel_client installed.
        from petrel_client.client import Client
        client = Client(conf_path='~/petreloss.conf')
    else:
        client = None

    max_frames_num = args.max_num_frames

    frames, frame_indices, fps, duration = VIDEO_READER_FUNCS[video_read_type](
        video_path=video_path,
        num_frames=max_frames_num,
        sample='dynamic_fps1',
        fix_start=None,
        min_num_frames=4,
        max_num_frames=max_frames_num,
        client=client,
        clip=clip,
        local_num_frames=1,
    )
    # Timestamp of every sampled frame, rounded to 0.1 and kept as text.
    sec = [str(round(f / fps, 1)) for f in frame_indices]

    if args.time_msg is not None:
        if args.time_msg == 'short':
            msg = f"\nThe video lasts for {duration:.2f} seconds, and {len(sec)} frames are uniformly sampled from it. "
        elif args.time_msg == 'short_online':
            msg = f"\nThe video segment contains {len(sec)} frames sampled from the past {(float(sec[-1])-float(sec[0])):.1f} seconds ago up to the present moment. "
        elif args.time_msg == 'short_online_v2':
            msg = f"\nThe video contains {len(sec)} frames sampled from the past {(float(sec[-1])-float(sec[0])):.1f} seconds ago ({float(sec[0]):.1f}s of the entire video) up to the present moment ({float(sec[-1]):.1f}s of the entire video). "
        elif args.time_msg == 'short_online_per_frame':
            msg_overall = f"\nThe video contains {len(sec)} frames sampled from the past {(float(sec[-1])-float(sec[0])):.1f} seconds ago ({float(sec[0]):.1f}s of the entire video) up to the present moment ({float(sec[-1]):.1f}s of the entire video). "
            msg_per_frame = ''.join([f"[TIME_MSG_PER_FRAME]{sec_time} seconds" for sec_time in sec]) + "[TIME_MSG_PER_FRAME]"
            msg = msg_overall + msg_per_frame
        else:
            # Any other value (including the default "") lists every timestamp.
            msg = f"\nThe video lasts for {duration:.2f} seconds, and {len(sec)} frames are uniformly sampled at {', '.join(sec)} seconds. "
    else:
        msg = ""

    return frames, msg

In [6]:
import time
import torch.profiler

# Predictions go to <output_dir>/<output_name>.json, one JSON object per line.
output_name = args.output_name
answers_file = os.path.join(args.output_dir, f"{output_name}.json")
# run_inference() serializes with ensure_ascii=False, so non-ASCII text reaches
# this file verbatim; pin UTF-8 rather than relying on the platform default
# encoding. Mode "w" truncates any previous results each time this cell runs.
ans_file = open(answers_file, "w", encoding="utf-8")

def run_inference(args, video_path, question, question_time=0):
    """
    Answer one question about one video with the model loaded in this notebook.

    Relies on the notebook-level objects ``model``, ``tokenizer``,
    ``image_processor``, ``cfg_pretrained`` and ``ans_file``.

    Args:
        args: Namespace providing ``inference_device``, ``time_msg`` and
            ``conv_mode`` (plus whatever ``load_video`` reads from it).
        video_path: Local path or ``s3://`` URI of the video.
        question: Question text placed after the image token in the prompt.
        question_time: Forwarded unchanged to ``load_video``.

    Returns:
        None. The response is printed and appended to ``ans_file`` as one JSON
        line with the keys ``Q``, ``video_name`` and ``pred``.
    """
    print("video_path:", video_path)
    sample_set = {"Q": question, "video_name": video_path}

    # Only local paths can be checked up front; s3:// URIs are left to the reader.
    assert 's3://' in video_path or os.path.exists(video_path), video_path

    frames, time_msg = load_video(video_path, args, question_time)
    print("len(frames):", len(frames))
    image_sizes = [frames[0].shape[:2]]
    print("image_sizes:", image_sizes)

    frames = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(dtype=model.dtype).cuda(args.inference_device)

    print("input frames:", frames.shape)

    video = [frames]

    # Assemble the user turn: optional time message, then the question,
    # all preceded by the image placeholder token(s).
    qs = question
    if args.time_msg != "":
        qs = f'{time_msg.strip()}\n{qs}'

    if model.config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    print(f"Question: {qs}")
    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda(args.inference_device)
    if tokenizer.pad_token_id is None:
        if "qwen" in tokenizer.name_or_path.lower():
            print("Setting pad token to bos token for qwen model.")
            tokenizer.pad_token_id = 151643

    if tokenizer.pad_token_id is None:
        # Still no pad id (non-Qwen tokenizer): comparing against None would
        # raise. The batch is a single unpadded prompt, so attend everywhere.
        attention_masks = torch.ones_like(input_ids, dtype=torch.long)
    else:
        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda(args.inference_device)

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    start_time = time.time()

    with torch.inference_mode():
        if "mistral" not in cfg_pretrained._name_or_path.lower():
            # NOTE(review): max_new_tokens=1 limits each generation to a single
            # token -- presumably a latency-measurement setting; confirm.
            output_ids = model.generate_online(
                inputs=input_ids,
                images=video,
                attention_mask=attention_masks,
                modalities=["video"],
                image_sizes=image_sizes,
                do_sample=False,
                temperature=0.0,
                max_new_tokens=1,
                num_beams=1,
                use_cache=True,
                stopping_criteria=[stopping_criteria]
            )

        else:
            output_ids = model.generate(inputs=input_ids, images=video, attention_mask=attention_masks, modalities="video", do_sample=False, temperature=0.0, max_new_tokens=1024, top_p=0.1, num_beams=1, use_cache=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"\n <<< Total inference time: {elapsed_time:.3f} seconds >>> \n")
    # Throughput is the number of input frames divided by the wall-clock time.
    print(f"\n <<< Total average speed: {video[0].shape[0]/elapsed_time:.3f} fps >>> \n")

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    print(f"Question: {prompt}\n")
    print(f"Response: {outputs}\n")

    # Drop a trailing stop string from the decoded text (non-mistral models only).
    if "mistral" not in cfg_pretrained._name_or_path.lower():
        if outputs.endswith(stop_str):
            outputs = outputs[: -len(stop_str)]

    outputs = outputs.strip()

    sample_set["pred"] = outputs
    ans_file.write(json.dumps(sample_set, ensure_ascii=False) + "\n")
    # Flush so the result is on disk even though the handle stays open.
    ans_file.flush()

In [7]:
# StreamForest demo run
video_path = "demo/video/Forrest_Gump.mp4"
question = "Please describe the content of the video in detail."
# Alternative prompt:
# question = "Is there a man in black in the picture?"

# Forwarded to load_video(), which reads only the clip [0, question_time].
question_time = 600
run_inference(args, video_path, question, question_time=question_time)


video_path: demo/video/Forrest_Gump.mp4
len(frames): 601
image_sizes: [(320, 752)]
input frames: torch.Size([601, 3, 384, 384])
Question: <image>
Please describe the content of the video in detail.





 <<< LLM inference time: 0.157 seconds >>> 
result at frame 0 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.028 seconds >>> 
result at frame 1 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.028 seconds >>> 
result at frame 2 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.046 seconds >>> 
result at frame 3 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.032 seconds >>> 
result at frame 4 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.032 seconds >>> 
result at frame 5 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.032 seconds >>> 
result at frame 6 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.031 seconds >>> 
result at frame 7 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.032 seconds >>> 
result at frame 8 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.032 seconds >>> 
result at frame 9 : tensor([[785]], device='cuda:0')

 <<< LLM inference 


 <<< LLM inference time: 0.087 seconds >>> 
result at frame 85 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.089 seconds >>> 
result at frame 86 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 87 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 88 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 89 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 90 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 91 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 92 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 93 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 94 : tensor([[785]], device='cuda:0')

 <<< LLM 


 <<< LLM inference time: 0.093 seconds >>> 
result at frame 169 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 170 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 171 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 172 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 173 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 174 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 175 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 176 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 177 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 178 : tensor([[785]], device='cuda:0')



 <<< LLM inference time: 0.089 seconds >>> 
result at frame 253 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 254 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.085 seconds >>> 
result at frame 255 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 256 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 257 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 258 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 259 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 260 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.090 seconds >>> 
result at frame 261 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 262 : tensor([[785]], device='cuda:0')



 <<< LLM inference time: 0.086 seconds >>> 
result at frame 337 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 338 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 339 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 340 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 341 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 342 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 343 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 344 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 345 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 346 : tensor([[785]], device='cuda:0')



 <<< LLM inference time: 0.088 seconds >>> 
result at frame 421 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 422 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 423 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 424 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.089 seconds >>> 
result at frame 425 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 426 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.089 seconds >>> 
result at frame 427 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 428 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 429 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 430 : tensor([[785]], device='cuda:0')



 <<< LLM inference time: 0.086 seconds >>> 
result at frame 505 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 506 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 507 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 508 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 509 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.086 seconds >>> 
result at frame 510 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 511 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 512 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 513 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 514 : tensor([[785]], device='cuda:0')



 <<< LLM inference time: 0.088 seconds >>> 
result at frame 589 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 590 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 591 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 592 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 593 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 594 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.087 seconds >>> 
result at frame 595 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.088 seconds >>> 
result at frame 596 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.089 seconds >>> 
result at frame 597 : tensor([[785]], device='cuda:0')

 <<< LLM inference time: 0.093 seconds >>> 
result at frame 598 : tensor([[785]], device='cuda:0')
