In [None]:
# Install the third-party packages this notebook needs into the kernel's
# environment (%pip targets the running kernel, unlike !pip).
# NOTE(review): no versions are pinned, so a fresh run may pull different
# releases than the ones this notebook was developed against.
%pip install omegaconf
%pip install iopath
%pip install timm
%pip install decord
%pip install webdataset
%pip install einops
%pip install wandb
%pip install gradio
%pip install torchshow
%pip install sentencepiece

In [None]:
import argparse
import os
import random
import json
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torchshow as ts
from timechat.common.config import Config
from timechat.common.dist_utils import get_rank
from timechat.common.registry import registry
from timechat.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle, conv_llava_llama_2
import decord
import cv2
import time
import subprocess
from decord import VideoReader
from timechat.processors.video_processor import ToTHWC, ToUint8, load_video
decord.bridge.set_bridge('torch')

# imports modules for registration
from timechat.datasets.builders import *
from timechat.models import *
from timechat.processors import *
from timechat.runners import *
from timechat.tasks import *

import random as rnd
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
import gradio as gr

In [None]:
def parse_args():
    """Build the demo's argument parser and return its default values.

    The parser is fed an empty argument list (``args=[]``) so that the
    notebook kernel's own command line is never interpreted; every field of
    the returned namespace therefore holds the default declared below.

    Returns:
        argparse.Namespace with ``cfg_path``, ``gpu_id``, ``num_beams``,
        ``temperature``, ``text_query``, ``video_path`` and ``options``.
    """
    cli = argparse.ArgumentParser(description="Demo")

    # Single-valued options, registered in the same order as they appear in --help.
    simple_options = (
        ("--cfg-path", dict(default='eval_configs/timechat.yaml', help="path to configuration file.")),
        ("--gpu-id", dict(type=int, default=0, help="specify the gpu to load the model.")),
        ("--num-beams", dict(type=int, default=1)),
        ("--temperature", dict(type=float, default=1.0)),
        ("--text-query", dict(default="What is he doing?", help="question the video")),
        ("--video-path", dict(default='examples/vein.mp4', help="path to video file.")),
    )
    for flag, spec in simple_options:
        cli.add_argument(flag, **spec)

    # Free-form key=value overrides; accepts one or more values.
    options_help = (
        "override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead."
    )
    cli.add_argument("--options", nargs="+", help=options_help)

    return cli.parse_args(args=[])

In [None]:
# Build the configuration from the notebook's default arguments.
print('Initializing Chat')
args = parse_args()
cfg = Config(args)

# Location of the checkpoint that is injected into the model config below.
DIR = "ckpt/timechat"
MODEL_DIR = f"{DIR}/timechat_7b.pth"

# Point the model config at the chosen GPU and checkpoint.
model_config = cfg.model_cfg
model_config.device_8bit = args.gpu_id
model_config.ckpt = MODEL_DIR

# Look up the model class registered under the configured architecture,
# instantiate it, move it to the selected CUDA device and switch to eval mode.
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config)
model = model.to(f'cuda:{args.gpu_id}')
model.eval()

# The visual processor is built from the `webvid` dataset's `train` processor config.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor_cls = registry.get_processor_class(vis_processor_cfg.name)
vis_processor = vis_processor_cls.from_config(vis_processor_cfg)

In [None]:
# Wrap the loaded model and visual processor in a Chat helper bound to the same GPU.
chat = Chat(model, vis_processor, device=f'cuda:{args.gpu_id}')
print('Initialization Finished')

In [None]:
# ffmpeg-python provides the `ffmpeg` module imported in the next cell.
# NOTE(review): it only wraps the ffmpeg command-line tool, which presumably
# must already be installed and on PATH - confirm for the target environment.
%pip install ffmpeg-python

In [None]:
import re
import ffmpeg
import os

def extract_segment(input_file, output_file, start_time, end_time):
    """Cut the span ``start_time``..``end_time`` out of a video with ffmpeg.

    Args:
        input_file: Path of the source video.
        output_file: Path of the clip to write. An existing file is
            overwritten so the notebook can be re-run.
        start_time: Start of the clip, in seconds from the beginning of
            ``input_file``.
        end_time: End of the clip, in seconds from the beginning of
            ``input_file``. Must be greater than ``start_time``.

    Raises:
        ValueError: If ``end_time`` is not strictly greater than ``start_time``.
        ffmpeg.Error: Raised by ffmpeg-python when the ffmpeg process fails.
    """
    duration = end_time - start_time
    if duration <= 0:
        raise ValueError(
            f"end_time ({end_time}) must be greater than start_time ({start_time})"
        )

    (
        ffmpeg
        # `ss` on the input seeks before decoding, so output timestamps restart at 0.
        .input(input_file, ss=start_time)
        # `t` is the clip length; clearer than passing a duration through `to`.
        .output(output_file, t=duration)
        # Without overwrite_output ffmpeg refuses to replace (or prompts about)
        # a clip left over from a previous run.
        .run(overwrite_output=True)
    )

def parse_timestamps(llm_message):
    """Extract ``(start, end, description)`` triples from a model reply.

    Recognised segments look like ``"<start> - <end> seconds, <description>"``.
    Timestamps may be integers or decimals (``90 - 102`` as well as
    ``90.0 - 102.5``). A description ends at the end of its line or right
    before the next ``"<start> - <end> seconds"`` marker, whichever comes first.

    Args:
        llm_message: Text produced by the model. Empty or ``None`` yields ``[]``.

    Returns:
        List of ``(start_time: float, end_time: float, description: str)``
        tuples in order of appearance, with descriptions stripped of
        surrounding whitespace.
    """
    if not llm_message:
        return []

    number = r"\d+(?:\.\d+)?"
    span = rf"({number})\s*-\s*({number})\s*seconds"
    # The lookahead requires a complete "<n> - <n> seconds" marker, so numeric
    # ranges inside a description (e.g. "2.5 - 3.0 cups") do not truncate it.
    # MULTILINE lets `$` end a description at a line break, so a segment
    # followed by unrelated text on the next line is still captured.
    pattern = re.compile(
        rf"{span},\s*(.+?)(?=\s*{number}\s*-\s*{number}\s*seconds|$)",
        re.MULTILINE,
    )

    return [
        (float(match.group(1)), float(match.group(2)), match.group(3).strip())
        for match in pattern.finditer(llm_message)
    ]

def extract_segments(input_file, llm_message, output_root="segmented_videos_summ"):
    """Cut every segment described in ``llm_message`` out of ``input_file``.

    Clips are written to ``<output_root>/<video name>/output_<idx>.mp4`` and a
    one-line report per clip is printed and appended to ``segmentsummary.txt``
    in the same folder.

    Args:
        input_file: Path of the source video.
        llm_message: Model reply containing ``"start - end seconds, text"``
            segments, as understood by ``parse_timestamps``.
        output_root: Top-level folder for the results. Defaults to the folder
            that was previously hard-coded.
    """
    segments = parse_timestamps(llm_message)

    video_name = os.path.splitext(os.path.basename(input_file))[0]
    output_folder = os.path.join(output_root, video_name)
    os.makedirs(output_folder, exist_ok=True)
    summary_file = os.path.join(output_folder, "segmentsummary.txt")

    # Descriptions come from the model and may contain non-ASCII text, so the
    # summary is written as UTF-8 regardless of the platform default.
    with open(summary_file, "w", encoding="utf-8") as f:
        for idx, (start_time, end_time, description) in enumerate(segments):
            if end_time <= start_time:
                # A reversed or empty range cannot be cut; skip it instead of
                # letting one bad line abort the remaining segments.
                print(f"Skipping segment {idx+1}: invalid range {start_time} - {end_time}")
                continue

            output_file = os.path.join(output_folder, f"output_{idx}.mp4")
            extract_segment(input_file, output_file, start_time, end_time)

            report = f"Segment {idx+1}: {description} (from {start_time} to {end_time}) extracted to {output_file}"
            print(report)
            f.write(report + "\n")

# Folder that is scanned for .mp4 files to process.
input_folder = 'examples'

# Candidate instructions for the model. Only `summarization_prompt` is used by
# the processing loop in this cell; the others are kept as alternatives.

# Asks for up to 10 cooking steps with start/end times and a short description.
youcook_prompt_dvc = "You are given a cooking video from the YouCook2 dataset. Please watch the video and extract a maximum of 10 significant cooking steps. For each step, determine the starting and ending times and provide a concise description. The format should be: 'start time - end time, brief step description'. For example, ' 90 - 102 seconds, spread margarine on two slices of white bread'."
# Asks for a series of events, each as 'start - end seconds, event description'.
highlight_prompt = "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. The output format of each predicted event should be like: 'start - end seconds, event description'. A specific example is : ' 90 - 102 seconds, spread margarine on two slices of white bread in the video'."
# Asks for timestamped action/step segments without prescribing an output format.
slc_prompt = "Identify and mark the video segments corresponding to a series of actions or steps, specifying the timestamps and describing the steps."
# Asks for a list of timestamps with salient scores.
# NOTE(review): parse_timestamps only recognises 'start - end seconds, description';
# confirm that replies to this prompt actually contain segments in that form.
summarization_prompt = "Generate a summarized version of the video, focusing on extracting key frames that best represent the overall narrative. The output should be a list of timestamps in seconds and their corresponding salient scores."
# Generation settings are the same for every video, so read them once.
num_beams = args.num_beams
temperature = args.temperature

# os.listdir() returns entries in arbitrary order; sorting makes each run
# process the videos in the same, reproducible order.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".mp4"):
        continue

    video_path = os.path.join(input_folder, filename)

    # Load a 32-frame uniform sample purely for the visual preview below;
    # with return_msg=True load_video returns a (video, message) pair.
    video, _ = load_video(
        video_path=video_path,
        n_frms=32,
        sampling="uniform",
        return_msg=True
    )
    print(video.size())
    # Unpacking fails loudly if the tensor is not 4-D.
    # NOTE(review): names suggest (channels, frames, height, width) - confirm.
    C, T, H, W = video.shape
    ts.show(video.transpose(0, 1))

    # Start a fresh conversation for every video so state never leaks between files.
    img_list = []
    chat_state = conv_llava_llama_2.copy()
    chat_state.system = "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
    # The model itself is given 96 frames, independent of the 32-frame preview above.
    msg = chat.upload_video_without_audio(
        video_path=video_path,
        conv=chat_state,
        img_list=img_list,
        n_frms=96,
    )

    text_input = summarization_prompt
    print(text_input)

    chat.ask(text_input, chat_state)

    # chat.answer returns a sequence; its first element is used as the reply text.
    llm_message = chat.answer(conv=chat_state,
                              img_list=img_list,
                              num_beams=num_beams,
                              temperature=temperature,
                              max_new_tokens=300,
                              max_length=2000)[0]

    print(llm_message)

    # Cut the segments named in the reply out of the source video.
    extract_segments(video_path, llm_message)