In [1]:
import cv2
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import torchvision.transforms as transforms
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Trích xuất frames từ video
def extract_frames(video_path, frame_interval=10):
    video = cv2.VideoCapture(video_path)
    count_frames = 0

    if not os.path.exists('frames'):
        os.makedirs('frames')
    
    while video.isOpened():
        ret, frame = video.read()
        
        if not ret:
            break
        
        if count_frames % frame_interval == 0:
            # cv2.imshow('Output: ', frame)
            cv2.imwrite(f'frames/frame_{count_frames}.jpg', frame)
        count_frames += 1

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
    video.release()
    cv2.destroyAllWindows()

In [3]:
# Mô tả từng khung hình bằng model Florence-2
def describe_frames(frames, florence_processor, florence_model):
    task_prompt = "<MORE_DETAILED_CAPTION>"
    frame_descriptions = []
    for frame in frames:
        image = Image.open(os.path.join('frames', frame))

        # Generate content   
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = florence_processor(text=task_prompt, images=image, return_tensors="pt").to("cuda:0")
        generated_ids = florence_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3
        )
        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = florence_processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
        frame_descriptions.append(parsed_answer)
    
    return frame_descriptions

Test multi processor

In [4]:
from multiprocessing import Pool


# Hàm mô tả một khung hình, tạo lại mô hình trong mỗi tiến trình
def describe_frame_single(frame, task_prompt):
    # Tạo lại processor và model trong mỗi tiến trình
    florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True).eval().cuda()
    florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True)

    image = Image.open(os.path.join('frames', frame))

    # Generate content   
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = florence_processor(text=task_prompt, images=image, return_tensors="pt").to("cuda:0")
    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = florence_processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
    
    return parsed_answer

# Hàm đa tiến trình sử dụng Pool
def describe_frames_multiprocessing_pool(frames, num_processes=4):
    task_prompt = "<MORE_DETAILED_CAPTION>"
    
    with Pool(processes=num_processes) as pool:
        # Sử dụng Pool để xử lý các frame song song
        results = pool.starmap(describe_frame_single, [(frame, task_prompt) for frame in frames])
    
    return results


In [5]:
# Tổng hợp thông tin bằng BART 
def summarize_descriptions(descriptions, bart_model, bart_tokenizer):
    # Ghép nối các mô tả thành một đoạn văn bản
    combined_text = " ".join([desc["<MORE_DETAILED_CAPTION>"] for desc in descriptions])

    # Tokenizer và tạo input cho model BART
    inputs = bart_tokenizer.encode("summarize: " + combined_text, return_tensors="pt", max_length=1024, truncation=True)
    
    # Dự đoán tóm tắt
    summary_ids = bart_model.generate(inputs, max_length=200, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

In [6]:
# PIPELINE
def summarize_video_content(video_path, frame_rate=1):
    global florence_processor, florence_model, bart_model, bart_tokenizer

    # Xoá folder frame hiện tại
    if os.path.exists('frames'):
        for frame in os.listdir('frames'):
            os.remove(os.path.join('frames', frame))
    
    extract_frames(video_path, frame_rate)
    frames = os.listdir('frames')
    descriptions = describe_frames(frames, florence_processor, florence_model) 
    # descriptions = describe_frames_multiprocessing_pool(frames, num_processes=4)
    summary = summarize_descriptions(descriptions, bart_model, bart_tokenizer)
    return summary

In [7]:
# Load model
florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True).eval().cuda()
florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base-ft', trust_remote_code=True)
bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [10]:
video_path = r'videos\\video_2.mp4'
summary = summarize_video_content(video_path, frame_rate=10)

In [11]:
print(summary)

A green parrot with a red beak is sitting on a branch of a tree. There are green leaves on the branches of the tree. The parrot has a blue body and a red head. The background of the image is blurred.
