In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/classes/istockphoto-1130206284-640_adpp_is.mp4
/kaggle/input/classes/istockphoto-1164175351-640_adpp_is.mp4
/kaggle/input/classes/istockphoto-1056900582-640_adpp_is.mp4
/kaggle/input/classes/3998516-uhd_4096_2160_25fps.mp4
/kaggle/input/classes/istockphoto-1413752945-640_adpp_is.mp4
/kaggle/input/classes/istockphoto-2165346142-640_adpp_is.mp4
/kaggle/input/classes/2519660-uhd_3840_2160_24fps.mp4
/kaggle/input/classes/istockphoto-1264336655-640_adpp_is.mp4


In [30]:
# Path of the clip that every generate_video_summary(...) call in this notebook is run on.
video_path="/kaggle/input/classes/3998516-uhd_4096_2160_25fps.mp4"

In [32]:
# Install required packages
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q accelerate qwen-vl-utils av

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint to load: the 2B instruct variant, picked for faster inference.
model_id = "Qwen/Qwen2-VL-2B-Instruct"

# Loading options kept together so they are easy to tweak:
# float16 weights, with device placement left to device_map="auto".
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, **model_load_options)

# Matching processor (chat template, tokenisation, video preprocessing) for the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id)
def generate_video_summary(video_path, classification_info=""):
    """
    Ask the loaded vision-language model for a text summary of one video.

    Args:
        video_path: Location of the video; sent as the "video" entry of the chat message.
        classification_info: Optional label text. When non-empty it is embedded in the
            prompt as ground truth that the answer is told to repeat; when empty a
            short free-form summary is requested instead.

    Returns:
        The decoded reply for the single request, with prompt tokens and special
        tokens removed. Decoding is sampled, so repeated calls may differ.

    Relies on the module-level `model`, `processor` and `process_vision_info`.
    """
    if not classification_info:
        # No labels supplied: plain short-summary request.
        text_prompt = (
            "Summarize the video briefly (2–3 lines). "
            "Include details."
        )
    else:
        # Labels supplied: instruct the model to treat them as ground truth.
        text_prompt = (
            "Treat the following classification information as ABSOLUTE TRUTH. "
            "You are NOT allowed to reinterpret it, modify it, or ignore it.\n\n"
            f"CLASSIFICATION (GROUND TRUTH):\n{classification_info}\n\n"
            "You MUST include the following in your answer:\n"
            "- The classification EXACTLY as given.\n"
            "- The fine-grained class EXACTLY as given.\n"
            "- A description of what happens in the video.\n"
            "Do not omit or alter any required element."
        )

    # One user turn: the video first, then the instruction text.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "video": video_path},
            {"type": "text", "text": text_prompt},
        ],
    }
    messages = [user_turn]

    # Render the chat prompt and extract the visual inputs from the same messages.
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    batch = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = batch.to(model.device)

    # Sampling configuration for the reply.
    sampling_options = dict(
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    with torch.no_grad():
        sequences = model.generate(**inputs, **sampling_options)

    # Drop the first len(prompt) tokens of every returned sequence so that only
    # the newly generated part is decoded.
    continuations = [
        full_sequence[len(prompt_tokens):]
        for prompt_tokens, full_sequence in zip(inputs.input_ids, sequences)
    ]

    decoded = processor.batch_decode(
        continuations,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]

# Example 1: no classification context is passed.
print("=== Summary without classification context ===")
summary1 = generate_video_summary(video_path)
print(summary1, end="\n\n")  # summary followed by one blank separator line

# Example 2: normal-activity labels are passed as classification context.
classification_result = "Model Classification: Normal Activity\nFine-grained Class: Haircut"
print("=== Summary with normal classification ===")
summary2 = generate_video_summary(video_path, classification_info=classification_result)
print(summary2, end="\n\n")  # summary followed by one blank separator line


=== Summary without classification context ===
The video shows a man getting his beard trimmed with an electric razor by a barber in a barber shop. The barber is wearing a white shirt with a decorative bracelet on his wrist. The background features various barber shop equipment and lighting.

=== Summary with normal classification ===
The classification EXACTLY as given is "Normal Activity".
The fine-grained class EXACTLY as given is "Haircut".
In the video, a barber is using an electric razor to trim a man's beard. The barber is focused on the task, and the man is seated in a chair. The background shows various barber shop equipment and tools.



In [33]:
def generate_video_summary(video_path, classification_info=""):
    """
    Describe a video and report externally supplied classification labels.

    When `classification_info` carries text, the prompt asks the model to copy the
    'Model Classification' and 'Fine-grained Class' values into a fixed layout:

        Classification: ...
        Fine-grained Class: ...
        Video Description: ...

    When `classification_info` is empty, None or whitespace-only there is nothing
    to copy, so a plain summary prompt is used instead of asking the model to
    extract labels from an empty block (which invites invented labels).

    Args:
        video_path: Location of the video; sent as the "video" entry of the chat message.
        classification_info: Optional label text, expected to contain
            "Model Classification: ..." and "Fine-grained Class: ..." lines.

    Returns:
        The decoded reply for the single request, with prompt tokens and special
        tokens removed. Decoding is sampled (do_sample=True), so repeated calls may
        differ, and the prompt alone cannot guarantee that labels are copied verbatim.

    Relies on the module-level `model`, `processor` and `process_vision_info`.
    """
    # Only request label extraction when there is label text to extract.
    has_labels = classification_info is not None and str(classification_info).strip() != ""

    if has_labels:
        text_prompt = (
            "You MUST treat the following classification results as GROUND TRUTH. "
            "You MUST extract them EXACTLY and place them in the correct fields below.\n\n"
            f"{classification_info}\n\n"
            "Extract the following EXACT fields:\n"
            "- 'Model Classification' → goes into the 'Classification' field\n"
            "- 'Fine-grained Class' → goes into the 'Fine-grained Class' field\n\n"
            "Now output in EXACTLY this format:\n\n"
            "Classification: <insert Model Classification>\n"
            "Fine-grained Class: <insert Fine-grained Class>\n"
            "Video Description: <describe what happens in the video>\n"
            "Do NOT swap the fields. Do NOT rewrite the labels. Do NOT change wording."
        )
    else:
        text_prompt = (
            "Summarize the video briefly (2–3 lines). "
            "Include details."
        )

    # Single user turn: the video first, then the instruction text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the chat prompt and pull the visual inputs out of the same messages.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate the reply without tracking gradients.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Strip the first len(prompt) tokens from each sequence so only new tokens are decoded.
    generated_ids = [
        full_sequence[len(prompt_tokens):]
        for prompt_tokens, full_sequence in zip(inputs.input_ids, output_ids)
    ]

    summary = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return summary


# Example 2: normal-activity labels are passed as classification context.
classification_result = "Model Classification: Normal Activity\nFine-grained Class: Haircut"
print("=== Summary with normal classification ===")
summary2 = generate_video_summary(video_path, classification_info=classification_result)
print(summary2, end="\n\n")  # summary followed by one blank separator line

# Example 3: With abnormal classification
# print("=== Summary with abnormal classification ===")
# classification_result = "Model Classification: Abnormal Activity\nAbnormal Type: Burglary"
# summary3 = generate_video_summary(video_path, classification_result)
# print(summary3)

=== Summary with normal classification ===
Classification: Normal Activity
Fine-grained Class: Haircut
Video Description: A man is getting his hair cut by a barber in a barber shop. The barber is using a hair clipper to trim the man's hair and beard.



In [36]:

def generate_video_summary(video_path, classification_info=""):
    """
    Describe a video while restating externally supplied classification text.

    When `classification_info` carries text, the prompt asks the model to repeat
    that text unchanged and to describe the video; the model is allowed to comment
    on whether the footage agrees with the labels. When `classification_info` is
    empty, None or whitespace-only there is no classification to restate, so a
    plain summary prompt is used instead.

    Args:
        video_path: Location of the video; sent as the "video" entry of the chat message.
        classification_info: Optional label text to be restated in the answer.

    Returns:
        The decoded reply for the single request, with prompt tokens and special
        tokens removed. Decoding is sampled (do_sample=True), so repeated calls may
        differ, and the prompt alone cannot guarantee that the labels appear in the reply.

    Relies on the module-level `model`, `processor` and `process_vision_info`.
    """
    # Only ask for the classification to be restated when there is one.
    has_labels = classification_info is not None and str(classification_info).strip() != ""

    if has_labels:
        text_prompt = (
            "Follow the classification information EXACTLY as provided. "
            "You may comment on whether the video content agrees with it, but NEVER alter the classification text.\n\n"
            f"Classification Provided:\n{classification_info}\n\n"
            "Your output must include:\n"
            "1. The provided classification as given.\n"
            "2. A clear description of the video's content.\n"
        )
    else:
        text_prompt = (
            "Summarize the video briefly (2–3 lines). "
            "Include details."
        )

    # Single user turn: the video first, then the instruction text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the chat prompt and pull the visual inputs out of the same messages.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate the reply without tracking gradients.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Strip the first len(prompt) tokens from each sequence so only new tokens are decoded.
    generated_ids = [
        full_sequence[len(prompt_tokens):]
        for prompt_tokens, full_sequence in zip(inputs.input_ids, output_ids)
    ]

    summary = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return summary



# Example 2: normal-activity labels are passed as classification context.
classification_result = "Model Classification: Normal Activity\nFine-grained Class: Haircut"
print("=== Summary with normal classification ===")
summary2 = generate_video_summary(video_path, classification_info=classification_result)
print(summary2, end="\n\n")  # summary followed by one blank separator line

# Example 3: With abnormal classification
# print("=== Summary with abnormal classification ===")
# classification_result = "Model Classification: Abnormal Activity\nAbnormal Type: Burglary"
# summary3 = generate_video_summary(video_path, classification_result)
# print(summary3)

=== Summary with normal classification ===
The video shows a close-up of a barber using an electric razor to trim a man's beard. The background is blurred, focusing attention on the barber's hands and the man's hair and beard. The setting appears to be a barber shop or a similar hair salon.



In [17]:

# def generate_video_summary(video_path, classification_info=""):
#     """
#     Generate summary for a video with optional classification context.
#     The model is explicitly forced to use the provided classification info exactly as given.
#     """

    
#     text_prompt = (
#         "Incorporate the following classification into your summary. "
#         "Do not change the labels, but you may expand on them.\n\n"
#         f"Classification:\n{classification_info}\n\n"
#         "Explain what is happening in the video and indicate whether the visual content "
#         "supports or contradicts the classification."
#     )




#     # Prepare messages
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "video", "video": video_path},
#                 {"type": "text", "text": text_prompt},
#             ],
#         }
#     ]
    
#     # Process inputs
#     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     image_inputs, video_inputs = process_vision_info(messages)
    
#     inputs = processor(
#         text=[text],
#         images=image_inputs,
#         videos=video_inputs,
#         padding=True,
#         return_tensors="pt"
#     ).to(model.device)
    
#     # Generate response
#     with torch.no_grad():
#         output_ids = model.generate(
#             **inputs,
#             max_new_tokens=256,
#             do_sample=True,
#             temperature=0.7,
#             top_p=0.9
#         )
    
#     # Decode output
#     generated_ids = [
#         output_ids[len(input_ids):]
#         for input_ids, output_ids in zip(inputs.input_ids, output_ids)
#     ]
    
#     summary = processor.batch_decode(
#         generated_ids,
#         skip_special_tokens=True,
#         clean_up_tokenization_spaces=True
#     )[0]
    
#     return summary



# # Example 2: With normal classification
# print("=== Summary with normal classification ===")
# classification_result = "Model Classification: Normal Activity\nFine-grained Class: Haircut"
# summary2 = generate_video_summary(video_path, classification_result)
# print(summary2)
# print()

# # Example 3: With abnormal classification
# # print("=== Summary with abnormal classification ===")
# # classification_result = "Model Classification: Abnormal Activity\nAbnormal Type: Burglary"
# # summary3 = generate_video_summary(video_path, classification_result)
# # print(summary3)

In [35]:

def generate_video_summary(video_path, classification_info=""):
    """
    Describe a video in a two-section reply: CLASSIFICATION and VIDEO SUMMARY.

    When `classification_info` carries text, the prompt asks the model to copy it
    into section 1 and describe the video in section 2. The section count stated
    in the prompt matches the two sections actually listed. When
    `classification_info` is empty, None or whitespace-only there is nothing to
    copy, so a plain summary prompt is used instead.

    Args:
        video_path: Location of the video; sent as the "video" entry of the chat message.
        classification_info: Optional label text to be copied into the CLASSIFICATION section.

    Returns:
        The decoded reply for the single request, with prompt tokens and special
        tokens removed. Decoding is sampled (do_sample=True), so repeated calls may
        differ, and the prompt alone cannot guarantee that every label line is copied.

    Relies on the module-level `model`, `processor` and `process_vision_info`.
    """
    # Only ask for the classification to be copied when there is one.
    has_labels = classification_info is not None and str(classification_info).strip() != ""

    if has_labels:
        text_prompt = (
            "You MUST copy the classification information EXACTLY as given, without any changes.\n\n"
            f"{classification_info}\n\n"
            # The prompt lists exactly two sections, so it must ask for two.
            "Your response MUST contain ONLY the following 2 sections:\n\n"
            "1. CLASSIFICATION (copy EXACTLY from above)\n"
            "2. VIDEO SUMMARY (describe the video)\n"
            "Do NOT change section titles. Do NOT rewrite classification. Follow the format EXACTLY."
        )
    else:
        text_prompt = (
            "Summarize the video briefly (2–3 lines). "
            "Include details."
        )

    # Single user turn: the video first, then the instruction text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    # Render the chat prompt and pull the visual inputs out of the same messages.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generate the reply without tracking gradients.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Strip the first len(prompt) tokens from each sequence so only new tokens are decoded.
    generated_ids = [
        full_sequence[len(prompt_tokens):]
        for prompt_tokens, full_sequence in zip(inputs.input_ids, output_ids)
    ]

    summary = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]

    return summary



# Example 2: normal-activity labels are passed as classification context.
classification_result = "Model Classification: Normal Activity\nFine-grained Class: Haircut"
print("=== Summary with normal classification ===")
summary2 = generate_video_summary(video_path, classification_info=classification_result)
print(summary2, end="\n\n")  # summary followed by one blank separator line



=== Summary with normal classification ===
1. CLASSIFICATION: Normal Activity
2. VIDEO SUMMARY: The video captures a close-up of a barber using an electric razor to trim a client's beard in a well-lit barbershop. The focus is on the barber's hands and the razor, with the background showing various barber tools and equipment.

