In [1]:
import os

# Point the Hugging Face and XDG cache variables at the /scratch cache
# directory. This runs before the later cells import torch/transformers.
cache_locations = {
    "HF_HOME": "/scratch/mmm9912/hf_cache",
    "HF_DATASETS_CACHE": "/scratch/mmm9912/hf_cache/datasets",
    "XDG_CACHE_HOME": "/scratch/mmm9912/hf_cache",
}
os.environ.update(cache_locations)

# Read each value back from the environment to confirm it was applied
for name in cache_locations:
    print(f"{name}: {os.environ.get(name)}")

# CUDA allocator setting for PyTorch, passed through the environment
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

HF_HOME: /scratch/mmm9912/hf_cache
HF_DATASETS_CACHE: /scratch/mmm9912/hf_cache/datasets
XDG_CACHE_HOME: /scratch/mmm9912/hf_cache


In [2]:
import torch
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)
from qwen_vl_utils import process_vision_info


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the Qwen2.5-VL checkpoint with 8-bit quantization.
# The bare `load_in_8bit=True` keyword is deprecated (the loader warns about
# it); the supported form is a BitsAndBytesConfig passed as
# `quantization_config`.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="sequential",    # Load layers sequentially
    # Directory used for any weights that get offloaded
    offload_folder="/scratch/mmm9912/qwen25vl_offload",
)

# Pixel budget handed to the processor, expressed in multiples of 28*28 pixels.
# NOTE(review): the original comment describes these as well below the
# library defaults — confirm against the model card if that matters.
min_pixels = 128 * 28 * 28
max_pixels = 640 * 28 * 28

processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 5/5 [00:09<00:00,  1.86s/it]


In [38]:

# Conversation for the model: a system message that frames the task, followed
# by a user message carrying the image and the question.
messages = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                # Adjacent string literals are joined with no separator, so
                # every fragment except the last ends with an explicit space.
                "text": (
                    "You are an expert in video forensics and deepfake detection. "
                    "Your task is to analyze the provided image and determine whether it is from a deepfake video. "
                    "Consider multiple factors such as inconsistencies. But do not consider compression-related "
                    "artifacts, as those are only due to electronic media sharing. "
                    "Provide a structured, concise and succinct analysis with specific reasoning before concluding whether the image is real or a deepfake."
                )
            }
        ]
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "file://./FF-NT-000frame.png",
            },
            {
                "type": "text",
                # NOTE(review): this asks for reasoning and for a one-word
                # answer at the same time — decide which output is wanted.
                "text": "Tell me whether this image is a deepfake and outline your reasoning. Your answer should be either REAL or DEEPFAKE; nothing else."
            }
        ]
    }
]

In [39]:
# Render the conversation through the processor's chat template as a plain
# string (not token ids), with the generation prompt appended.
text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Pull the image/video entries out of the messages
image_inputs, video_inputs = process_vision_info(messages)

# Build the padded PyTorch batch, then move it to the model's device
batch = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = batch.to(model.device)


In [40]:
# Run generation with gradient tracking disabled; the reply is capped at
# four newly generated tokens.
generation_kwargs = {"max_new_tokens": 4}
with torch.no_grad():
    generated_ids = model.generate(**inputs, **generation_kwargs)


In [41]:
# generate() output begins with the prompt tokens, so drop the first
# len(prompt) ids from each sequence to keep only the newly generated part.
prompt_lengths = [len(prompt_ids) for prompt_ids in inputs.input_ids]
generated_ids_trimmed = [
    sequence[length:] for sequence, length in zip(generated_ids, prompt_lengths)
]

# Turn the remaining token ids back into strings
output_text = processor.batch_decode(
    generated_ids_trimmed,
    clean_up_tokenization_spaces=False,
    skip_special_tokens=True,
)

print("Model output:", output_text[0])


Model output: REAL
