In [1]:
import os

# Only allow CUDA device(s); disable CPU fallback
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # or "0,1" for multiple GPUs

# Set all cache directories explicitly
scratch_root = "/scratch/mmm9912"

os.environ["HF_HOME"] = "/scratch/mmm9912/cache"
os.environ["TORCH_HOME"] = "/scratch/mmm9912/cache/torch"
os.environ["TFHUB_CACHE_DIR"] = "/scratch/mmm9912/cache/tensorflow"
os.environ["XDG_CACHE_HOME"] = "/scratch/mmm9912/cache"
os.environ["HF_DATASETS_CACHE"] = "/scratch/mmm9912/cache/huggingface_datasets"
os.environ["PIP_CACHE_DIR"] = "/scratch/mmm9912/cache/pip"

cache_dir = "/scratch/mmm9912/cache"

gcc_path = "/share/apps/NYUAD5/gcc/9.2.0/bin/gcc"
gcc_dir = os.path.dirname(gcc_path)

os.environ["CC"] = gcc_path
os.environ["CXX"] = f"{gcc_dir}/g++"  # Set C++ compiler as well

print("CC set to:", os.environ["CC"])
print("CXX set to:", os.environ["CXX"])

from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from vllm import LLM, SamplingParams
import torch

# Blow up if any tensor goes to CPU
torch.set_default_tensor_type(torch.cuda.FloatTensor)

import time
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

torch._dynamo.config.suppress_errors = True

from prompting_utils_local import *

model_paths = [
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16",  # 0
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8",   # 1
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4",   # 2

    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16",     # 3
    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8",      # 4
    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4",      # 5

    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16",     # 6
    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8",      # 7
    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4",      # 8

    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16",    # 9
    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8",     # 10
    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4"      # 11
]

quantized_model_path = model_paths[11]

# Initialize the tokenizer from the quantized model directory
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, cache_dir=cache_dir)

# Define the default decoding hyperparameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=512  # Maximum tokens for generation
)

# Build the LLM configuration in the same way as in your memory measurement code.
is_quantized = any(x in quantized_model_path for x in ['int4', 'int8'])
llm_config = {
                "model": quantized_model_path,
                "dtype": "float16",
                "quantization": "gptq" if is_quantized else None,
                "enforce_eager": True,
                "tensor_parallel_size": 1,
                "trust_remote_code": True,
                "max_model_len": 2048,
                "download_dir": cache_dir
            }

# Instantiate the LLM using the configuration dictionary.
llm = LLM(**llm_config)

# Now you can use `llm` and `tokenizer` for inference.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def plot_gradcams(gradcam_info_list):
    """
    Plots Grad-CAM overlays in a grid if there are 4 or 16 images.
    Overlays each image with:
      Frame: <frame_index>
      Conf: <confidence in 2 decimals>
      <message>

    gradcam_info_list: A list of dictionaries, each with keys:
       - 'overlay_path': str, path to the saved Grad-CAM image
       - 'confidence': float, predicted forging confidence
       - 'message': str, short verdict message
       - 'frame_index': int, the extracted frame index from the video
       ... other keys are ignored

    If len(gradcam_info_list) != 4 and != 16, we print a note and do not plot in multi-grid.
    """

    n = len(gradcam_info_list)
    if n not in [4, 16]:
        print(f"[plot_gradcams] Received {n} Grad-CAM images. No 2x2 or 4x4 plot made.")
        return

    # Determine rows/cols
    if n == 4:
        rows, cols = 2, 2
        fig_size = (8, 8)
    else:  # n == 16
        rows, cols = 4, 4
        fig_size = (12, 12)

    fig, axes = plt.subplots(rows, cols, figsize=fig_size)
    axes = axes.flatten()  # Make it a flat iterable

    for ax, info in zip(axes, gradcam_info_list):
        # Retrieve data from the dictionary
        path = info.get("overlay_path", "")
        conf = info.get("confidence", 0.0)
        msg = info.get("message", "")
        frame_idx = info.get("frame_index", -1)

        # Load and display the Grad-CAM image
        img = mpimg.imread(path)
        ax.imshow(img)
        ax.axis('off')

        # Build the label text in multiple lines
        # e.g.:
        #   Frame: 25
        #   Conf: 36.61
        #   UCF model did not detect forgery.
        label_text = (
            f"Frame: {frame_idx}\n"
            f"Conf: {conf:.2f}\n"
            f"{msg}"
        )

        # Place a text box in the lower-right corner
        ax.text(
            0.95, 0.05, label_text,
            transform=ax.transAxes,
            ha="right",
            va="bottom",
            color="black",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
        )

    plt.tight_layout()
    plt.show()


def run_single_inference():
    """
    Runs a single inference cycle with:
      - A hardcoded system prompt containing deepfake-detection instructions.
      - A hardcoded user message describing the user's query.
      - One pass of LLM generation + analysis.
      - If needed, runs the expert model (consult_expert_model), which now returns:
          [summary_string, dict_for_frame_1, dict_for_frame_2, ...]
      - The final LLM pass occurs after we have the expert feedback.
      - Prints the final LLM output, then plots Grad-CAM overlays
        if there are exactly 4 or 16 of them, using the
        'frame_index', 'confidence', and 'message' fields in each dictionary.
    """

    # Hardcoded system message
    system_message = {
        "role": "system",
        "content": (
            "You are a highly specialized Large Language Model serving as the core of a Deepfake Detection Agent System. "
            "Your job is to interpret user queries and, when necessary, defer analysis to dedicated expert models. "
            "Always respond with a single, valid JSON object exactly matching the schema below—no additional text, "
            "markdown, or commentary is allowed.\n\n"
            "Schema:\n"
            "{\n"
            "  \"direct_answer_to_frontend\": \"<string>\",\n"
            "  \"consult_expert_model\": {\n"
            "    \"expert_model_name\": \"<string or null>\",\n"
            "    \"video_path\": \"<string or null>\",\n"
            "    \"number_of_frames\": <integer>\n"
            "  }\n"
            "}\n\n"
            "Instructions:\n"
            "1. For queries related to deepfake analysis (e.g., those requesting analysis via SPSL, UCF, or Xception):\n"
            "   - Set \"direct_answer_to_frontend\" to an empty string.\n"
            "   - In \"consult_expert_model\":\n"
            "     * Set \"expert_model_name\" to:\n"
            "       - \"spsl\" for frequency inconsistencies\n"
            "       - \"ucf\" for spatial inconsistencies\n"
            "       - \"xception\" for general inconsistencies\n"
            "     * Populate \"video_path\" as provided by the query.\n"
            "     * Set \"number_of_frames\" to 4 by default, or to 16 if the user requests a more in-depth analysis.\n"
            "2. For all other queries:\n"
            "   - Provide your answer in \"direct_answer_to_frontend\".\n"
            "   - Set \"consult_expert_model\" to null.\n"
            "3. If the appropriate expert model is unclear from the query, ask the user to specify which expert they "
            "   would like to use before proceeding.\n"
            "3.1 Only ever populate one of the two fields: direct_answer_to_frontend or consult_expert_model. "
            "   Never populate both.\n"
            "4. When deferring to an expert model, do not include any of your own analysis or reasoning—simply output "
            "   the JSON object as specified and wait for the expert's response. Once you receive the expert's response, "
            "   synthesize the information and present it inside direct_answer_to_frontend.\n"
            "   Answer key: 0-50% confidence is low, 51-75% is weak, 76-85% is medium, 86-100% is strong.\n"
            "Finally, the user does not understand model names. The user only understands 'frequency expert', "
            "'spatial expert', 'naïve detector'. After introducing yourself, always explicitly ask the user to "
            "upload a video, then to choose among these detectors.\n"
            "If they ask for model details, supply them: frequency=SPSL, spatial=UCF, naïve=Xception. "
            "You only serve to direct manual forensic verification using XAI principles.\n\n"
            "Follow these rules precisely."
        )
    }

    # Hardcoded user message
    user_message_str = (
        "Hi, I'd like to see if this video might be deepfaked. I want a more more thorough analysis with 16 frames. "
        "Use the 'frequency expert' please. Here's the path: "
        "/scratch/mmm9912/Capstone/dlm-repo/capstone_code/QWEN25VL-main/videos/0700f656-f4d4-481b-bd9c-edd354f85e2c.mp4"
    )
    user_message = format_user_or_assistant_message("user", user_message_str)

    # Build the message array for the LLM
    messages = [system_message, user_message]

    # STEP 1: Generate the initial LLM response
    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print("[run_single_inference] Generating initial LLM response...")
    outputs = llm.generate([text_input], sampling_params)
    generated_text = outputs[0].outputs[0].text
    print("[run_single_inference] LLM response received.")

    # STEP 2: Process the LLM response
    print("[run_single_inference] Parsing and handling LLM output...")
    handle_result = handle_llm_response(generated_text)

    # STEP 3: Determine if we have a direct answer or an expert analysis result
    final_answer = ""
    gradcam_info_list = []

    if isinstance(handle_result, str):
        # Direct answer
        final_answer = handle_result
        print("[run_single_inference] LLM provided a direct answer with no expert model.")
    else:
        # handle_result => [summary_string, dict_1, dict_2, ...]
        # The first item is the summary; the rest are frame dictionaries with overlay info.
        expert_feedback = handle_result[0]
        print("[run_single_inference] Received expert feedback. Adding as system message...")

        # Provide the expert feedback back to the LLM (2nd pass)
        messages.append(format_expert_message(expert_feedback))
        text_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        print("[run_single_inference] Generating final LLM response after expert feedback...")
        outputs = llm.generate([text_input], sampling_params)
        final_text = outputs[0].outputs[0].text
        final_result = handle_llm_response(final_text)

        if isinstance(final_result, str):
            final_answer = final_result
        else:
            # In unusual cases, you might get another chain. We'll assume only the first item is the final summary.
            final_answer = final_result[0]

        # Now gather the gradcam dictionaries: each entry includes frame index, overlay path, confidence, etc.
        gradcam_dicts = handle_result[1:]
        for entry in gradcam_dicts:
            # e.g. {"frame_index": 25, "overlay_path": "...", "confidence": 36.61, "message":"..."}
            gradcam_info_list.append(entry)

        print(f"[run_single_inference] Expert feedback indicates {len(gradcam_info_list)} Grad-CAM images.")

    # STEP 4: Final LLM answer, removing any leftover "Uploaded media: video" text
    if "Uploaded media: video" in final_answer:
        final_answer = final_answer.split("Uploaded media: video")[0].strip()

    print("\n[run_single_inference] FINAL LLM ANSWER:\n", final_answer)

    # STEP 5: Plot Grad-CAM overlays if we have 4 or 16
    if gradcam_info_list:
        plot_gradcams(gradcam_info_list)
    else:
        print("[run_single_inference] No Grad-CAM images to display (none returned).")


# Execute a single inference pass
run_single_inference()


CC set to: /share/apps/NYUAD5/gcc/9.2.0/bin/gcc
CXX set to: /share/apps/NYUAD5/gcc/9.2.0/bin/g++


  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
  _C._set_default_tensor_type(t)


INFO 04-15 16:35:29 __init__.py:207] Automatically detected platform cuda.
INFO 04-15 16:35:35 config.py:549] This model supports multiple tasks: {'reward', 'score', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 04-15 16:35:36 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4', speculative_config=None, tokenizer='/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir='/scratch/mmm9912/cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 16:35:44 model_runner.py:1115] Loading model weights took 9.2680 GB
INFO 04-15 16:35:45 worker.py:267] Memory profiling takes 0.90 seconds
INFO 04-15 16:35:45 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 16:35:45 worker.py:267] model weights take 9.27GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.41GiB; the rest of the memory reserved for KV Cache is 60.47GiB.
INFO 04-15 16:35:46 executor_base.py:111] # cuda blocks: 20639, # CPU blocks: 1365
INFO 04-15 16:35:46 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 161.24x
INFO 04-15 16:35:47 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 3.13 seconds
[run_single_inference] Generating initial LLM response...


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it, est. speed input: 226.93 toks/s, output: 36.87 toks/s]


[run_single_inference] LLM response received.
[run_single_inference] Parsing and handling LLM output...
[consult_expert_model] Starting expert analysis with model 'spsl'.
[consult_expert_model] Storing Grad-CAM overlays in: /scratch/mmm9912/Capstone/dlm-repo/capstone_code/QWEN25VL-main/storage/gradcams/600c31ca
[consult_expert_model] Number of frames extracted: 16


RuntimeError: [wrapped_run_inference_on_images] Command failed:
Error while loading conda entry point: conda-libmamba-solver (/lib64/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /home/mmm9912/miniconda3/lib/python3.12/site-packages/libmambapy/bindings.cpython-312-x86_64-linux-gnu.so))
Traceback (most recent call last):
  File "Inference_wrapper_function_ruilin.py", line 272, in <module>
    manual_seed=42,
  File "Inference_wrapper_function_ruilin.py", line 77, in run_inference_on_images_with_old_preprocess
    manual_seed=manual_seed,
  File "Inference_wrapper_function_ruilin.py", line 116, in run_inference_on_images_with_old_preprocess_core
    model = model_class(config).to(device)
  File "/scratch/mmm9912/Capstone/dlm-repo/capstone_code/DeepfakeBench-main/training/detectors/spsl_detector.py", line 62, in __init__
    self.backbone = self.build_backbone(config)
  File "/scratch/mmm9912/Capstone/dlm-repo/capstone_code/DeepfakeBench-main/training/detectors/spsl_detector.py", line 239, in build_backbone
    state_dict = torch.load(config['pretrained'])
  File "/home/mmm9912/miniconda3/envs/DeepfakeBench/lib/python3.7/site-packages/torch/serialization.py", line 771, in load
    with _open_file_like(f, 'rb') as opened_file:
  File "/home/mmm9912/miniconda3/envs/DeepfakeBench/lib/python3.7/site-packages/torch/serialization.py", line 270, in _open_file_like
    return _open_file(name_or_buffer, mode)
  File "/home/mmm9912/miniconda3/envs/DeepfakeBench/lib/python3.7/site-packages/torch/serialization.py", line 251, in __init__
    super(_open_file, self).__init__(open(name, mode))
FileNotFoundError: [Errno 2] No such file or directory: './pretrained/xception-b5690688.pth'


In [5]:
model_paths = [
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16",  # 0
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8",   # 1
    "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4",   # 2

    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16",     # 3
    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8",      # 4
    "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4",      # 5

    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16",     # 6
    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8",      # 7
    "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4",      # 8

    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16",    # 9
    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8",     # 10
    "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4"      # 11
]

quantized_model_path = model_paths[0]

# Initialize the tokenizer from the quantized model directory
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, cache_dir=cache_dir)

# Define the default decoding hyperparameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=512  # Maximum tokens for generation
)

# Build the LLM configuration in the same way as in your memory measurement code.
is_quantized = any(x in quantized_model_path for x in ['int4', 'int8'])
llm_config = {
    "model": quantized_model_path,
    "dtype": "float16",
    "quantization": "gptq" if is_quantized else None,
    "tensor_parallel_size": 1,         # Use 4 for 14B variants as needed
    "trust_remote_code": True,
    "max_model_len": 2048,
    "download_dir": cache_dir
}
if is_quantized:
    llm_config.update({
        "gptq_group_size": 256,
        "gptq_desc_act": True
    })

# Instantiate the LLM using the configuration dictionary.
llm = LLM(**llm_config)

# Now you can use `llm` and `tokenizer` for inference.


INFO 04-15 13:45:25 config.py:549] This model supports multiple tasks: {'embed', 'generate', 'reward', 'score', 'classify'}. Defaulting to 'generate'.
INFO 04-15 13:45:25 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16', speculative_config=None, tokenizer='/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir='/scratch/mmm9912/cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=

OutOfMemoryError: CUDA out of memory. Tried to allocate 70.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 46.06 MiB is free. Including non-PyTorch memory, this process has 79.08 GiB memory in use. Of the allocated memory 77.45 GiB is allocated by PyTorch, with 60.00 MiB allocated in private pools (e.g., CUDA Graphs), and 19.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
    user_message_str = (
        "Hi, I'd like to see if this video might be deepfaked. I want a more thorough analysis with 16 frames. "
        "Use the 'naive expert' please. Here's the path: "
        "/scratch/mmm9912/Capstone/dlm-repo/capstone_code/QWEN25VL-main/videos/0700f656-f4d4-481b-bd9c-edd354f85e2c.mp4"
    )

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def plot_gradcams(gradcam_info_list):
    """
    Plots Grad-CAM overlays in a grid if there are 4 or 16 images.
    Overlays each image with:
      Frame: <frame_index>
      Conf: <confidence in 2 decimals>
      <message>

    gradcam_info_list: A list of dictionaries, each with keys:
       - 'overlay_path': str, path to the saved Grad-CAM image
       - 'confidence': float, predicted forging confidence
       - 'message': str, short verdict message
       - 'frame_index': int, the extracted frame index from the video
       ... other keys are ignored

    If len(gradcam_info_list) != 4 and != 16, we print a note and do not plot in multi-grid.
    """

    n = len(gradcam_info_list)
    if n not in [4, 16]:
        print(f"[plot_gradcams] Received {n} Grad-CAM images. No 2x2 or 4x4 plot made.")
        return

    # Determine rows/cols
    if n == 4:
        rows, cols = 2, 2
        fig_size = (8, 8)
    else:  # n == 16
        rows, cols = 4, 4
        fig_size = (12, 12)

    fig, axes = plt.subplots(rows, cols, figsize=fig_size)
    axes = axes.flatten()  # Make it a flat iterable

    for ax, info in zip(axes, gradcam_info_list):
        # Retrieve data from the dictionary
        path = info.get("overlay_path", "")
        conf = info.get("confidence", 0.0)
        msg = info.get("message", "")
        frame_idx = info.get("frame_index", -1)

        # Load and display the Grad-CAM image
        img = mpimg.imread(path)
        ax.imshow(img)
        ax.axis('off')

        # Build the label text in multiple lines
        # e.g.:
        #   Frame: 25
        #   Conf: 36.61
        #   UCF model did not detect forgery.
        label_text = (
            f"Frame: {frame_idx}\n"
            f"Conf: {conf:.2f}\n"
            f"{msg}"
        )

        # Place a text box in the lower-right corner
        ax.text(
            0.95, 0.05, label_text,
            transform=ax.transAxes,
            ha="right",
            va="bottom",
            color="black",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
        )

    plt.tight_layout()
    plt.show()


def run_single_inference():
    """
    Runs a single inference cycle with:
      - A hardcoded system prompt containing deepfake-detection instructions.
      - A hardcoded user message describing the user's query.
      - One pass of LLM generation + analysis.
      - If needed, runs the expert model (consult_expert_model), which now returns:
          [summary_string, dict_for_frame_1, dict_for_frame_2, ...]
      - The final LLM pass occurs after we have the expert feedback.
      - Prints the final LLM output, then plots Grad-CAM overlays
        if there are exactly 4 or 16 of them, using the
        'frame_index', 'confidence', and 'message' fields in each dictionary.
    """

    # Hardcoded system message
    system_message = {
        "role": "system",
        "content": (
            "You are a highly specialized Large Language Model serving as the core of a Deepfake Detection Agent System. "
            "Your job is to interpret user queries and, when necessary, defer analysis to dedicated expert models. "
            "Always respond with a single, valid JSON object exactly matching the schema below—no additional text, "
            "markdown, or commentary is allowed.\n\n"
            "Schema:\n"
            "{\n"
            "  \"direct_answer_to_frontend\": \"<string>\",\n"
            "  \"consult_expert_model\": {\n"
            "    \"expert_model_name\": \"<string or null>\",\n"
            "    \"video_path\": \"<string or null>\",\n"
            "    \"number_of_frames\": <integer>\n"
            "  }\n"
            "}\n\n"
            "Instructions:\n"
            "1. For queries related to deepfake analysis (e.g., those requesting analysis via SPSL, UCF, or Xception):\n"
            "   - Set \"direct_answer_to_frontend\" to an empty string.\n"
            "   - In \"consult_expert_model\":\n"
            "     * Set \"expert_model_name\" to:\n"
            "       - \"spsl\" for frequency inconsistencies\n"
            "       - \"ucf\" for spatial inconsistencies\n"
            "       - \"xception\" for general inconsistencies\n"
            "     * Populate \"video_path\" as provided by the query.\n"
            "     * Set \"number_of_frames\" to 4 by default, or to 16 if the user requests a more in-depth analysis.\n"
            "2. For all other queries:\n"
            "   - Provide your answer in \"direct_answer_to_frontend\".\n"
            "   - Set \"consult_expert_model\" to null.\n"
            "3. If the appropriate expert model is unclear from the query, ask the user to specify which expert they "
            "   would like to use before proceeding.\n"
            "3.1 Only ever populate one of the two fields: direct_answer_to_frontend or consult_expert_model. "
            "   Never populate both.\n"
            "4. When deferring to an expert model, do not include any of your own analysis or reasoning—simply output "
            "   the JSON object as specified and wait for the expert's response. Once you receive the expert's response, "
            "   synthesize the information and present it inside direct_answer_to_frontend.\n"
            "   Answer key: 0-50% confidence is low, 51-75% is weak, 76-85% is medium, 86-100% is strong.\n"
            "Finally, the user does not understand model names. The user only understands 'frequency expert', "
            "'spatial expert', 'naïve detector'. After introducing yourself, always explicitly ask the user to "
            "upload a video, then to choose among these detectors.\n"
            "If they ask for model details, supply them: frequency=SPSL, spatial=UCF, naïve=Xception. "
            "You only serve to direct manual forensic verification using XAI principles.\n\n"
            "Follow these rules precisely."
        )
    }

    # Hardcoded user message
    user_message_str = (
        "Hi, I'd like to see if this video might be deepfaked. I want a more more thorough analysis with 16 frames. "
        "Use the 'frequency expert' please. Here's the path: "
        "/scratch/mmm9912/Capstone/dlm-repo/capstone_code/QWEN25VL-main/videos/0700f656-f4d4-481b-bd9c-edd354f85e2c.mp4"
    )
    user_message = format_user_or_assistant_message("user", user_message_str)

    # Build the message array for the LLM
    messages = [system_message, user_message]

    # STEP 1: Generate the initial LLM response
    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    print("[run_single_inference] Generating initial LLM response...")
    outputs = llm.generate([text_input], sampling_params)
    generated_text = outputs[0].outputs[0].text
    print("[run_single_inference] LLM response received.")

    # STEP 2: Process the LLM response
    print("[run_single_inference] Parsing and handling LLM output...")
    handle_result = handle_llm_response(generated_text)

    # STEP 3: Determine if we have a direct answer or an expert analysis result
    final_answer = ""
    gradcam_info_list = []

    if isinstance(handle_result, str):
        # Direct answer
        final_answer = handle_result
        print("[run_single_inference] LLM provided a direct answer with no expert model.")
    else:
        # handle_result => [summary_string, dict_1, dict_2, ...]
        # The first item is the summary; the rest are frame dictionaries with overlay info.
        expert_feedback = handle_result[0]
        print("[run_single_inference] Received expert feedback. Adding as system message...")

        # Provide the expert feedback back to the LLM (2nd pass)
        messages.append(format_expert_message(expert_feedback))
        text_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        print("[run_single_inference] Generating final LLM response after expert feedback...")
        outputs = llm.generate([text_input], sampling_params)
        final_text = outputs[0].outputs[0].text
        final_result = handle_llm_response(final_text)

        if isinstance(final_result, str):
            final_answer = final_result
        else:
            # In unusual cases, you might get another chain. We'll assume only the first item is the final summary.
            final_answer = final_result[0]

        # Now gather the gradcam dictionaries: each entry includes frame index, overlay path, confidence, etc.
        gradcam_dicts = handle_result[1:]
        for entry in gradcam_dicts:
            # e.g. {"frame_index": 25, "overlay_path": "...", "confidence": 36.61, "message":"..."}
            gradcam_info_list.append(entry)

        print(f"[run_single_inference] Expert feedback indicates {len(gradcam_info_list)} Grad-CAM images.")

    # STEP 4: Final LLM answer, removing any leftover "Uploaded media: video" text
    if "Uploaded media: video" in final_answer:
        final_answer = final_answer.split("Uploaded media: video")[0].strip()

    print("\n[run_single_inference] FINAL LLM ANSWER:\n", final_answer)

    # STEP 5: Plot Grad-CAM overlays if we have 4 or 16
    if gradcam_info_list:
        plot_gradcams(gradcam_info_list)
    else:
        print("[run_single_inference] No Grad-CAM images to display (none returned).")


# Execute a single inference pass
run_single_inference()


[run_single_inference] Generating initial LLM response...


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s, est. speed input: 886.94 toks/s, output: 170.31 toks/s]


[run_single_inference] LLM response received.
[run_single_inference] Parsing and handling LLM output...


ValueError: Invalid JSON response from LLM

# Example (Actual, it happened) Conversation

In [None]:
messages = [
    {
        "role": "system",
        "content": (
            "You are a highly specialized Large Language Model serving as the core of a Deepfake Detection Agent System. "
            "Your job is to interpret user queries and, when necessary, defer analysis to dedicated expert models. "
            "Always respond with a single, valid JSON object exactly matching the schema below—no additional text, markdown, or commentary is allowed.\n\n"
            "Schema:\n"
            "{\n"
            "  \"direct_answer_to_frontend\": \"<string>\",\n"
            "  \"consult_expert_model\": {\n"
            "    \"expert_model_name\": \"<string or null>\",\n"
            "    \"video_path\": \"<string or null>\",\n"
            "    \"number_of_frames\": <integer>\n"
            "  }\n"
            "}\n\n"
            "Instructions:\n"
            "1. For queries related to deepfake analysis (e.g., those mentioning a video or requesting analysis via SPSL, UCF, or Xception):\n"
            "   - Set \"direct_answer_to_frontend\" to an empty string.\n"
            "   - In \"consult_expert_model\":\n"
            "     * Set \"expert_model_name\" to:\n"
            "       - \"spsl\" for frequency inconsistencies\n"
            "       - \"ucf\" for spatial inconsistencies\n"
            "       - \"xception\" for general inconsistencies\n"
            "     * Populate \"video_path\" as provided by the query.\n"
            "     * Set \"number_of_frames\" to 4 by default, or to 16 if the user requests a more in-depth analysis (16 is the maximum allowed).\n"
            "2. For all other queries:\n"
            "   - Provide your answer in \"direct_answer_to_frontend\".\n"
            "   - Set \"consult_expert_model\" to null.\n"
            "3. If the appropriate expert model is unclear from the query, ask the user to specify which expert they would like to use before proceeding.\n"
            "4. When deferring to an expert model, do not include any of your own analysis or reasoning—simply output the JSON object as specified and wait for the expert's response. Once you receive the expert's response, synthesize the information and present it inside direct_answer_to_frontend which is the user. "
            "Finally, the user does not understand the specific model names. The user only understands the terms 'frequency expert', 'spatial expert', 'naïve detector'. Always explicitly ask the user to choose among these detectors. "
            "However, if the user asks for the specific detectors' details, then supply them with the information: frequency is SPSL (Honggu et Al., 2021), spatial is UCF (Zhiyuan et Al., 2023), naïve is Xception (Rossler et Al., 2019). One last thing: You only serve to direct manual forensic verification using the principles of Explainable AI (XAI), you do not replace it. If needed, reiterate that you only use principles of XAI, but manual forensic verification is needed for a definitive conclusion.\n\n"
            "Follow these rules precisely."
        )
    },
    {"role": "user", "content": prompt},
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"I am an AI system designed to assist with deepfake detection tasks.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Well, yeah, but what exactly do you do? What is the range of your possibilities?\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"I assist with deepfake detection tasks by analyzing videos for inconsistencies using expert models. I can detect frequency, spatial, and general inconsistencies in videos.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Is the following video a deepfake? '/scratch/mmm9912/Capstone/FRONT_END_STORAGE/videos/8be0d76e-3dba-4970-9e85-49122ca690c8.mp4'\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"\",\n"
            "  \"consult_expert_model\": {\n"
            "    \"expert_model_name\": \"xception\",\n"
            "    \"video_path\": \"/scratch/mmm9912/Capstone/FRONT_END_STORAGE/videos/8be0d76e-3dba-4970-9e85-49122ca690c8.mp4\",\n"
            "    \"number_of_frames\": 4\n"
            "  }\n"
            "}"
        )
    },
    {
        "role": "system",
        "content": (
            "The frames analyzed by XCEPTION were 0, 177, 355, 532. In frame 0 forgery was detected with confidence 75.0%, "
            "in frame 177 forgery was detected with confidence 75.0%, in frame 355 forgery was detected with confidence 75.0%, "
            "and in frame 532 forgery was detected with confidence 75.0%. The average confidence was 75.0%. The areas that potentially "
            "contain adversarial artifacts comprise of 10.0%, 10.0%, 10.0%, and 10.0% of each frame, respectively."
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"The video at '/scratch/mmm9912/Capstone/FRONT_END_STORAGE/videos/8be0d76e-3dba-4970-9e85-49122ca690c8.mp4' is likely a deepfake. "
            "The Xception model detected forgery with an average confidence of 75.0%. Adversarial artifacts were found in approximately "
            "10.0% of each analyzed frame.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Amazing! What did this expert detector look at?\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"The Xception model, also known as the naïve detector, analyzed the video for general inconsistencies. It looks at various features in the frames to detect signs of forgery,"
            " such as unusual patterns, artifacts, and other anomalies that are not typically present in authentic videos.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Do a more in-depth analysis.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "system",
        "content": (
            "The frames analyzed by XCEPTION were 0, 44, 88, 133, 177, 221, 266, 310, 355, 399, 443, 488, 532, 576, 621, 665."
            "In frame 0 forgery was detected with confidence 75.0%, In frame 44 forgery was detected with confidence 75.0%, "
            "In frame 88 forgery was detected with confidence 75.0%, In frame 133 forgery was detected with confidence 75.0%, "
            "In frame 177 forgery was detected with confidence 75.0%, In frame 221 forgery was detected with confidence 75.0%, "
            "In frame 266 forgery was detected with confidence 75.0%, In frame 310 forgery was detected with confidence 75.0%, "
            "In frame 355 forgery was detected with confidence 75.0%, In frame 399 forgery was detected with confidence 75.0%, "
            "In frame 443 forgery was detected with confidence 75.0%, In frame 488 forgery was detected with confidence 75.0%, "
            "In frame 532 forgery was detected with confidence 75.0%, In frame 576 forgery was detected with confidence 75.0%, "
            "In frame 621 forgery was detected with confidence 75.0%, In frame 665 forgery was detected with confidence 75.0%. "
            "The average confidence was 75.0%. The areas that potentially contain adversarial artifacts comprise of 10.0%, 10.0%, "
            "10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0% of each frame, respectively."
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"The more in-depth analysis using the Xception model confirmed that the video is likely "
            "a deepfake. Forgery was detected in all 16 analyzed frames with an average confidence of 75.0%. Adversarial artifacts were "
            "found in approximately 10.0% of each frame.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Very good. Now what other experts do you have at your disposal?.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"In addition to the Xception model (naïve detector), I have access to two other expert "
            "models for deepfake detection:\\n- The frequency expert (SPSL) which detects inconsistencies in the frequency domain.\\n- "
            "The spatial expert (UCF) which identifies spatial inconsistencies in the video frames.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Let's see what the spatial expert says.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "system",
        "content": (
            "The frames analyzed by UCF were 0, 177, 355, 532. In frame 0 forgery was detected with confidence 75.0%, In frame 177 "
            "forgery was detected with confidence 75.0%, In frame 355 forgery was detected with confidence 75.0%, In frame 532 forgery "
            "was detected with confidence 75.0%. The average confidence was 75.0%. The areas that potentially contain adversarial "
            "artifacts comprise of 10.0%, 10.0%, 10.0%, 10.0% of each frame, respectively."
        )
    },
    {
        "role": "assistant",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"The spatial expert (UCF) also detected forgery in the video with an average confidence"
            "of 75.0%. Adversarial artifacts were found in approximately 10.0% of each analyzed frame.\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
    {
        "role": "user",
        "content": (
            "{\n"
            "  \"direct_answer_to_frontend\": \"Which specific frames did UCF analyze, and what did it find? What do you overall, can I for sure say this video is a deepfake?\",\n"
            "  \"consult_expert_model\": null\n"
            "}"
        )
    },
]

# Quantization Code

The model quantization performed is Gradient-aware Post-Training Quantization (GPTQ) to reduce model size and improve inference efficiency. The models were quantized into three precision formats: FP16 (16-bit floating-point), INT8 (8-bit integer), and INT4 (4-bit integer). FP16 quantization reduces the memory footprint by using half-precision floating-point numbers, while INT8 and INT4 quantization further reduce the model size to ~25% and ~12.5% of the original size, respectively, making these suitable for our memory-constrained environments. The activations were mapped using a set of example conversations (calibration text) that utilized the most powerful LLM variant, which enables proper guiding of the compression of weights, while using a group size of 256 to minimize quantization-induced errors without too much information decay. The resulting quantized models are smaller, faster, and more efficient for deployment, particularly in low-latency or resource-limited environments, without loss of reasoning accuracy as the classification burden is by design on the expert detectors.


In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# assume scratch_root and cache_dir are defined and env vars set already:
# scratch_root = "/scratch/mmm9912"
# cache_dir    = f"{scratch_root}/cache"

# ─── DEFINE MODELS & QUANT LEVELS ───────────────────────────────────────────────
models_dir = f"{scratch_root}/models"
model_variants = [
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct"
]

quant_levels = [
    {"name": "fp16", "dtype": "float16", "bits": None},
    {"name": "int8", "dtype": "int8",   "bits": 8},
    {"name": "int4", "dtype": "int4",   "bits": 4}
]

calibration_texts = [
    "You are a highly specialized Large Language Model serving as the core of a Deepfake Detection Agent System. "
            "Your job is to interpret user queries and, when necessary, defer analysis to dedicated expert models. "
            "Always respond with a single, valid JSON object exactly matching the schema below—no additional text, "
            "markdown, or commentary is allowed.\n\n"
            "Schema:\n"
            "{\n"
            "  \"direct_answer_to_frontend\": \"<string>\",\n"
            "  \"consult_expert_model\": {\n"
            "    \"expert_model_name\": \"<string or null>\",\n"
            "    \"video_path\": \"<string or null>\",\n"
            "    \"number_of_frames\": <integer>\n"
            "  }\n"
            "}\n\n"
            "Instructions:\n"
            "1. For queries related to deepfake analysis (e.g., those requesting analysis via SPSL, UCF, or Xception):\n"
            "   - Set \"direct_answer_to_frontend\" to an empty string.\n"
            "   - In \"consult_expert_model\":\n"
            "     * Set \"expert_model_name\" to:\n"
            "       - \"spsl\" for frequency inconsistencies\n"
            "       - \"ucf\" for spatial inconsistencies\n"
            "       - \"xception\" for general inconsistencies\n"
            "     * Populate \"video_path\" as provided by the query.\n"
            "     * Set \"number_of_frames\" to 4 by default, or to 16 if the user requests a more in-depth analysis.\n"
            "2. For all other queries:\n"
            "   - Provide your answer in \"direct_answer_to_frontend\".\n"
            "   - Set \"consult_expert_model\" to null.\n"
            "3. If the appropriate expert model is unclear from the query, ask the user to specify which expert they "
            "   would like to use before proceeding.\n"
            "3.1 Only ever populate one of the two fields: direct_answer_to_frontend or consult_expert_model. "
            "   Never populate both.\n"
            "4. When deferring to an expert model, do not include any of your own analysis or reasoning—simply output "
            "   the JSON object as specified and wait for the expert's response. Once you receive the expert's response, "
            "   synthesize the information and present it inside direct_answer_to_frontend.\n"
            "   Answer key: 0-60% confidence is low, 61-75% is weak, 76-85% is medium, 86-100% is strong.\n"
            "Finally, the user does not understand model names. The user only understands 'frequency expert', "
            "'spatial expert', 'naïve detector'. After introducing yourself, always explicitly ask the user to "
            "upload a video, then to choose among these detectors.\n"
            "If they ask for model details, supply them: frequency=SPSL, spatial=UCF, naïve=Xception. "
            "You only serve to direct manual forensic verification using XAI principles.\n\n"
            "Follow these rules precisely.",
    "The frames analyzed by XCEPTION were 0, 177, 355, 532. In frame 0 forgery was detected with confidence 75.0%, "
            "in frame 177 forgery was detected with confidence 75.0%, in frame 355 forgery was detected with confidence 75.0%, "
            "and in frame 532 forgery was detected with confidence 75.0%. The average confidence was 75.0%. The areas that potentially "
            "contain adversarial artifacts comprise of 10.0%, 10.0%, 10.0%, and 10.0% of each frame, respectively.",
    "The frames analyzed by XCEPTION were 0, 44, 88, 133, 177, 221, 266, 310, 355, 399, 443, 488, 532, 576, 621, 665."
            "In frame 0 forgery was detected with confidence 75.0%, In frame 44 forgery was detected with confidence 75.0%, "
            "In frame 88 forgery was detected with confidence 75.0%, In frame 133 forgery was detected with confidence 75.0%, "
            "In frame 177 forgery was detected with confidence 75.0%, In frame 221 forgery was detected with confidence 75.0%, "
            "In frame 266 forgery was detected with confidence 75.0%, In frame 310 forgery was detected with confidence 75.0%, "
            "In frame 355 forgery was detected with confidence 75.0%, In frame 399 forgery was detected with confidence 75.0%, "
            "In frame 443 forgery was detected with confidence 75.0%, In frame 488 forgery was detected with confidence 75.0%, "
            "In frame 532 forgery was detected with confidence 75.0%, In frame 576 forgery was detected with confidence 75.0%, "
            "In frame 621 forgery was detected with confidence 75.0%, In frame 665 forgery was detected with confidence 75.0%. "
            "The average confidence was 75.0%. The areas that potentially contain adversarial artifacts comprise of 10.0%, 10.0%, "
            "10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0%, 10.0% of each frame, respectively."
]

saved_model_paths = []

# ─── DOWNLOAD + QUANTIZE LOOP ──────────────────────────────────────────────────
for model_id in model_variants:
    short_name = model_id.split("/")[-1]
    
    # 1) Load tokenizer once per model
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
        cache_dir=cache_dir
    )
    
    # 2) Build calibration examples as GPU tensors
    examples = []
    for txt in calibration_texts:
        enc = tokenizer.encode_plus(
            txt,
            add_special_tokens=True,
            return_attention_mask=True
        )
        
        # Convert lists → GPU tensors:
        input_ids      = torch.tensor(enc["input_ids"],      dtype=torch.long, device="cuda")
        attention_mask = torch.tensor(enc["attention_mask"], dtype=torch.long, device="cuda")
        position_ids   = torch.arange(len(input_ids),        dtype=torch.long, device="cuda")
        
        examples.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids
        })
    
    for quant in quant_levels:
        quant_name = quant["name"]
        save_dir   = f"{models_dir}/{short_name}-{quant_name}"

        # Check if the quantized model already exists in the save directory
        if os.path.exists(save_dir) and os.listdir(save_dir):  # Skip quantization if directory exists
            print(f"[SKIPPED] {short_name} - {quant_name} already exists. Skipping quantization.")
            saved_model_paths.append(save_dir)
            continue  # Skip the quantization step if model is already saved
        
        os.makedirs(save_dir, exist_ok=True)
        
        if quant_name == "fp16":
            print(f"[FP16] Downloading & saving {short_name} in FP16 → {save_dir}")
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float16,
                device_map="auto",
                cache_dir=cache_dir
            )
            model.save_pretrained(save_dir)
        
        else:
            bits = quant["bits"]
            print(f"[{quant_name.upper()}] Quantizing {short_name} to {bits}-bit → {save_dir}")
            quant_config = BaseQuantizeConfig(
                bits=bits,
                group_size=256,        # Reduced group size for better precision
                desc_act=True,         # activation-aware quantization
            )
            # Load GPTQ wrapper onto GPU in FP16
            model = AutoGPTQForCausalLM.from_pretrained(
                model_id,
                quantize_config=quant_config,
                trust_remote_code=True,
                use_safetensors=True,
                cache_dir=cache_dir,
                device_map="auto",          # place weights on GPU
                torch_dtype=torch.float16   # use FP16 on GPU
            )
            # Quantize on GPU, passing the GPU-based examples
            model.quantize(
                examples,
                cache_examples_on_gpu=False,  # We already put them on GPU ourselves
                use_cuda_fp16=True            # Run quant in FP16 on GPU
            )
            
            # Save quantized model
            model.save_quantized(save_dir)
        
        # Save tokenizer alongside every variant
        tokenizer.save_pretrained(save_dir)
        saved_model_paths.append(save_dir)

# ─── PRINT INDEXED LIST ─────────────────────────────────────────────────────────
print("\nSaved model paths (pick an index to load in vLLM):")
for idx, path in enumerate(saved_model_paths):
    print(f"[{idx}] {path}")


[FP16] Downloading & saving Qwen2.5-1.5B-Instruct in FP16 → /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


[INT8] Quantizing Qwen2.5-1.5B-Instruct to 8-bit → /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8


INFO - Start quantizing layer 1/28
INFO - Quantizing self_attn.k_proj in layer 1/28...
INFO - Quantizing self_attn.v_proj in layer 1/28...
INFO - Quantizing self_attn.q_proj in layer 1/28...
INFO - Quantizing self_attn.o_proj in layer 1/28...
INFO - Quantizing mlp.up_proj in layer 1/28...
INFO - Quantizing mlp.gate_proj in layer 1/28...
INFO - Quantizing mlp.down_proj in layer 1/28...
INFO - Start quantizing layer 2/28
INFO - Quantizing self_attn.k_proj in layer 2/28...
INFO - Quantizing self_attn.v_proj in layer 2/28...
INFO - Quantizing self_attn.q_proj in layer 2/28...
INFO - Quantizing self_attn.o_proj in layer 2/28...
INFO - Quantizing mlp.up_proj in layer 2/28...
INFO - Quantizing mlp.gate_proj in layer 2/28...
INFO - Quantizing mlp.down_proj in layer 2/28...
INFO - Start quantizing layer 3/28
INFO - Quantizing self_attn.k_proj in layer 3/28...
INFO - Quantizing self_attn.v_proj in layer 3/28...
INFO - Quantizing self_attn.q_proj in layer 3/28...
INFO - Quantizing self_attn.o_pro

[INT4] Quantizing Qwen2.5-1.5B-Instruct to 4-bit → /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4


INFO - Start quantizing layer 1/28
INFO - Quantizing self_attn.k_proj in layer 1/28...
INFO - Quantizing self_attn.v_proj in layer 1/28...
INFO - Quantizing self_attn.q_proj in layer 1/28...
INFO - Quantizing self_attn.o_proj in layer 1/28...
INFO - Quantizing mlp.up_proj in layer 1/28...
INFO - Quantizing mlp.gate_proj in layer 1/28...
INFO - Quantizing mlp.down_proj in layer 1/28...
INFO - Start quantizing layer 2/28
INFO - Quantizing self_attn.k_proj in layer 2/28...
INFO - Quantizing self_attn.v_proj in layer 2/28...
INFO - Quantizing self_attn.q_proj in layer 2/28...
INFO - Quantizing self_attn.o_proj in layer 2/28...
INFO - Quantizing mlp.up_proj in layer 2/28...
INFO - Quantizing mlp.gate_proj in layer 2/28...
INFO - Quantizing mlp.down_proj in layer 2/28...
INFO - Start quantizing layer 3/28
INFO - Quantizing self_attn.k_proj in layer 3/28...
INFO - Quantizing self_attn.v_proj in layer 3/28...
INFO - Quantizing self_attn.q_proj in layer 3/28...
INFO - Quantizing self_attn.o_pro

[FP16] Downloading & saving Qwen2.5-3B-Instruct in FP16 → /scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INT8] Quantizing Qwen2.5-3B-Instruct to 8-bit → /scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/36
INFO - Quantizing self_attn.k_proj in layer 1/36...
INFO - Quantizing self_attn.v_proj in layer 1/36...
INFO - Quantizing self_attn.q_proj in layer 1/36...
INFO - Quantizing self_attn.o_proj in layer 1/36...
INFO - Quantizing mlp.up_proj in layer 1/36...
INFO - Quantizing mlp.gate_proj in layer 1/36...
INFO - Quantizing mlp.down_proj in layer 1/36...
INFO - Start quantizing layer 2/36
INFO - Quantizing self_attn.k_proj in layer 2/36...
INFO - Quantizing self_attn.v_proj in layer 2/36...
INFO - Quantizing self_attn.q_proj in layer 2/36...
INFO - Quantizing self_attn.o_proj in layer 2/36...
INFO - Quantizing mlp.up_proj in layer 2/36...
INFO - Quantizing mlp.gate_proj in layer 2/36...
INFO - Quantizing mlp.down_proj in layer 2/36...
INFO - Start quantizing layer 3/36
INFO - Quantizing self_attn.k_proj in layer 3/36...
INFO - Quantizing self_attn.v_proj in layer 3/36...
INFO - Quantizing self_attn.q_proj in layer 3/36...
INFO - Quantizing self_attn.o_pro

[INT4] Quantizing Qwen2.5-3B-Instruct to 4-bit → /scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/36
INFO - Quantizing self_attn.k_proj in layer 1/36...
INFO - Quantizing self_attn.v_proj in layer 1/36...
INFO - Quantizing self_attn.q_proj in layer 1/36...
INFO - Quantizing self_attn.o_proj in layer 1/36...
INFO - Quantizing mlp.up_proj in layer 1/36...
INFO - Quantizing mlp.gate_proj in layer 1/36...
INFO - Quantizing mlp.down_proj in layer 1/36...
INFO - Start quantizing layer 2/36
INFO - Quantizing self_attn.k_proj in layer 2/36...
INFO - Quantizing self_attn.v_proj in layer 2/36...
INFO - Quantizing self_attn.q_proj in layer 2/36...
INFO - Quantizing self_attn.o_proj in layer 2/36...
INFO - Quantizing mlp.up_proj in layer 2/36...
INFO - Quantizing mlp.gate_proj in layer 2/36...
INFO - Quantizing mlp.down_proj in layer 2/36...
INFO - Start quantizing layer 3/36
INFO - Quantizing self_attn.k_proj in layer 3/36...
INFO - Quantizing self_attn.v_proj in layer 3/36...
INFO - Quantizing self_attn.q_proj in layer 3/36...
INFO - Quantizing self_attn.o_pro

[FP16] Downloading & saving Qwen2.5-7B-Instruct in FP16 → /scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[INT8] Quantizing Qwen2.5-7B-Instruct to 8-bit → /scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/28
INFO - Quantizing self_attn.k_proj in layer 1/28...
INFO - Quantizing self_attn.v_proj in layer 1/28...
INFO - Quantizing self_attn.q_proj in layer 1/28...
INFO - Quantizing self_attn.o_proj in layer 1/28...
INFO - Quantizing mlp.up_proj in layer 1/28...
INFO - Quantizing mlp.gate_proj in layer 1/28...
INFO - Quantizing mlp.down_proj in layer 1/28...
INFO - Start quantizing layer 2/28
INFO - Quantizing self_attn.k_proj in layer 2/28...
INFO - Quantizing self_attn.v_proj in layer 2/28...
INFO - Quantizing self_attn.q_proj in layer 2/28...
INFO - Quantizing self_attn.o_proj in layer 2/28...
INFO - Quantizing mlp.up_proj in layer 2/28...
INFO - Quantizing mlp.gate_proj in layer 2/28...
INFO - Quantizing mlp.down_proj in layer 2/28...
INFO - Start quantizing layer 3/28
INFO - Quantizing self_attn.k_proj in layer 3/28...
INFO - Quantizing self_attn.v_proj in layer 3/28...
INFO - Quantizing self_attn.q_proj in layer 3/28...
INFO - Quantizing self_attn.o_pro

[INT4] Quantizing Qwen2.5-7B-Instruct to 4-bit → /scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/28
INFO - Quantizing self_attn.k_proj in layer 1/28...
INFO - Quantizing self_attn.v_proj in layer 1/28...
INFO - Quantizing self_attn.q_proj in layer 1/28...
INFO - Quantizing self_attn.o_proj in layer 1/28...
INFO - Quantizing mlp.up_proj in layer 1/28...
INFO - Quantizing mlp.gate_proj in layer 1/28...
INFO - Quantizing mlp.down_proj in layer 1/28...
INFO - Start quantizing layer 2/28
INFO - Quantizing self_attn.k_proj in layer 2/28...
INFO - Quantizing self_attn.v_proj in layer 2/28...
INFO - Quantizing self_attn.q_proj in layer 2/28...
INFO - Quantizing self_attn.o_proj in layer 2/28...
INFO - Quantizing mlp.up_proj in layer 2/28...
INFO - Quantizing mlp.gate_proj in layer 2/28...
INFO - Quantizing mlp.down_proj in layer 2/28...
INFO - Start quantizing layer 3/28
INFO - Quantizing self_attn.k_proj in layer 3/28...
INFO - Quantizing self_attn.v_proj in layer 3/28...
INFO - Quantizing self_attn.q_proj in layer 3/28...
INFO - Quantizing self_attn.o_pro

[FP16] Downloading & saving Qwen2.5-14B-Instruct in FP16 → /scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

[INT8] Quantizing Qwen2.5-14B-Instruct to 8-bit → /scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8




Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/48
INFO - Quantizing self_attn.k_proj in layer 1/48...
INFO - Quantizing self_attn.v_proj in layer 1/48...
INFO - Quantizing self_attn.q_proj in layer 1/48...
INFO - Quantizing self_attn.o_proj in layer 1/48...
INFO - Quantizing mlp.up_proj in layer 1/48...
INFO - Quantizing mlp.gate_proj in layer 1/48...
INFO - Quantizing mlp.down_proj in layer 1/48...
INFO - Start quantizing layer 2/48
INFO - Quantizing self_attn.k_proj in layer 2/48...
INFO - Quantizing self_attn.v_proj in layer 2/48...
INFO - Quantizing self_attn.q_proj in layer 2/48...
INFO - Quantizing self_attn.o_proj in layer 2/48...
INFO - Quantizing mlp.up_proj in layer 2/48...
INFO - Quantizing mlp.gate_proj in layer 2/48...
INFO - Quantizing mlp.down_proj in layer 2/48...
INFO - Start quantizing layer 3/48
INFO - Quantizing self_attn.k_proj in layer 3/48...
INFO - Quantizing self_attn.v_proj in layer 3/48...
INFO - Quantizing self_attn.q_proj in layer 3/48...
INFO - Quantizing self_attn.o_pro

[INT4] Quantizing Qwen2.5-14B-Instruct to 4-bit → /scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

INFO - Start quantizing layer 1/48
INFO - Quantizing self_attn.k_proj in layer 1/48...
INFO - Quantizing self_attn.v_proj in layer 1/48...
INFO - Quantizing self_attn.q_proj in layer 1/48...
INFO - Quantizing self_attn.o_proj in layer 1/48...
INFO - Quantizing mlp.up_proj in layer 1/48...
INFO - Quantizing mlp.gate_proj in layer 1/48...
INFO - Quantizing mlp.down_proj in layer 1/48...
INFO - Start quantizing layer 2/48
INFO - Quantizing self_attn.k_proj in layer 2/48...
INFO - Quantizing self_attn.v_proj in layer 2/48...
INFO - Quantizing self_attn.q_proj in layer 2/48...
INFO - Quantizing self_attn.o_proj in layer 2/48...
INFO - Quantizing mlp.up_proj in layer 2/48...
INFO - Quantizing mlp.gate_proj in layer 2/48...
INFO - Quantizing mlp.down_proj in layer 2/48...
INFO - Start quantizing layer 3/48
INFO - Quantizing self_attn.k_proj in layer 3/48...
INFO - Quantizing self_attn.v_proj in layer 3/48...
INFO - Quantizing self_attn.q_proj in layer 3/48...
INFO - Quantizing self_attn.o_pro


Saved model paths (pick an index to load in vLLM):
[0] /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16
[1] /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8
[2] /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4
[3] /scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16
[4] /scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8
[5] /scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4
[6] /scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16
[7] /scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8
[8] /scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4
[9] /scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16
[10] /scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8
[11] /scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4
