In [1]:
import os
import pandas as pd

# Only allow CUDA device(s); disable CPU fallback
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # or "0,1" for multiple GPUs

# Optional: disable multithreaded CPU ops to avoid CPU fallback paths
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["INFERENCE_ENGINE_DISABLE_CPU"] = "1"

# Set all cache directories explicitly
scratch_root = "/scratch/mmm9912"

os.environ["HF_HOME"] = "/scratch/mmm9912/cache"
os.environ["TORCH_HOME"] = "/scratch/mmm9912/cache/torch"
os.environ["TFHUB_CACHE_DIR"] = "/scratch/mmm9912/cache/tensorflow"
os.environ["XDG_CACHE_HOME"] = "/scratch/mmm9912/cache"
os.environ["HF_DATASETS_CACHE"] = "/scratch/mmm9912/cache/huggingface_datasets"
os.environ["PIP_CACHE_DIR"] = "/scratch/mmm9912/cache/pip"

cache_dir = "/scratch/mmm9912/cache"

gcc_path = "/share/apps/NYUAD5/gcc/9.2.0/bin/gcc"
gcc_dir = os.path.dirname(gcc_path)

os.environ["CC"] = gcc_path
os.environ["CXX"] = f"{gcc_dir}/g++"  # Set C++ compiler as well

print("CC set to:", os.environ["CC"])
print("CXX set to:", os.environ["CXX"])

from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from vllm import LLM, SamplingParams
import torch

# Blow up if any tensor goes to CPU
torch.set_default_tensor_type(torch.cuda.FloatTensor)

import time
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

torch._dynamo.config.suppress_errors = True

from prompting_utils_local import *

# ─── MEMORY MEASUREMENT FUNCTIONS ─────────────────────────────────────────────
def measure_model_memory(model_path, group_size=256):
    torch.cuda.empty_cache()
    time.sleep(2)
    
    try:
        # 1. Get true baseline
        torch.cuda.reset_peak_memory_stats()
        baseline = torch.cuda.memory_allocated()
        
        # 2. Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            use_fast=False,
            padding_side="left",
            pad_token="<|endoftext|>"
        )
        
        # 3. Temporary config for pure weight measurement
        is_quantized = any(x in model_path for x in ['int4', 'int8'])
        llm_config = {
            "model": model_path,
            "dtype": "float16",
            "quantization": "gptq" if is_quantized else None,
            "tensor_parallel_size": 1,
            "trust_remote_code": True,
            # Set a minimal context length; caching is not required.
            "max_model_len": 256,
            "disable_log_stats": True
        }
        if is_quantized:
            llm_config.update({
                "gptq_group_size": group_size,
                "gptq_desc_act": True
            })
        
        # 4. Load model and measure the weight memory usage
        llm = LLM(**llm_config)
        static_mem = torch.cuda.memory_allocated() - baseline
        
        # 5. Completely disable KV caching by overriding the cache initializer.
        # This ensures no caching memory is allocated.
        if hasattr(llm, "llm_engine") and hasattr(llm.llm_engine, "_init_cache"):
            llm.llm_engine._init_cache = lambda: None
        
        # 6. Measure full inference memory without any cache overhead.
        peak_mem = measure_inference_memory(llm, tokenizer, group_size)
        # With caching disabled, any difference is from inference only.
        kv_cache_mem = peak_mem - static_mem - baseline
        
        # 7. Cleanup
        del llm
        torch.cuda.empty_cache()
        
        return {
            "Model": os.path.basename(model_path),
            "Weights (MB)": static_mem / 1024**2,
            "KV Cache (MB)": kv_cache_mem / 1024**2,
            "Total (MB)": (static_mem + kv_cache_mem) / 1024**2
        }
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# ─── MAIN MEASUREMENT LOOP ────────────────────────────────────────────────────
def main():
    model_paths = [
        "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16",
        "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8",
        "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4",
        "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16",
        "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8",
        "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4",
        "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16",
        "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8",
        "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4",
        "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16",
        "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8",
        "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4"
    ]

    results = []
    
    for model_path in model_paths:
        print(f"\n{'='*40}\nProcessing: {model_path}\n{'='*40}")
        torch.cuda.empty_cache()
        time.sleep(2)
        
        try:
            # Load tokenizer with Qwen settings
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=True,
                use_fast=False,
                padding_side="left",
                pad_token="<|endoftext|>"
            )
            
            # Model configuration for inference;
            # no caching parameters are passed. Caching is disabled by patching _init_cache.
            model_name = os.path.basename(model_path)
            is_quantized = any(x in model_name for x in ['int4', 'int8'])
            
            llm_config = {
                "model": model_path,
                "dtype": "float16",
                "quantization": "gptq" if is_quantized else None,
                "enforce_eager": True,
                "tensor_parallel_size": 1,
                "trust_remote_code": True,
                "max_model_len": 2048,
                "download_dir": cache_dir
            }

            # Load model
            torch.cuda.reset_peak_memory_stats()
            llm = LLM(**llm_config)
            # Disable KV caching by overriding _init_cache with a no-op.
            if hasattr(llm, "llm_engine") and hasattr(llm.llm_engine, "_init_cache"):
                llm.llm_engine._init_cache = lambda: None

            static_mem = torch.cuda.memory_allocated()
            
            # Measure memory usage during inference.
            peak_mem = measure_inference_memory(llm, tokenizer)
            dynamic_mem = peak_mem - static_mem
            
            results.append({
                "Model": model_name,
                "Static (MB)": static_mem / (1024 ** 2),
                "Dynamic (MB)": dynamic_mem / (1024 ** 2),
                "Total Peak (MB)": peak_mem / (1024 ** 2)
            })
            
            del llm
            del tokenizer
            
        except Exception as e:
            print(f"Error: {str(e)}")
            continue

    # Save results
    df = pd.DataFrame(results)
    print("\nMemory Usage Report:")
    print(df.to_string(index=False))
    df.to_csv("memory_results.csv", index=False)

main()


CC set to: /share/apps/NYUAD5/gcc/9.2.0/bin/gcc
CXX set to: /share/apps/NYUAD5/gcc/9.2.0/bin/g++


  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
  _C._set_default_tensor_type(t)



Processing: /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16
INFO 04-15 06:04:50 __init__.py:207] Automatically detected platform cuda.
INFO 04-15 06:04:58 config.py:549] This model supports multiple tasks: {'score', 'embed', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 04-15 06:04:58 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16', speculative_config=None, tokenizer='/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir='/scratch/mmm9912/cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), ob

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:05:03 model_runner.py:1115] Loading model weights took 2.8875 GB
INFO 04-15 06:05:05 worker.py:267] Memory profiling takes 1.03 seconds
INFO 04-15 06:05:05 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:05 worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 66.87GiB.
INFO 04-15 06:05:05 executor_base.py:111] # cuda blocks: 156503, # CPU blocks: 9362
INFO 04-15 06:05:05 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 1222.68x
INFO 04-15 06:05:08 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 4.73 seconds
Error: name 'measure_inference_memory' is not defined

Processing: /scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8
INFO 04-15 06:05:13 config.py:549] This model supports multiple tasks: {'score', 'embed', 'generate

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:05:17 model_runner.py:1115] Loading model weights took 1.6816 GB
INFO 04-15 06:05:18 worker.py:267] Memory profiling takes 0.57 seconds
INFO 04-15 06:05:18 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:18 worker.py:267] model weights take 1.68GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.38GiB; the rest of the memory reserved for KV Cache is 68.17GiB.
INFO 04-15 06:05:19 executor_base.py:111] # cuda blocks: 159554, # CPU blocks: 9362
INFO 04-15 06:05:19 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 1246.52x
Error: CUDA out of memory. Tried to allocate 2.44 GiB. GPU 0 has a total capacity of 79.15 GiB of which 2.12 GiB is free. Including non-PyTorch memory, this process has 77.01 GiB memory in use. Of the allocated memory 76.33 GiB is allocated by PyTorch, and 183.11 MiB is reserved by PyTorch but unallocated. If reserved but una

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:05:24 model_runner.py:1115] Loading model weights took 1.0855 GB
INFO 04-15 06:05:25 worker.py:267] Memory profiling takes 0.54 seconds
INFO 04-15 06:05:25 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:25 worker.py:267] model weights take 1.09GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.38GiB; the rest of the memory reserved for KV Cache is 68.77GiB.
INFO 04-15 06:05:26 executor_base.py:111] # cuda blocks: 160954, # CPU blocks: 9362
INFO 04-15 06:05:26 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 1257.45x
Error: CUDA out of memory. Tried to allocate 2.46 GiB. GPU 0 has a total capacity of 79.15 GiB of which 343.25 MiB is free. Including non-PyTorch memory, this process has 78.79 GiB memory in use. Of the allocated memory 78.23 GiB is allocated by PyTorch, and 60.77 MiB is reserved by PyTorch but unallocated. If reserved but un

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 04-15 06:05:33 model_runner.py:1115] Loading model weights took 5.7837 GB
INFO 04-15 06:05:34 worker.py:267] Memory profiling takes 0.56 seconds
INFO 04-15 06:05:34 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:34 worker.py:267] model weights take 5.78GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 64.07GiB.
INFO 04-15 06:05:35 executor_base.py:111] # cuda blocks: 116630, # CPU blocks: 7281
INFO 04-15 06:05:35 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 911.17x
Error: CUDA out of memory. Tried to allocate 1.78 GiB. GPU 0 has a total capacity of 79.15 GiB of which 1.07 GiB is free. Including non-PyTorch memory, this process has 78.05 GiB memory in use. Of the allocated memory 77.34 GiB is allocated by PyTorch, and 218.16 MiB is reserved by PyTorch but unallocated. If reserved but unal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:05:41 model_runner.py:1115] Loading model weights took 3.2509 GB
INFO 04-15 06:05:42 worker.py:267] Memory profiling takes 0.57 seconds
INFO 04-15 06:05:42 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:42 worker.py:267] model weights take 3.25GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 66.60GiB.
INFO 04-15 06:05:43 executor_base.py:111] # cuda blocks: 121241, # CPU blocks: 7281
INFO 04-15 06:05:43 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 947.20x
Error: CUDA out of memory. Tried to allocate 1.85 GiB. GPU 0 has a total capacity of 79.15 GiB of which 1.73 GiB is free. Including non-PyTorch memory, this process has 77.40 GiB memory in use. Of the allocated memory 76.73 GiB is allocated by PyTorch, and 173.36 MiB is reserved by PyTorch but unallocated. If reserved but unal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:05:49 model_runner.py:1115] Loading model weights took 1.9277 GB
INFO 04-15 06:05:50 worker.py:267] Memory profiling takes 0.56 seconds
INFO 04-15 06:05:50 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:05:50 worker.py:267] model weights take 1.93GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 67.92GiB.
INFO 04-15 06:05:50 executor_base.py:111] # cuda blocks: 123649, # CPU blocks: 7281
INFO 04-15 06:05:50 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 966.01x
Error: CUDA out of memory. Tried to allocate 1.89 GiB. GPU 0 has a total capacity of 79.15 GiB of which 1.12 GiB is free. Including non-PyTorch memory, this process has 78.00 GiB memory in use. Of the allocated memory 77.37 GiB is allocated by PyTorch, and 141.01 MiB is reserved by PyTorch but unallocated. If reserved but unal

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:06:05 model_runner.py:1115] Loading model weights took 8.2107 GB
Error: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 79.15 GiB of which 9.25 MiB is free. Including non-PyTorch memory, this process has 79.12 GiB memory in use. Of the allocated memory 78.17 GiB is allocated by PyTorch, and 453.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Processing: /scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4
INFO 04-15 06:06:08 config.py:549] This model supports multiple tasks: {'score', 'embed', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 04-15 06:06:08 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4', specula

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 06:06:15 model_runner.py:1115] Loading model weights took 5.1396 GB
INFO 04-15 06:06:16 worker.py:267] Memory profiling takes 0.64 seconds
INFO 04-15 06:06:16 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.90) = 71.24GiB
INFO 04-15 06:06:16 worker.py:267] model weights take 5.14GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 64.70GiB.
INFO 04-15 06:06:17 executor_base.py:111] # cuda blocks: 75720, # CPU blocks: 4681
INFO 04-15 06:06:17 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 591.56x
Error: CUDA out of memory. Tried to allocate 2.31 GiB. GPU 0 has a total capacity of 79.15 GiB of which 1.06 GiB is free. Including non-PyTorch memory, this process has 78.07 GiB memory in use. Of the allocated memory 77.23 GiB is allocated by PyTorch, and 349.86 MiB is reserved by PyTorch but unallocated. If reserved but unall