# In [None]:
import os
import pandas as pd

# Expose only GPU 0 to CUDA-aware libraries (use "0,1" for multiple GPUs).
# This restricts which GPUs are visible; it does not by itself stop code
# from running on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Pin OpenMP and MKL to one thread each.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
# NOTE(review): kept as in the original; confirm that some library in the
# stack actually reads this variable.
os.environ["INFERENCE_ENGINE_DISABLE_CPU"] = "1"

# Keep every cache on the scratch filesystem. All locations are derived from
# scratch_root so relocating the scratch area is a one-line change.
scratch_root = "/scratch/mmm9912"
cache_dir = f"{scratch_root}/cache"

os.environ.update({
    "HF_HOME": cache_dir,
    "TORCH_HOME": f"{cache_dir}/torch",
    "TFHUB_CACHE_DIR": f"{cache_dir}/tensorflow",
    "XDG_CACHE_HOME": cache_dir,
    "HF_DATASETS_CACHE": f"{cache_dir}/huggingface_datasets",
    "PIP_CACHE_DIR": f"{cache_dir}/pip",
})

# Point C/C++ builds at the cluster's GCC 9.2.0 toolchain; g++ is taken from
# the same directory as gcc.
gcc_path = "/share/apps/NYUAD5/gcc/9.2.0/bin/gcc"
gcc_dir = os.path.dirname(gcc_path)

os.environ["CC"] = gcc_path
os.environ["CXX"] = f"{gcc_dir}/g++"

print("CC set to:", os.environ["CC"])
print("CXX set to:", os.environ["CXX"])

from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from vllm import LLM, SamplingParams
import torch

# Make CUDA float32 the default type for newly created tensors.
# NOTE(review): this only changes the default for new tensors; it does not by
# itself raise when a tensor is created on, or moved to, the CPU. Confirm this
# matches the intent.
torch.set_default_tensor_type(torch.cuda.FloatTensor)

import time
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Tell TorchDynamo to suppress compilation errors instead of raising them
# (per the PyTorch docs, the affected code then runs in eager mode).
torch._dynamo.config.suppress_errors = True

from prompting_utils_local import *

# ─── MEMORY MEASUREMENT FUNCTIONS ─────────────────────────────────────────────
def measure_model_memory(model_path, group_size=256):
    """Measure GPU memory used by a model's weights and by an inference pass.

    The model is loaded through vLLM, the allocation delta after loading is
    recorded as the weight footprint, and ``measure_inference_memory`` is
    then used to obtain the peak figure for inference.

    Args:
        model_path: Path of the model directory. The model is treated as
            GPTQ-quantized when its final path component contains ``int4``
            or ``int8``.
        group_size: GPTQ group size, forwarded to the engine configuration
            for quantized models and to ``measure_inference_memory``.

    Returns:
        A dict with the keys ``"Model"``, ``"Weights (MB)"``,
        ``"KV Cache (MB)"`` and ``"Total (MB)"``, or ``None`` when any step
        raises (the error is printed, not propagated).
    """
    torch.cuda.empty_cache()
    time.sleep(2)

    # Bound up front so the ``finally`` clause can always drop the reference.
    llm = None
    try:
        # 1. Record the allocation level before anything is loaded.
        torch.cuda.reset_peak_memory_stats()
        baseline = torch.cuda.memory_allocated()

        # 2. Load the tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            use_fast=False,
            padding_side="left",
            pad_token="<|endoftext|>"
        )

        # 3. Build the engine configuration. Quantization is inferred from
        # the model directory name only (normalised so a trailing slash does
        # not yield an empty name); matching against the whole path would
        # misfire on a parent directory that happens to contain "int4"/"int8".
        model_name = os.path.basename(os.path.normpath(model_path))
        is_quantized = any(tag in model_name for tag in ('int4', 'int8'))
        llm_config = {
            "model": model_path,
            "dtype": "float16",
            "quantization": "gptq" if is_quantized else None,
            "tensor_parallel_size": 1,
            "trust_remote_code": True,
            # Minimal context length for this measurement.
            "max_model_len": 256,
            "disable_log_stats": True
        }
        if is_quantized:
            # NOTE(review): confirm the installed vLLM accepts these two
            # keyword arguments; if it rejects them the exception handler
            # below turns every quantized run into a ``None`` result.
            llm_config.update({
                "gptq_group_size": group_size,
                "gptq_desc_act": True
            })

        # 4. Load the model; the allocation delta is taken as weight memory.
        llm = LLM(**llm_config)
        static_mem = torch.cuda.memory_allocated() - baseline

        # 5. Replace the engine's cache initializer with a no-op.
        # NOTE(review): this runs after the engine has been constructed, so
        # it can only affect later calls to ``_init_cache``; confirm that it
        # really prevents KV-cache allocation for the vLLM version in use.
        if hasattr(llm, "llm_engine") and hasattr(llm.llm_engine, "_init_cache"):
            llm.llm_engine._init_cache = lambda: None

        # 6. Peak memory during inference. ``measure_inference_memory`` is
        # defined outside this block and is assumed to return bytes.
        # NOTE(review): ``main`` in this file calls it with two arguments;
        # confirm its signature accepts ``group_size`` as a third.
        peak_mem = measure_inference_memory(llm, tokenizer, group_size)
        # Whatever exceeds baseline + weights is attributed to inference.
        kv_cache_mem = peak_mem - static_mem - baseline

        return {
            "Model": model_name,
            "Weights (MB)": static_mem / 1024**2,
            "KV Cache (MB)": kv_cache_mem / 1024**2,
            "Total (MB)": (static_mem + kv_cache_mem) / 1024**2
        }

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error: {str(e)}")
        return None

    finally:
        # 7. Cleanup on success and on failure, so a failed run does not
        # leave the model resident for the next measurement.
        llm = None
        torch.cuda.empty_cache()

# ─── MAIN MEASUREMENT LOOP ────────────────────────────────────────────────────
def main(model_paths=None, output_csv="memory_results.csv"):
    """Benchmark GPU memory for a list of models and write a CSV report.

    For each model the tokenizer and a vLLM engine are loaded, the memory
    allocated after loading is recorded as "static", and the peak reported
    by ``measure_inference_memory`` gives the "dynamic" remainder. Models
    that fail are reported on stdout and skipped.

    Args:
        model_paths: Iterable of model directories to benchmark. Defaults to
            the Qwen2.5 1.5B/3B/7B/14B fp16/int8/int4 checkpoints under
            ``/scratch/mmm9912/models``.
        output_csv: Destination of the CSV report.
    """
    if model_paths is None:
        model_paths = [
            "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-fp16",
            "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int8",
            "/scratch/mmm9912/models/Qwen2.5-1.5B-Instruct-int4",
            "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-fp16",
            "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int8",
            "/scratch/mmm9912/models/Qwen2.5-3B-Instruct-int4",
            "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-fp16",
            "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int8",
            "/scratch/mmm9912/models/Qwen2.5-7B-Instruct-int4",
            "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-fp16",
            "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int8",
            "/scratch/mmm9912/models/Qwen2.5-14B-Instruct-int4",
        ]

    results = []

    for model_path in model_paths:
        print(f"\n{'='*40}\nProcessing: {model_path}\n{'='*40}")
        torch.cuda.empty_cache()
        time.sleep(2)

        # Bound up front so the ``finally`` clause can always release them.
        llm = None
        tokenizer = None
        try:
            # Load tokenizer with Qwen settings
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=True,
                use_fast=False,
                padding_side="left",
                pad_token="<|endoftext|>"
            )

            # Quantization is inferred from the model directory name; the
            # path is normalised so a trailing slash does not give "".
            model_name = os.path.basename(os.path.normpath(model_path))
            is_quantized = any(tag in model_name for tag in ('int4', 'int8'))

            # Engine configuration for inference; no caching parameters are
            # passed here.
            llm_config = {
                "model": model_path,
                "dtype": "float16",
                "quantization": "gptq" if is_quantized else None,
                "enforce_eager": True,
                "tensor_parallel_size": 1,
                "trust_remote_code": True,
                "max_model_len": 2048,
                "download_dir": cache_dir
            }

            # Load model
            torch.cuda.reset_peak_memory_stats()
            llm = LLM(**llm_config)
            # Replace the engine's cache initializer with a no-op.
            # NOTE(review): this happens after the engine is constructed, so
            # it can only affect later calls to ``_init_cache``; confirm it
            # really disables KV caching for the vLLM version in use.
            if hasattr(llm, "llm_engine") and hasattr(llm.llm_engine, "_init_cache"):
                llm.llm_engine._init_cache = lambda: None

            static_mem = torch.cuda.memory_allocated()

            # ``measure_inference_memory`` is defined outside this block and
            # is assumed to return the peak allocation in bytes.
            peak_mem = measure_inference_memory(llm, tokenizer)
            dynamic_mem = peak_mem - static_mem

            results.append({
                "Model": model_name,
                "Static (MB)": static_mem / (1024 ** 2),
                "Dynamic (MB)": dynamic_mem / (1024 ** 2),
                "Total Peak (MB)": peak_mem / (1024 ** 2)
            })

        except Exception as e:
            # Best-effort: report the failure and move on to the next model.
            print(f"Error: {str(e)}")

        finally:
            # Release the model on failure as well as on success; otherwise
            # a model that loaded but failed during inference would stay
            # resident while the next one is being loaded.
            llm = None
            tokenizer = None
            torch.cuda.empty_cache()

    # Save results
    df = pd.DataFrame(results)
    print("\nMemory Usage Report:")
    print(df.to_string(index=False))
    df.to_csv(output_csv, index=False)

# Run the benchmark only when executed as a script or notebook cell, not
# when this module is imported.
if __name__ == "__main__":
    main()
