In [9]:
!pip3 install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [21]:
import argparse
import sys
from tabulate import tabulate
from datetime import datetime


def main(argv=None):
    """Print memory, OOM and latency estimates for a set of LLMs on vGPU profiles.

    Three reports are written to stdout: the memory footprint of every model,
    a warning for every model/profile pair whose footprint exceeds the
    available memory, and a capacity/latency table for every pair.

    Args:
        argv: Optional list of argument strings. With ``None`` (the default)
            argparse reads ``sys.argv[1:]``, so a plain ``main()`` call keeps
            working. Arguments the parser does not recognise are ignored.

    Raises:
        SystemExit: raised by ``parser.error`` when ``--num_gpu`` is below 1,
            when any size/count is negative, or when prompt and response size
            are both 0 (these values would otherwise divide by zero).
    """
    parser = argparse.ArgumentParser(
        description="Estimate LLM memory, capacity, and latency with optional quantization"
    )
    parser.add_argument(
        "--num_gpu", type=int, default=1,
        help="Number of GPUs"
    )
    parser.add_argument(
        "--prompt_size", type=int, default=2000,
        help="Prompt size in tokens"
    )
    parser.add_argument(
        "--response_size", type=int, default=1000,
        help="Response size in tokens"
    )
    parser.add_argument(
        "--n_concurrent_request", type=int, default=1,
        help="Number of concurrent requests"
    )
    parser.add_argument(
        "--quantization", choices=["fp16", "int8"], default="fp16",
        help="Quantization precision (fp16 or int8)"
    )
    # parse_known_args returns unrecognised options instead of exiting on them.
    args, _ = parser.parse_known_args(argv)

    num_gpu = args.num_gpu
    prompt_size = args.prompt_size
    response_size = args.response_size
    n_concurrent_request = args.n_concurrent_request
    quantization = args.quantization

    # Fail early with a usage message: num_gpu and the context window are used as divisors.
    if num_gpu < 1:
        parser.error("--num_gpu must be at least 1")
    if n_concurrent_request < 0:
        parser.error("--n_concurrent_request must not be negative")
    if prompt_size < 0 or response_size < 0:
        parser.error("--prompt_size and --response_size must not be negative")
    if prompt_size + response_size == 0:
        parser.error("--prompt_size and --response_size must not both be 0")

    # The same element size is applied to the weights and to the KV cache.
    bytes_per_param = 2 if quantization == "fp16" else 1

    print(f"Quantization: {quantization}")
    print(f" num_gpu = {num_gpu}, prompt_size = {prompt_size} tokens, response_size = {response_size} tokens")
    print(f" n_concurrent_request = {n_concurrent_request}\n")

    # Define vGPU specs with physical GPU memory to scale performance
    gpu_specs = [
        {"name": "A40-12Q", "fp16_tflops": 299, "memory_gb": 12, "phy_memory_gb": 48, "bandwidth_gbps": 696},
        {"name": "A40-24Q", "fp16_tflops": 299, "memory_gb": 24, "phy_memory_gb": 48, "bandwidth_gbps": 696},
        {"name": "A40-48Q", "fp16_tflops": 299, "memory_gb": 48, "phy_memory_gb": 48, "bandwidth_gbps": 696},
        {"name": "L40-12Q", "fp16_tflops": 362, "memory_gb": 12, "phy_memory_gb": 48, "bandwidth_gbps": 864},
        {"name": "L40-24Q", "fp16_tflops": 362, "memory_gb": 24, "phy_memory_gb": 48, "bandwidth_gbps": 864},
        {"name": "L40-48Q", "fp16_tflops": 362, "memory_gb": 48, "phy_memory_gb": 48, "bandwidth_gbps": 864},
        {"name": "L40S-12Q","fp16_tflops": 366, "memory_gb": 12, "phy_memory_gb": 48, "bandwidth_gbps": 864},
        {"name": "L40S-24Q","fp16_tflops": 366, "memory_gb": 24, "phy_memory_gb": 48, "bandwidth_gbps": 864},
        {"name": "L40S-48Q","fp16_tflops": 366, "memory_gb": 48, "phy_memory_gb": 48, "bandwidth_gbps": 864},
    ]

    model_specs = [
        {"name": "Llama-3-8B",   "params_billion": 8,   "d_model": 4096,   "n_layers": 32},
        {"name": "Llama-3-70B",  "params_billion": 70,  "d_model": 8192,   "n_layers": 80},
        {"name": "Llama-3.1-8B", "params_billion": 8,   "d_model": 4096,   "n_layers": 32},
        {"name": "Llama-3.1-70B","params_billion": 70,  "d_model": 8192,   "n_layers": 80},
        {"name": "Mistral-7B",   "params_billion": 7,   "d_model": 4096,   "n_layers": 32},
        {"name": "Falcon-7B",    "params_billion": 7,   "d_model": 4544,   "n_layers": 32},
        {"name": "Falcon-40B",   "params_billion": 40,  "d_model": 8192,   "n_layers": 60},
        {"name": "Falcon-180B",  "params_billion": 180, "d_model": 14848,  "n_layers": 80},
        {"name": "Qwen-14B",     "params_billion": 14,  "d_model": 5120,   "n_layers": 40},
    ]

    BYTES_IN_GB = 1_073_741_824  # 2**30

    def calc_kv_cache_size_per_token(n_layers, d_model):
        """KV-cache size of one token: a key and a value vector of d_model elements per layer."""
        return 2 * bytes_per_param * n_layers * d_model / BYTES_IN_GB

    def calc_memory_footprint(model, concurrent, context):
        """Weights plus the KV cache of `concurrent` requests of `context` tokens each."""
        # NOTE(review): the weight term is params_billion * bytes (decimal GB) while the
        # KV term is divided by 2**30; the sum mixes the two units.
        kv_size = calc_kv_cache_size_per_token(model["n_layers"], model["d_model"])
        return kv_size * context * concurrent + model["params_billion"] * bytes_per_param

    def calc_kv_cache_tokens(num_gpu, gpu_mem, params_billion, kv_size):
        """Number of tokens whose KV cache fits next to the weights (0 if the weights do not fit)."""
        available = num_gpu * gpu_mem - params_billion * bytes_per_param
        return max(available / kv_size, 0)

    # Scale compute and bandwidth by profile fraction
    def profile_fraction(gpu):
        return gpu["memory_gb"] / gpu["phy_memory_gb"]

    def calc_prefill_time(params_billion, gpu):
        """Prefill time per prompt token; calc_e2e treats it as milliseconds."""
        flops_eff = gpu["fp16_tflops"] * profile_fraction(gpu)
        return (params_billion * bytes_per_param) / flops_eff / num_gpu

    def calc_tpot(params_billion, gpu):
        """Time per output token in milliseconds (hence the * 1000)."""
        bw_eff = gpu["bandwidth_gbps"] * profile_fraction(gpu)
        return (params_billion * bytes_per_param) / bw_eff / num_gpu * 1000

    def calc_e2e(prefill, tpot, in_size, out_size):
        """Seconds needed to prefill in_size tokens and then generate out_size tokens."""
        return (in_size * prefill + out_size * tpot) / 1000

    def fmt(value, spec):
        """Format a number with `spec`; pass string markers such as "OOM" through."""
        # Applying a float format spec to a str raises ValueError.
        return value if isinstance(value, str) else format(value, spec)

    context_window = prompt_size + response_size

    # Memory Footprint
    print("\n=== Memory Footprint ===")
    mem_tbl = []
    for m in model_specs:
        mf = calc_memory_footprint(m, n_concurrent_request, context_window)
        kv = calc_kv_cache_size_per_token(m["n_layers"], m["d_model"])
        mem_tbl.append({
            "Model": m["name"],
            "Memory Footprint (GB)": f"{mf:.2f}",
            "KV Size/token (GiB)": f"{kv:.6f}",
        })
    print(tabulate(mem_tbl, headers="keys", tablefmt="orgtbl"))

    # OOM Warnings
    print("\n=== OOM Warnings ===")
    for m in model_specs:
        # Footprint and KV size depend on the model only, so compute them once per model.
        mf = calc_memory_footprint(m, n_concurrent_request, context_window)
        kv = calc_kv_cache_size_per_token(m["n_layers"], m["d_model"])
        for g in gpu_specs:
            available = num_gpu * g["memory_gb"]
            if mf > available:
                max_tokens = calc_kv_cache_tokens(num_gpu, g["memory_gb"], m["params_billion"], kv)
                max_req = int(max_tokens // context_window)
                print(f"!!!! Warning {m['name']} with {g['name']}: concurrent_requests={n_concurrent_request} causes OOM")
                print(f"Max concurrent_requests: {max_req} (context={context_window} tokens)")

    # Capacity & Latency
    print("\n=== Capacity & Latency ===")
    cap_tbl = []
    for m in model_specs:
        kv = calc_kv_cache_size_per_token(m["n_layers"], m["d_model"])
        for g in gpu_specs:
            max_kv = calc_kv_cache_tokens(num_gpu, g["memory_gb"], m["params_billion"], kv)
            if max_kv < 1:
                # Not even one token of KV cache fits next to the weights, so the
                # model cannot be served on this profile and latency is meaningless.
                ttft = e2e = throughput = "OOM"
            else:
                pre = calc_prefill_time(m["params_billion"], g)
                tpot = calc_tpot(m["params_billion"], g)
                # Time to first token = prefill of the whole prompt + one generated token,
                # converted to seconds the same way as the end-to-end latency.
                ttft = calc_e2e(pre, tpot, prompt_size, 1)
                e2e = calc_e2e(pre, tpot, prompt_size, response_size)
                # e2e > 0 here: both per-token times are positive and the context is non-empty.
                throughput = response_size / e2e
            cap_tbl.append({
                "Model": m["name"],
                "GPU": g["name"],
                "Max KV Tokens": int(max_kv),
                "TTFT (s)": fmt(ttft, ".3f"),
                "E2E Lat (s)": fmt(e2e, ".2f"),
                "Throughput (tok/s)": fmt(throughput, ".2f"),
            })
    print(tabulate(cap_tbl, headers="keys", tablefmt="orgtbl"))


# Entry point: run the estimator only when this code is executed as the main module.
if __name__ == "__main__":
    main()


Quantization: fp16
 num_gpu = 1, prompt_size = 2000 tokens, response_size = 1000 tokens
 n_concurrent_request = 1


=== Memory Footprint ===
| Model         |   Memory Footprint (GB) |   KV Size/token (GiB) |
|---------------+-------------------------+-----------------------|
| Llama-3-8B    |                   17.46 |              0.000488 |
| Llama-3-70B   |                  147.32 |              0.002441 |
| Llama-3.1-8B  |                   17.46 |              0.000488 |
| Llama-3.1-70B |                  147.32 |              0.002441 |
| Mistral-7B    |                   15.46 |              0.000488 |
| Falcon-7B     |                   15.63 |              0.000542 |
| Falcon-40B    |                   85.49 |              0.001831 |
| Falcon-180B   |                  373.28 |              0.004425 |
| Qwen-14B      |                   30.29 |              0.000763 |

Max concurrent_requests: 0 (context=3000 tokens)
Max concurrent_requests: 0 (context=3000 tokens)
Max concurre

In [2]:
import re
from difflib import get_close_matches

# Define valid options
VALID_MODELS = [
    "Llama-3-8B", "Llama-3-70B", "Llama-3.1-8B", "Llama-3.1-70B",
    "Mistral-7B", "Falcon-7B", "Falcon-40B", "Falcon-180B", "Qwen-14B"
]
VALID_PRECISIONS = ["fp16", "int8"]


def _has_size_word(word_pattern: str, query: str) -> bool:
    """Return True if `word_pattern` occurs in `query` as a free-standing size word.

    A hit that directly follows a word character, "-" or "/" is rejected, so the
    "large" inside an identifier such as "nvolveqa-embed-large-1B" does not count.
    Matching is case-insensitive.
    """
    return re.search(rf"(?<![\w/-]){word_pattern}\b", query, re.IGNORECASE) is not None


def parse_vgpu_query(query: str) -> dict:
    """Parse a natural language vGPU configuration query.

    Args:
        query: Free-text request.

    Returns:
        A dict with the keys "Workload", "Model", "Concurrent Users" and
        "Precision". "Workload" is None when no known workload is mentioned;
        the other three fall back to "Llama-3-8B", 1 and "fp16".
    """
    result = {
        "Workload": None,
        "Model": None,
        "Concurrent Users": None,
        "Precision": None
    }

    # Workload detection; "LLM Inference" is tried before the shorter "Inference".
    for workload in ["RAG", "LLM Inference", "Inference"]:
        if re.search(rf"\b{re.escape(workload)}\b", query, re.IGNORECASE):
            result["Workload"] = workload
            break

    # 1) Explicit model mention
    for model in VALID_MODELS:
        if re.search(rf"\b{re.escape(model)}\b", query, re.IGNORECASE):
            result["Model"] = model
            break

    # 2) Size-based fallback mapping (keyword driven; first match wins)
    if not result["Model"]:
        # small: an explicit "< 7b" or the word 'small'
        if re.search(r"<\s*7\s*[bB]", query) or _has_size_word("small", query):
            result["Model"] = "Mistral-7B"
        # medium: the word 'medium'
        elif _has_size_word("medium", query):
            result["Model"] = "Llama-3-8B"
        # extra large: 'extra' directly in front of 'large' ("extra large", "extra-large")
        elif _has_size_word(r"extra[\s-]*large", query):
            result["Model"] = "Llama-3.1-70B"
        # large: the word 'large'
        elif _has_size_word("large", query):
            result["Model"] = "Falcon-40B"

    # 3) Concurrent users. The leading \b keeps digits that belong to another token
    #    (e.g. "A100") out, the trailing \b rejects "username", and commas allow "1,000".
    user_match = re.search(
        r"\b(\d[\d,]*)\s*(?:concurrent|simultaneous)?\s*users?\b", query, re.IGNORECASE
    )
    if user_match:
        result["Concurrent Users"] = int(user_match.group(1).replace(",", ""))

    # 4) Precision: any entry of VALID_PRECISIONS, case-insensitive
    precision_alternatives = "|".join(re.escape(p) for p in VALID_PRECISIONS)
    prec_match = re.search(rf"\b({precision_alternatives})\b", query, re.IGNORECASE)
    if prec_match:
        result["Precision"] = prec_match.group(1).lower()

    # 5) Defaults for everything that was not specified (a user count of 0 also becomes 1)
    if not result["Precision"]:
        result["Precision"] = "fp16"
    if not result["Model"]:
        result["Model"] = "Llama-3-8B"
    if not result["Concurrent Users"]:
        result["Concurrent Users"] = 1

    return result

# Example usage: run the parser over a few representative requests and show
# each request followed by the parsed result and a blank separator line.
queries = [
    "I need a vGPU configuration for RAG using available GPU inventory: 1x NVIDIA L40S running Llama-3-8B using embedding model nvidia/nvolveqa-embed-large-1B with FP16 precision.",
    "I need a vGPU configuration for RAG with small (< 7b parameters) using available GPU inventory: 1x NVIDIA L40S using embedding model nvidia/nvolveqa-embed-large-1B using triton with FP16 precision.",
    "Please set up a RAG pipeline for large models with INT8 precision for 3 simultaneous users.",
    "I need a vGPU configuration for RAG with extra large models using 2 concurrent users and int8 precision.",
    "I need a vGPU configuration for RAG using available GPU inventory: 1x NVIDIA L40S running Llama-3.1-70B using embedding model nvidia/nvolveqa-embed-large-1B with FP16 precision.",
]

for q in queries:
    # One print call emits the query, the parsed dict and an empty line.
    print(q, parse_vgpu_query(q), "", sep="\n")


I need a vGPU configuration for RAG using available GPU inventory: 1x NVIDIA L40S running Llama-3-8B using embedding model nvidia/nvolveqa-embed-large-1B with FP16 precision.
{'Workload': 'RAG', 'Model': 'Llama-3-8B', 'Concurrent Users': 1, 'Precision': 'fp16'}

I need a vGPU configuration for RAG with small (< 7b parameters) using available GPU inventory: 1x NVIDIA L40S using embedding model nvidia/nvolveqa-embed-large-1B using triton with FP16 precision.
{'Workload': 'RAG', 'Model': 'Mistral-7B', 'Concurrent Users': 1, 'Precision': 'fp16'}

Please set up a RAG pipeline for large models with INT8 precision for 3 simultaneous users.
{'Workload': 'RAG', 'Model': 'Falcon-40B', 'Concurrent Users': 3, 'Precision': 'int8'}

I need a vGPU configuration for RAG with extra large models using 2 concurrent users and int8 precision.
{'Workload': 'RAG', 'Model': 'Llama-3.1-70B', 'Concurrent Users': 2, 'Precision': 'int8'}

I need a vGPU configuration for RAG using available GPU inventory: 1x NVIDI