# 🏥 Medical Model Optimization + Mixture of Experts  
## ⚙️ Notebook 2: Optimization

**Authors:**  
- Dan Harvey  
- Xinzhuo Jiang  

**Affiliation:**  
*High-Performance Machine Learning (HPML)*  
*Columbia University*


---

### 🔍 Project Overview

In this section, we evaluate how quantization affects model load time, memory usage, and inference speed.


### 🎯 Objectives

- Load `Llama-3-8B-UltraMedical` in FP16, 8-bit, and 4-bit quantized formats
- Time each loading operation
- Profile memory usage using `nvidia-smi`
- Compare load time, memory, and performance tradeoffs

We use Hugging Face’s quantization options and BitsAndBytes for INT8/INT4 support.

In [1]:
## 📦 Environment Setup: Dependencies and Imports

import torch
import time
import os
import subprocess
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import gc
import sys
import importlib
!pip install -U bitsandbytes



In [2]:
# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Required packages
required_packages = [
    'torch', 'transformers', 'datasets', 'accelerate', 'flash_attn',
    'evaluate', 'lm_eval', 'sklearn', 'matplotlib', 'wandb',
    'tqdm', 'sentencepiece', 'scipy', 'einops'
]

# Check and install missing packages
for package in required_packages:
    try:
        module = importlib.import_module(package)
        print(f"✅ {package} installed successfully")
        if package == 'torch':
            print(f"   Version: {torch.__version__}")
            print(f"   CUDA available: {torch.cuda.is_available()}")
            if torch.cuda.is_available():
                print(f"   CUDA version: {torch.version.cuda}")
                print(f"   GPU: {torch.cuda.get_device_name(0)}")
        elif hasattr(module, '__version__'):
            print(f"   Version: {module.__version__}")
    except ImportError:
        print(f"❌ {package} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        module = importlib.import_module(package)
        print(f"✅ {package} installed successfully (post-install)")
        if hasattr(module, '__version__'):
            print(f"   Version: {module.__version__}")

# You may need to restart the Kernel to use these



✅ torch installed successfully
   Version: 2.6.0+cu124
   CUDA available: True
   CUDA version: 12.4
   GPU: NVIDIA A100-SXM4-40GB
✅ transformers installed successfully
   Version: 4.51.3
✅ datasets installed successfully
   Version: 3.5.1
✅ accelerate installed successfully
   Version: 1.6.0
✅ flash_attn installed successfully
   Version: 2.7.4.post1
✅ evaluate installed successfully
   Version: 0.4.3
✅ lm_eval installed successfully
✅ sklearn installed successfully
   Version: 1.6.1
✅ matplotlib installed successfully
   Version: 3.10.0
✅ wandb installed successfully
   Version: 0.19.10
✅ tqdm installed successfully
   Version: 4.67.1
✅ sentencepiece installed successfully
   Version: 0.2.0
✅ scipy installed successfully
   Version: 1.15.2
✅ einops installed successfully
   Version: 0.8.1


In [3]:
# Load section dependencies
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc

In [4]:
# 🔐 Hugging Face Access - Llama is Gated
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read)

In [5]:
# Import section dependencies
import platform
import psutil
import distro
import numpy as np

# ==========================
# 🖥️ System & OS Information
# ==========================
system_info = platform.uname()

print("🖥️ System Information")
print("-" * 40)
print(f"Node Name      : {system_info.node}")
print(f"System         : {platform.system()}")
print(f"OS Flavor      : {distro.name()}")
print(f"OS Version     : {distro.version()}")
print(f"Release        : {system_info.release}")
print(f"Architecture   : {platform.machine()}")
print(f"Python Version : {platform.python_version()}")

# =====================
# 🧠 CPU Information
# =====================
cpu_count = psutil.cpu_count(logical=False)
logical_cpu_count = psutil.cpu_count(logical=True)

print("\n🧠 CPU Information")
print("-" * 40)
print(f"Processor      : {system_info.processor or platform.processor()}")
print(f"Physical Cores : {cpu_count}")
print(f"Logical Cores  : {logical_cpu_count}")

# ======================
# 🧠 Memory Information
# ======================
memory_info = psutil.virtual_memory()

print("\n🧠 Memory Information")
print("-" * 40)
print(f"Total RAM      : {memory_info.total / 1024 ** 3:.2f} GB")
print(f"Available RAM  : {memory_info.available / 1024 ** 3:.2f} GB")
print(f"Used RAM       : {memory_info.used / 1024 ** 3:.2f} GB")

# =======================
# 💾 Disk Information
# =======================
disk_info = psutil.disk_usage('/')

print("\n💾 Disk Information")
print("-" * 40)
print(f"Total Space    : {disk_info.total / 1024 ** 3:.2f} GB")
print(f"Used Space     : {disk_info.used / 1024 ** 3:.2f} GB")
print(f"Free Space     : {disk_info.free / 1024 ** 3:.2f} GB")

# =======================
# 🧠 GPU Information
# =======================

print("\n🧠 GPU Info")
print("GPU:", torch.cuda.get_device_name(0))
print("CUDA Available:", True)

🖥️ System Information
----------------------------------------
Node Name      : 62dbed31d1c7
System         : Linux
OS Flavor      : Ubuntu
OS Version     : 22.04
Release        : 6.1.123+
Architecture   : x86_64
Python Version : 3.11.12

🧠 CPU Information
----------------------------------------
Processor      : x86_64
Physical Cores : 6
Logical Cores  : 12

🧠 Memory Information
----------------------------------------
Total RAM      : 83.48 GB
Available RAM  : 80.51 GB
Used RAM       : 2.10 GB

💾 Disk Information
----------------------------------------
Total Space    : 235.68 GB
Used Space     : 63.26 GB
Free Space     : 172.40 GB

🧠 GPU Info
GPU: NVIDIA A100-SXM4-40GB
CUDA Available: True


## 🦙 Llama-3-8B-UltraMedical

**Links**  
- 🤗 [Hugging Face Model Card](https://huggingface.co/TsinghuaC3I/Llama-3-8B-UltraMedical)
- 📄 [Paper / Source](https://huggingface.co/TsinghuaC3I/Llama-3-8B-UltraMedical)

**Approximate GPU Memory Requirements:**
- **FP32**: ~32.4 GB  
- **FP16**: ~48 GB  
- **INT8**: ~24 GB  
- **INT4**: ~12 GB  

> These values are estimates and may vary based on sequence length, attention optimizations, and tokenizer overhead.


### 🦙 Llama-3-8B-UltraMedical

**🔍 FP16 Quantization (float16)**


In [9]:
#Llama-3-8B-UltraMedical

model_name = "TsinghuaC3I/Llama-3-8B-UltraMedical"

# Load tokenizer once (doesn’t affect model loading time)
tokenizer = AutoTokenizer.from_pretrained("TsinghuaC3I/Llama-3-8B-UltraMedical",
    trust_remote_code=True,
    use_auth_token=True
)

model = AutoModelForCausalLM.from_pretrained("TsinghuaC3I/Llama-3-8B-UltraMedical",
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float32,
    use_auth_token=True
)

print("✅ Loaded Llama-3-8B-UltraMedical (FP16, device-mapped)")

print("\n📦 GPU Memory Snapshot:")
print(subprocess.getoutput("nvidia-smi"))

# Cleanup
del model
del tokenizer

gc.collect()
torch.cuda.empty_cache()
time.sleep(5)
gc.collect()
torch.cuda.empty_cache()



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Loaded Llama-3-8B-UltraMedical (FP16, device-mapped)

📦 GPU Memory Snapshot:
Wed May  7 04:45:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             57W /  400W |   33139MiB /  40960MiB |     40%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------

In [11]:
#model_name = "TsinghuaC3I/Llama-3-8B-UltraMedical"

# Load tokenizer once (doesn’t affect model loading time)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_auth_token=True
)

load_times = []

trials = 5

print(f"⏳ Starting timed model loads ({trials} repetitions)...\n")

for i in range(trials):
    start_time = time.monotonic()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        torch_dtype=torch.float32,
        use_auth_token=True
    )

    elapsed = time.monotonic() - start_time
    load_times.append(elapsed)
    print(f"✅ Run {i + 1}: Loaded in {elapsed:.2f} seconds")

    if i == trials - 1:
        print(f"\n📦 GPU Memory Snapshot:")
        print(subprocess.getoutput("nvidia-smi"))

    # Clean up between runs (free GPU memory)
    del model
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(5)
    gc.collect()
    torch.cuda.empty_cache()

# Summary stats
mean_time = np.mean(load_times)
std_dev_time = np.std(load_times)

print(f"\n📊 {model_name} Load Time Summary (FP32)")
print(f"- Average Load Time: {mean_time:.2f} seconds")
print(f"- Std Dev:           {std_dev_time:.2f} seconds")



⏳ Starting timed model loads (5 repetitions)...





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 1: Loaded in 4.89 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 2: Loaded in 4.82 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 3: Loaded in 4.83 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 4: Loaded in 4.83 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 5: Loaded in 4.81 seconds

📦 GPU Memory Snapshot:
Wed May  7 04:49:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             57W /  400W |   33139MiB /  40960MiB |     38%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------

In [14]:
# Benchmark fp16
import random
import json
import wandb
import subprocess
import time
import os
from datetime import datetime

# -----------------------------
# 🧠 Model and Task Config
# -----------------------------
model_name = "meta-llama/Llama-3.2-3B"
task_name = "pubmedqa"
output_base = "./results"

# -----------------------------
# 🚀 Start W&B run
# -----------------------------
run_name = f"{model_name.replace('/', '_')}_{task_name}_5x"
wandb_run = wandb.init(
    project="med-moe-baseline-evals",
    name=run_name,
    config={
        "model": model_name,
        "task": task_name,
        "batch_size": 8,
        "precision": "fp16",
        "eval_method": "lm_eval",
        "repeats": 5
    }
)

# -----------------------------
# 🔁 Run 5x Evaluation Loop
# -----------------------------
for i in range(5):
    print(f"\n🔁 Run {i + 1}/5")

    # Create timestamped output folder
    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    run_output_dir = os.path.join(output_base, f"run_{i+1}_{timestamp}")
    os.makedirs(run_output_dir, exist_ok=True)

    # Define lm_eval command
    command = [
        "lm_eval",
        "--model", "hf",
        "--tasks", task_name,
        "--model_args", f"pretrained={model_name},parallelize=True",
        "--device", "cuda:0",
        "--batch_size", "8",
        "--write_out",
        "--output_path", run_output_dir,
        "--trust_remote_code",
        "--confirm_run_unsafe_code"
    ]

    # Start timing
    start_time = time.monotonic()
    result = subprocess.run(command, capture_output=True, text=True)
    elapsed = time.monotonic() - start_time

    print(f"✅ Run {i + 1} completed in {elapsed:.2f} seconds")
    print("STDOUT:\n", result.stdout)

    # -----------------------------
    # 📊 Find and parse result file
    # -----------------------------
    result_file = None
    for fname in os.listdir(run_output_dir):
        if fname.startswith("eval_results") and fname.endswith(".json"):
            result_file = os.path.join(run_output_dir, fname)
            break

    if result_file is None:
        print(f"❌ No eval_results_*.json found in {run_output_dir}")
        continue


    wandb_run.log({

        f"{task_name}/eval_time_sec": elapsed,
        "run_index": i + 1
    })
    print(f"📈 Logged to W&B: acc={acc:.3f}, stderr={stderr:.4f}")


# -----------------------------
# ✅ Finish W&B run
# -----------------------------
wandb_run.finish()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdyh2111[0m ([33mmed-moe[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



🔁 Run 1/5
✅ Run 1 completed in 67.19 seconds
STDOUT:
 hf (pretrained=meta-llama/Llama-3.2-3B,parallelize=True,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 8
| Tasks  |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|--------|------:|------|-----:|------|---|----:|---|-----:|
|pubmedqa|      1|none  |     0|acc   |↑  |0.732|±  |0.0198|


❌ No eval_results_*.json found in ./results/run_1_2025-05-07T04-57-45

🔁 Run 2/5
✅ Run 2 completed in 38.86 seconds
STDOUT:
 hf (pretrained=meta-llama/Llama-3.2-3B,parallelize=True,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 8
| Tasks  |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
|--------|------:|------|-----:|------|---|----:|---|-----:|
|pubmedqa|      1|none  |     0|acc   |↑  |0.732|±  |0.0198|


❌ No eval_results_*.json found in ./results/run_2_2025-05-07T04-58-52

🔁 Run 3/5
✅ Run 3 completed in 38.73 seconds
STDOUT:
 hf (pretrained=meta-llama/Llama

FP8

In [7]:
model_name = "TsinghuaC3I/Llama-3-8B-UltraMedical"

from transformers import BitsAndBytesConfig

# 8-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load tokenizer once (doesn’t affect model loading time)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_auth_token=True
)

load_times = []

trials = 5

print(f"⏳ Starting timed model loads ({trials} repetitions)...\n")

for i in range(trials):
    start_time = time.monotonic()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
        use_auth_token=True
    )

    elapsed = time.monotonic() - start_time
    load_times.append(elapsed)
    print(f"✅ Run {i + 1}: Loaded in {elapsed:.2f} seconds")

    if i == trials - 1:
        print(f"\n📦 GPU Memory Snapshot:")
        print(subprocess.getoutput("nvidia-smi"))

    # Clean up between runs (free GPU memory)
    del model
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(5)
    gc.collect()
    torch.cuda.empty_cache()

# Summary stats
mean_time = np.mean(load_times)
std_dev_time = np.std(load_times)

print(f"\n📊 {model_name} Load Time Summary (FP8)")
print(f"- Average Load Time: {mean_time:.2f} seconds")
print(f"- Std Dev:           {std_dev_time:.2f} seconds")

⏳ Starting timed model loads (5 repetitions)...



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 1: Loaded in 17.12 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 2: Loaded in 17.23 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 3: Loaded in 17.26 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 4: Loaded in 17.08 seconds


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Run 5: Loaded in 17.29 seconds

📦 GPU Memory Snapshot:
Wed May  7 05:12:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             57W /  400W |   10393MiB /  40960MiB |     22%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+---------------

In [9]:
# Benchmark fp16
import random
import json
import wandb
import subprocess
import time
import os
from datetime import datetime

# -----------------------------
# 🧠 Model and Task Config
# -----------------------------
model_name = "meta-llama/Llama-3.2-3B"
task_name = "pubmedqa"
output_base = "./results"

# 8-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# -----------------------------
# 🚀 Start W&B run
# -----------------------------
run_name = f"{model_name.replace('/', '_')}_{task_name}_fp8_5x"
wandb_run = wandb.init(
    project="med-moe-baseline-evals",
    name=run_name,
    config={
        "model": model_name,
        "task": task_name,
        "batch_size": 8,
        "precision": "fp8",
        "eval_method": "lm_eval",
        "repeats": 5
    }
)

# -----------------------------
# 🔁 Run 5x Evaluation Loop
# -----------------------------
for i in range(5):
    print(f"\n🔁 Run {i + 1}/5")

    # Create timestamped output folder
    timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    run_output_dir = os.path.join(output_base, f"run_{i+1}_{timestamp}")
    os.makedirs(run_output_dir, exist_ok=True)

    # Define lm_eval command
    command = [
        "lm_eval",
        "--model", "hf",
        "--tasks", task_name,
        "--model_args", f"pretrained={model_name},load_in_8bit=True,use_accelerate=True",
        "--device", "cuda:0",
        "--batch_size", "8",
        "--write_out",
        "--output_path", run_output_dir,
        "--trust_remote_code",
        "--confirm_run_unsafe_code"
    ]

    # Start timing
    start_time = time.monotonic()
    result = subprocess.run(command, capture_output=True, text=True)
    elapsed = time.monotonic() - start_time

    print(f"✅ Run {i + 1} completed in {elapsed:.2f} seconds")
    print("STDOUT:\n", result.stdout)

    # -----------------------------
    # 📊 Find and parse result file
    # -----------------------------
    result_file = None
    for fname in os.listdir(run_output_dir):
        if fname.startswith("eval_results") and fname.endswith(".json"):
            result_file = os.path.join(run_output_dir, fname)
            break

    if result_file is None:
        print(f"❌ No eval_results_*.json found in {run_output_dir}")
        continue


    wandb_run.log({

        f"{task_name}/eval_time_sec": elapsed,
        "run_index": i + 1
    })
    print(f"📈 Logged to W&B: acc={acc:.3f}, stderr={stderr:.4f}")


# -----------------------------
# ✅ Finish W&B run
# -----------------------------
wandb_run.finish()



🔁 Run 1/5
✅ Run 1 completed in 24.00 seconds
STDOUT:
 
❌ No eval_results_*.json found in ./results/run_1_2025-05-07T05-18-20

🔁 Run 2/5


KeyboardInterrupt: 

4 bit