In [1]:
import torch
from model_loader import ShardedFP8ModelLoader
import time
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Helper function to print memory usage

def print_memory_usage(device=None):
    """Print PyTorch's CUDA memory counters for one device, in megabytes.

    Two figures are printed, each divided by 1e6 (decimal MB) and shown with
    two decimals: ``torch.cuda.memory_allocated`` and
    ``torch.cuda.memory_reserved``.

    Args:
        device: Optional CUDA device to query (anything the ``torch.cuda``
            memory functions accept, e.g. an index). ``None`` (the default)
            queries the current device and prints exactly the two unprefixed
            lines the zero-argument version printed. When a device is given,
            each line is prefixed with ``[device <device>]`` so reports for
            several GPUs can be told apart.
    """
    allocated = torch.cuda.memory_allocated(device)
    reserved = torch.cuda.memory_reserved(device)
    # Keep the default output byte-identical; only label explicit queries.
    prefix = "" if device is None else f"[device {device}] "
    print(f"{prefix}Memory Allocated: {allocated / 1e6:.2f} MB")
    print(f"{prefix}Memory Reserved: {reserved / 1e6:.2f} MB")

In [3]:
# Initialize FP16 Loader

# NOTE(review): machine-specific absolute path. Confirm it points at a
# directory that actually contains the checkpoint shards before trusting the
# timing below.
fp16_model_dir = "/home/jz3607/.llama/checkpoints/Llama3.1-8B-Instruct"
fp16_device_ids = [0, 1]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
fp16_loader = ShardedFP8ModelLoader(
    model_dir=fp16_model_dir,
    device_ids=fp16_device_ids,
    memory_efficient=True,
    fp8_format=None  # None for FP16
)
fp16_loader.load_model()
# CUDA work is queued asynchronously, so transfers may still be in flight when
# load_model() returns. Wait for every target GPU before stopping the clock.
for device_id in fp16_device_ids:
    torch.cuda.synchronize(device_id)
fp16_time = time.perf_counter() - start_time

print(f"FP16 Model Loading Time: {fp16_time:.2f} seconds")
print("FP16 Memory Usage:")
print_memory_usage()

Initializing ByteVerification...
Found 0 shards: []
Error loading model: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 21.95 GiB of which 40.12 MiB is free. Including non-PyTorch memory, this process has 21.90 GiB memory in use. Of the allocated memory 21.72 GiB is allocated by PyTorch, and 1.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 21.95 GiB of which 40.12 MiB is free. Including non-PyTorch memory, this process has 21.90 GiB memory in use. Of the allocated memory 21.72 GiB is allocated by PyTorch, and 1.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Initialize FP8 Loader

# FP8Format is used below but is not imported by the notebook's import cell,
# so this cell would otherwise raise NameError on a fresh kernel.
# NOTE(review): assumed to be exported by model_loader alongside
# ShardedFP8ModelLoader -- confirm the module and move this to the import cell.
from model_loader import FP8Format

fp8_model_dir = "./Meta-Llama-3.1-8B-Instruct-FP8"
fp8_device_ids = [0, 1]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
fp8_loader = ShardedFP8ModelLoader(
    model_dir=fp8_model_dir,
    device_ids=fp8_device_ids,
    memory_efficient=True,
    fp8_format=FP8Format(e4m3=True)
)
fp8_loader.load_model()
# CUDA work is queued asynchronously, so transfers may still be in flight when
# load_model() returns. Wait for every target GPU before stopping the clock.
for device_id in fp8_device_ids:
    torch.cuda.synchronize(device_id)
fp8_time = time.perf_counter() - start_time

print(f"FP8 Model Loading Time: {fp8_time:.2f} seconds")
print("FP8 Memory Usage:")
# NOTE(review): if the FP16 loader from the previous cell is still alive, the
# figures printed here include its memory as well as the FP8 model's.
print_memory_usage()

In [None]:
# Side-by-side recap of the two load timings measured in the cells above.
timing_report = "\n".join(
    [
        f"FP16 Loading Time: {fp16_time:.2f} seconds",
        f"FP8 Loading Time: {fp8_time:.2f} seconds",
    ]
)
print(timing_report)

In [None]:
# Run a forward pass for both FP16 and FP8
# NOTE(review): a (1, 3, 224, 224) float tensor is an image-style input, while
# the checkpoints loaded above are Llama language models. Confirm what
# `loader.model` actually expects (e.g. integer token ids) before relying on
# this comparison.
torch.manual_seed(0)  # Same example input on every run
input_data = torch.randn(1, 3, 224, 224, device='cuda')  # Example input tensor

# Inference only: no_grad() stops autograd from recording the forward pass, so
# no activations are kept alive for a backward pass that never happens.
with torch.no_grad():
    # FP16 Forward Pass
    fp16_output = fp16_loader.model(input_data)

    # FP8 Forward Pass
    fp8_output = fp8_loader.model(input_data)

# Upcast to float32 on the CPU before converting: NumPy has no bfloat16/FP8
# dtype, so .numpy() on such tensors would raise.
fp16_values = fp16_output.detach().flatten().float().cpu().numpy()
fp8_values = fp8_output.detach().flatten().float().cpu().numpy()

# Two overlaid curves hide small deviations; report one number as well.
print(f"Max abs difference: {abs(fp16_values - fp8_values).max():.6f}")

# Visualize differences
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fp16_values, label="FP16 Output")
ax.plot(fp8_values, label="FP8 Output")
ax.legend()
ax.set_title("Output Comparison between FP16 and FP8")
plt.show()

In [None]:
results = {
    "Precision": ["FP16", "FP8"],
    "Loading Time (s)": [fp16_time, fp8_time],
    "Memory Allocated (MB)": [
        torch.cuda.memory_allocated(device=0) / 1e6,
        torch.cuda.memory_allocated(device=1) / 1e6
    ],
    "Memory Reserved (MB)": [
        torch.cuda.memory_reserved(device=0) / 1e6,
        torch.cuda.memory_reserved(device=1) / 1e6
    ]
}

df_results = pd.DataFrame(results)
print(df_results)

In [None]:
# Plot comparison

df_results.plot(x="Precision", y=["Loading Time (s)", "Memory Allocated (MB)", "Memory Reserved (MB)"], kind="bar", figsize=(12, 6))
plt.title("FP16 vs FP8 Comparison")
plt.ylabel("Value")
plt.show()