-
Notifications
You must be signed in to change notification settings - Fork 341
Description
When running a script to observe GPU utilization across the different target backends available in CUDA-Q, I noticed that GPU memory utilization is always capped at 50% of the available GPU memory when using the tensornet target. This occurs regardless of the number of qubits or shots. I suspect this may be due to a built-in memory cap or allocation setting.
I would like to confirm:
- If this 50% GPU memory usage cap is a default behavior of CUDA-Q or its underlying libraries.
- Whether this cap can be overridden to enable the utilization of the full GPU memory.
Below is the script used for testing, along with the environment configuration and relevant package versions.
import cudaq
import numpy as np
import random
import time
import os
import psutil
from tqdm import tqdm # For progress bar
import json # For saving results
import sys
# Seed both the stdlib and NumPy PRNGs so gate choices (and any NumPy
# sampling) are reproducible across runs.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)
# Function to restart the kernel
# def restart_kernel():
# process = psutil.Process(os.getpid())
# for handle in process.open_files() + process.connections():
# try:
# os.close(handle.fd)
# except Exception:
# pass
# os.execl(sys.executable, sys.executable, *sys.argv)
# CUDA-Q targets to benchmark. Entries carrying an "option" key pass it to
# cudaq.set_target(); the commented entries can be re-enabled as needed.
targets = [
    # {"name": "nvidia_fp32", "target": "nvidia", "option": "fp32"},
    # {"name": "nvidia_fp64", "target": "nvidia", "option": "fp64"},
    # {"name": "nvidia_mgpu_fp64", "target": "nvidia", "option": "mgpu,fp64"},
    {"name": "tensornet", "target": "tensornet"},
    # {"name": "tensornet_mps", "target": "tensornet-mps"},
    # {"name": "qpp_cpu", "target": "qpp-cpu"},
]
# Single-qubit gate names drawn between entangling layers.
# NOTE(review): "t"/"tdg" are not Clifford gates despite the function name —
# confirm whether they are intended here.
_SINGLE_QUBIT_GATES = ("x", "y", "z", "h", "s", "sdg", "t", "tdg")


def generate_random_clifford():
    """Return the name of a uniformly random single-qubit gate."""
    return random.choice(_SINGLE_QUBIT_GATES)
# Builder for a quantum-volume-style benchmark circuit.
def create_qv_kernel(num_qubits):
    """Build a CUDA-Q kernel with `num_qubits` layers of CZ entanglers
    interleaved with random single-qubit gates, measuring every qubit.

    The number and order of `generate_random_clifford()` calls is fixed by
    `num_qubits`, so seeding `random` makes the circuit reproducible.
    """
    kernel = cudaq.make_kernel()
    qubits = kernel.qalloc(num_qubits)

    # Map each gate name to the op that applies it to one qubit.
    # NOTE(review): "sdg"/"tdg" are realized as rz rotations (equal up to
    # global phase) — presumably because the builder lacks those gates;
    # confirm this is intended.
    apply_gate = {
        "x": kernel.x,
        "y": kernel.y,
        "z": kernel.z,
        "h": kernel.h,
        "s": kernel.s,
        "sdg": lambda q: kernel.rz(-np.pi / 2, q),
        "t": kernel.t,
        "tdg": lambda q: kernel.rz(-np.pi / 4, q),
    }

    for _layer in range(num_qubits):
        # Entangle neighbouring pairs (0,1), (2,3), ...
        for left in range(0, num_qubits - 1, 2):
            kernel.cz(qubits[left], qubits[left + 1])
        # One random single-qubit gate per qubit.
        for idx in range(num_qubits):
            apply_gate[generate_random_clifford()](qubits[idx])

    # Terminal measurement on every qubit.
    for idx in range(num_qubits):
        kernel.mz(qubits[idx])
    return kernel
# --- Simulation settings ---
num_shots = 1024               # measurement shots per cudaq.sample call
qubit_range = range(155, 156)  # qubit counts to sweep (currently only 155)
num_repeats = 1                # repetitions folded into the running average
# Per-target results, keyed by the target's display name.
results = {}
# --- Simulation loop ---
# For each target, sweep the qubit range, time cudaq.sample, and persist
# results incrementally so a crash loses no data.
for target in tqdm(targets, desc="Targets", unit="target"):
    target_name = target["name"]
    results[target_name] = {
        "qubit_counts": [],
        "moving_avg_times": [],
    }
    for num_qubits in tqdm(qubit_range, desc=f"{target_name} qubits",
                           unit="qubits", leave=False):
        # Skip qubit counts known to exceed the capacity of the
        # state-vector / CPU backends.
        if target_name in ("nvidia_fp64", "nvidia_mgpu_fp64") and num_qubits >= 30:
            print(f"Cannot simulate {num_qubits} qubits on target {target_name}.")
            continue
        if target_name in ("nvidia_fp32", "qpp_cpu") and num_qubits > 30:
            print(f"Not simulatable for {num_qubits} qubits on target {target_name}.")
            continue

        # BUG FIX: the original dispatch compared target_name against
        # hyphenated names ("tensornet-mps", "qpp-cpu") that can never match
        # the underscore names defined in `targets` ("tensornet_mps",
        # "qpp_cpu"), so re-enabling those entries fell into the else branch
        # and raised KeyError on the missing "option" key. Dispatch on the
        # presence of "option" instead.
        option = target.get("option")
        if option is None:
            cudaq.set_target(target["target"])
        else:
            cudaq.set_target(target["target"], option=option)

        cudaq.set_random_seed(SEED)
        kernel = create_qv_kernel(num_qubits)

        simulation_times = []
        moving_avg = 0.0
        for i in range(num_repeats):
            start_time = time.time()
            cudaq.sample(kernel, shots_count=num_shots)
            end_time = time.time()

            current_time = end_time - start_time
            simulation_times.append(current_time)
            # Incremental running average over the repetitions so far.
            moving_avg = (i * moving_avg + current_time) / (i + 1)

            # Persist every repetition immediately, appending to any data
            # from earlier runs of the script.
            file_path = (
                f"simulations_results_immediate_{target_name}_{num_qubits}.json"
            )
            if os.path.exists(file_path):
                with open(file_path, "r") as f:
                    data = json.load(f)
            else:
                data = {"results": []}
            data["results"].append({
                "num_qubits": num_qubits,
                "iteration": i + 1,
                "moving_avg_time": moving_avg,
                "simulation_time": current_time,
            })
            with open(file_path, "w") as f:
                json.dump(data, f, indent=4)

        # Plain mean over all repetitions (the original comment called this
        # a "moving average"; it equals the final running-average value).
        moving_avg_time = np.mean(simulation_times)
        results[target_name]["qubit_counts"].append(num_qubits)
        results[target_name]["moving_avg_times"].append(moving_avg_time)

    # Save per-target results after each target completes.
    with open(f"simulation_results_{target_name}.json", "w") as f:
        json.dump(results[target_name], f, indent=4)
    print(f"Completed simulations for target {target_name}. "
          f"Results saved to simulation_results_{target_name}.json.")
# Persist the aggregated results for every target in one final file.
final_payload = json.dumps(results, indent=4)
with open("simulation_results.json", "w") as f:
    f.write(final_payload)
print("Simulation completed. Results saved to simulation_results.json.")

Environment Details
| Component | Version |
|---|---|
| Python | 3.11.11 |
| CUDA Version | 12.2 |
| cuda-quantum-cu12 | 0.9.1 |
| cudaq | 0.9.1 |
| cudensitymat-cu12 | 0.0.5 |
| cupy-cuda12x | 13.3.0 |
| cuquantum-cu12 | 24.11.0 |
| cuquantum-python-cu12 | 24.11.0 |
| custatevec-cu12 | 1.7.0 |
| cutensor-cu12 | 2.0.2 |
| cutensornet-cu12 | 2.6.0 |
Steps to reproduce
- Prepare an environment with the dependencies and versions listed above.
- Run the provided script.
- Monitor GPU memory utilization during simulation using a tool like nvidia-smi.
Expected Behavior
GPU memory utilization should dynamically adjust based on the qubit size and other simulation parameters, potentially scaling up as the computational load increases.
Additional Observation
When running the script multiple times, I've noticed notable behavior in GPU memory consumption:
- First Run: If the script is executed, it consumes approximately 8GB of the available 16GB GPU memory.
- Second Run: Upon executing the script a second time, the memory consumption drops to 4GB, which is half of the available memory. This suggests that the GPU is not reclaiming the previously allocated memory or has an internal cap.