# Multi-GPU simulations

Below we explore how CUDA-Q can seamlessly use multiple GPUs today and, in the future, multiple QPUs.

1. Scale the qubit count to make use of a second and third GPU
2. Distribute data collection for `x_train` across multiple GPUs asynchronously
3. Distribute data collection for the terms of a given Hamiltonian
4. Execute different kernels on different GPUs
5. Scale achieved with multi-GPU tensor network simulations



In [None]:
import cudaq
from cudaq import spin
import numpy as np

# Select the simulation backend used by the cells below. The commented-out
# lines are alternative targets that can be swapped in instead.
# NOTE(review): presumably 'nvidia' is the single-GPU simulator, 'nvidia-mgpu'
# spreads one state vector over several GPUs, 'nvidia-mqpu' exposes one virtual
# QPU per GPU, and 'qpp-cpu' is a CPU simulator -- confirm against the CUDA-Q
# backend documentation for the installed version.
cudaq.set_target('nvidia')
# cudaq.set_target('nvidia-mgpu')
# cudaq.set_target('qpp-cpu')
# cudaq.set_target('nvidia-mqpu')

# Scaling qubit count to go beyond single GPU memory requirements

In [None]:
# Problem size: a 20-qubit circuit evaluated on 1000 parameter sets.
n_qubits = 20
n_samples = 1000

# Observable: spin.z on qubit 0.
h = spin.z(0)

# Three rotation layers (rx, ry, rz), each needing one angle per qubit.
n_parameters = 3 * n_qubits
parameters = np.random.default_rng(13).uniform(
    low=0,
    high=1,
    size=(n_samples, n_parameters),
)
np.random.seed(1)

# Parameterized kernel taking a single list argument that holds every angle.
kernel, params = cudaq.make_kernel(list)
qubits = kernel.qalloc(n_qubits)
qubits_list = list(range(n_qubits))

# Apply the layers in order: all rx gates, then all ry, then all rz.
# Layer k reads its angles from params[k * n_qubits : (k + 1) * n_qubits].
for layer, rotate in enumerate((kernel.rx, kernel.ry, kernel.rz)):
    for q in qubits_list:
        rotate(params[q + layer * n_qubits], qubits[q])

# Entangle neighbouring pairs (0, 1), (2, 3), ... with controlled-Z gates.
for first, second in zip(qubits_list[::2], qubits_list[1::2]):
    kernel.cz(qubits[first], qubits[second])

# Batched alternative kept for reference:
# exp_vals = cudaq.observe_n(kernel, h, parameters)

# Evaluate the observable once per parameter row, one call after another.
exp_vals = [cudaq.observe(kernel, h, row) for row in parameters]


Graphs of multi-GPU simulation scaling with qubit count, for state-vector and tensor-network simulations.





# Asynchronous data collection via batching x_train

In [None]:
# Cut the parameter rows into four batches; np.split with an integer count
# requires the first axis to divide evenly.
xi = np.split(parameters, 4)

# Show, one per line: the full array shape, the number of batches, and the
# shape of a single batch.
print(parameters.shape, len(xi), xi[0].shape, sep="\n")


In [None]:
# Number of virtual QPUs offered by the active target. Batch indices are
# wrapped onto this range so the cell also runs when fewer QPUs than batches
# are available (e.g. a single-QPU target) instead of requesting an id that
# does not exist.
qpu_count = cudaq.get_target().num_qpus()

# Launch one asynchronous observation per parameter row; every row of batch
# `i` goes to the same QPU. The calls return handles, so all work is queued
# before any result is awaited.
asyncresults = []
for i, batch in enumerate(xi):
    for row in batch:
        asyncresults.append(
            cudaq.observe_async(kernel, h, row, qpu_id=i % qpu_count))

# Wait on each handle in submission order and collect the expectation values.
expvals = [res.get().expectation_z() for res in asyncresults]


# Asynchronous data collection via batching Hamiltonian terms

In [None]:
# Legacy target selection kept for reference:
# cudaq.set_qpu('cuquantum_mgpu')

# Problem size: a 10-qubit circuit with 1000 parameter sets.
n_qubits = 10
n_samples = 1000

# Multi-term spin operator acting on qubits 0 and 1.
# NOTE(review): nothing in this cell consumes `hamiltonian` yet -- the
# reference call at the bottom still uses `h`.
hamiltonian = (5.907
               - 2.1433 * spin.x(0) * spin.x(1)
               - 2.1433 * spin.y(0) * spin.y(1)
               + .21829 * spin.z(0)
               - 6.125 * spin.z(1))

# Three rotation layers (rx, ry, rz), each needing one angle per qubit.
n_parameters = 3 * n_qubits
parameters = np.random.default_rng(13).uniform(
    low=0,
    high=1,
    size=(n_samples, n_parameters),
)
np.random.seed(1)

# Parameterized kernel taking a single list argument that holds every angle.
kernel, params = cudaq.make_kernel(list)
qubits = kernel.qalloc(n_qubits)
qubits_list = list(range(n_qubits))

# Apply the layers in order: all rx gates, then all ry, then all rz.
# Layer k reads its angles from params[k * n_qubits : (k + 1) * n_qubits].
for layer, rotate in enumerate((kernel.rx, kernel.ry, kernel.rz)):
    for q in qubits_list:
        rotate(params[q + layer * n_qubits], qubits[q])

# Entangle neighbouring pairs (0, 1), (2, 3), ... with controlled-Z gates.
for first, second in zip(qubits_list[::2], qubits_list[1::2]):
    kernel.cz(qubits[first], qubits[second])

# Batched evaluation kept for reference:
# exp_vals = cudaq.observe_n(kernel, h, parameters)


# Different kernels being executed at the same time 

In [None]:
# The two kernels built here are later dispatched with qpu_id=0 and qpu_id=1,
# so the target must expose each GPU as its own virtual QPU. 'nvidia-mgpu'
# presents a single QPU whose state vector is spread over the GPUs and would
# reject a second QPU id; 'nvidia-mqpu' is the multi-QPU target.
cudaq.set_target('nvidia-mqpu')

# Problem size: 10 qubits, 500 parameter sets, one angle per qubit.
n_qubits = 10
n_samples = 500

# Observable: spin.z on qubit 0.
h = spin.z(0)

n_parameters = n_qubits
parameters = np.random.default_rng(13).uniform(
    low=0,
    high=1,
    size=(n_samples, n_parameters),
)
np.random.seed(1)

# --- First kernel: one parameterized rx rotation on every qubit -------------
kernel1, params = cudaq.make_kernel(list)
qubits = kernel1.qalloc(n_qubits)
for i in range(n_qubits):
    kernel1.rx(params[i], qubits[i])

# --- Second kernel -----------------------------------------------------------
# NOTE(review): currently built exactly like kernel1 (an rx layer); swap in a
# different circuit here if the two kernels are meant to differ.
# `params` and `qubits` are rebound to this kernel's handles.
kernel2, params = cudaq.make_kernel(list)
qubits = kernel2.qalloc(n_qubits)
for i in range(n_qubits):
    kernel2.rx(params[i], qubits[i])



In [None]:
# NOTE(review): `xi` and `kernel` are not rebuilt in this section; in file
# order they come from earlier cells (`xi` from the first parameter array,
# `kernel` from the Hamiltonian cell) -- confirm this cell is still intended.

# Number of virtual QPUs offered by the active target. Batch indices are
# wrapped onto this range so the cell also runs when fewer QPUs than batches
# are available instead of requesting an id that does not exist.
qpu_count = cudaq.get_target().num_qpus()

# Launch one asynchronous observation per parameter row; every row of batch
# `i` goes to the same QPU.
asyncresults = []
for i, batch in enumerate(xi):
    for row in batch:
        asyncresults.append(
            cudaq.observe_async(kernel, h, row, qpu_id=i % qpu_count))

# Wait on each handle in submission order and collect the expectation values.
expvals = [res.get().expectation_z() for res in asyncresults]


In [None]:

# Queue kernel1 on QPU 0 and kernel2 on QPU 1, one asynchronous observation
# per parameter row. Both lists hold the handles returned by observe_async,
# not numbers; nothing in this cell waits on them.
exp_vals1 = [
    cudaq.observe_async(kernel1, h, row, qpu_id=0) for row in parameters
]

exp_vals2 = [
    cudaq.observe_async(kernel2, h, row, qpu_id=1) for row in parameters
]





# Unparalleled scale with tensor networks 

In [None]:
# List the names exposed by the cudaq module.
dir(cudaq)

In [None]:
# Switch to the 'tensornet' target for the tensor-network section.
cudaq.set_target('tensornet')

# Future syntax: simultaneous QPU-GPU workflows 

In [None]:
# Simultaneous GPU and QPU execution.
# NOTE(review): illustrative only -- `x_train` and
# `kernel_clifford_approximation` are not defined in the cells visible above,
# and the string-valued `qpu_id` arguments appear to be proposed syntax rather
# than an existing API; confirm before running.

cudaq.sample_n_async(kernel, h, x_train, params, qpu_id = 'Rigetti-Aspen-X')


cudaq.sample_n_async(kernel_clifford_approximation, h, x_train, params, qpu_id = 'Clifford-Simulator')
