In [None]:
# Install pinned builds of TensorFlow Quantum and Cirq, then clear the installer
# log from the cell output.
from IPython.display import clear_output
!pip install -q tensorflow-quantum==0.6.1 --use-deprecated=legacy-resolver --quiet
!pip install -q cirq==0.14.1 
clear_output()

In [None]:
# Report the TensorFlow version and the GPUs TensorFlow can see.
import tensorflow as tf
print('tensorflow version', tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
print(gpus)

if gpus:
    try:
        # Enable on-demand GPU memory growth for every visible GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Print the error instead of aborting the notebook.
        # NOTE(review): presumably raised when memory growth is changed after the
        # GPUs were already initialised - confirm against the TensorFlow GPU guide.
        print(e)

tensorflow version 2.8.2
[]


## cuQuantum

In [None]:
!wget https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum/linux-x86_64/cuquantum-linux-x86_64-22.07.1.14-archive.tar.xz
!tar -xvf cuquantum-linux-x86_64-22.07.1.14-archive.tar.xz

--2022-08-12 03:28:41--  https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum/linux-x86_64/cuquantum-linux-x86_64-22.07.1.14-archive.tar.xz
Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 152.195.19.142
Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11092132 (11M) [application/octet-stream]
Saving to: ‘cuquantum-linux-x86_64-22.07.1.14-archive.tar.xz’


2022-08-12 03:28:41 (98.2 MB/s) - ‘cuquantum-linux-x86_64-22.07.1.14-archive.tar.xz’ saved [11092132/11092132]

cuquantum-linux-x86_64-22.07.1.14-archive/
cuquantum-linux-x86_64-22.07.1.14-archive/include/
cuquantum-linux-x86_64-22.07.1.14-archive/include/cutensornet/
cuquantum-linux-x86_64-22.07.1.14-archive/include/cutensornet/types.h
cuquantum-linux-x86_64-22.07.1.14-archive/include/cutensornet.h
cuquantum-linux-x86_64-22.07.1.14-archive/include/custatevec.h
cuquantum-lin

In [None]:
!mkdir /usr/local/cuquantum
!cp -a /content/cuquantum-linux-x86_64-22.07.1.14-archive/. /usr/local/cuquantum/

In [None]:
import os


def _prepend_to_path_var(var_name, new_entry):
    """Put `new_entry` at the front of the path-list environment variable `var_name`.

    Works when the variable is not set yet (the original f-string raised KeyError in
    that case) and is idempotent: if `new_entry` is already one of the entries the
    variable is left untouched, so re-running the cell does not grow it.
    """
    current = os.environ.get(var_name, '')
    if new_entry in current.split(os.pathsep):
        return
    # Avoid a trailing separator when the variable was empty or unset.
    os.environ[var_name] = f"{new_entry}{os.pathsep}{current}" if current else new_entry


# Point build tools at the cuQuantum installation and make its libraries discoverable
# by processes started from this notebook.
os.environ['CUQUANTUM_ROOT'] = '/usr/local/cuquantum'
os.environ['CUQUANTUM_DIR'] = '/usr/local/cuquantum'
_prepend_to_path_var('LD_LIBRARY_PATH', '/usr/local/cuquantum/lib')
_prepend_to_path_var('PATH', '/usr/local/cuquantum/lib')

In [None]:
# List the exported environment variables whose line contains 'PATH', to check that the
# cuQuantum entries set above are visible to shell commands.
!export | grep 'PATH'

declare -x LD_LIBRARY_PATH="/usr/local/cuquantum/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
declare -x LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
declare -x PATH="/usr/local/cuquantum/lib:/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin"
declare -x PYTHONPATH="/env/python"


### Verify cuQuantum

In [None]:
%%writefile test.cu
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cuComplex.h>        // cuDoubleComplex
#include <custatevec.h>       // custatevecApplyMatrix
#include <stdio.h>            // printf
#include <stdlib.h>           // EXIT_FAILURE

int main(void) {

   const int nIndexBits = 3;
   const int nSvSize    = (1 << nIndexBits);
   const int nTargets   = 1;
   const int nControls  = 2;
   const int adjoint    = 0;

   int targets[]  = {2};
   int controls[] = {0, 1};

   cuDoubleComplex h_sv[]        = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1},
                                    { 0.1, 0.2}, { 0.2, 0.2}, { 0.3, 0.3},
                                    { 0.3, 0.4}, { 0.4, 0.5}};
   cuDoubleComplex h_sv_result[] = {{ 0.0, 0.0}, { 0.0, 0.1}, { 0.1, 0.1},
                                    { 0.4, 0.5}, { 0.2, 0.2}, { 0.3, 0.3},
                                    { 0.3, 0.4}, { 0.1, 0.2}};
   cuDoubleComplex matrix[] = {{0.0, 0.0}, {1.0, 0.0},
                               {1.0, 0.0}, {0.0, 0.0}};


   cuDoubleComplex *d_sv;
   cudaMalloc((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex));

   cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex),
              cudaMemcpyHostToDevice);

   //--------------------------------------------------------------------------

   // custatevec handle initialization
   custatevecHandle_t handle;

   custatevecCreate(&handle);

   void* extraWorkspace = nullptr;
   size_t extraWorkspaceSizeInBytes = 0;

   // check the size of external workspace
   custatevecApplyMatrixGetWorkspaceSize(
       handle, CUDA_C_64F, nIndexBits, matrix, CUDA_C_64F,
       CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, nTargets, nControls,
       CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes);

   // allocate external workspace if necessary
   if (extraWorkspaceSizeInBytes > 0)
       cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes);

   // apply gate
   custatevecApplyMatrix(
       handle, d_sv, CUDA_C_64F, nIndexBits, matrix, CUDA_C_64F,
       CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets, nTargets, controls,
       nullptr, nControls, CUSTATEVEC_COMPUTE_64F,
       extraWorkspace, extraWorkspaceSizeInBytes);

   // destroy handle
   custatevecDestroy(handle);

   //--------------------------------------------------------------------------

   cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex),
              cudaMemcpyDeviceToHost);

   bool correct = true;
   for (int i = 0; i < nSvSize; i++) {
       if ((h_sv[i].x != h_sv_result[i].x) ||
           (h_sv[i].y != h_sv_result[i].y)) {
           correct = false;
           break;
       }
   }

   if (correct)
       printf("example PASSED\n");
   else
       printf("example FAILED: wrong result\n");

   cudaFree(d_sv);
   if (extraWorkspaceSizeInBytes)
       cudaFree(extraWorkspace);

   return EXIT_SUCCESS;
}

Writing test.cu


In [None]:
# Compile test.cu against the cuQuantum headers and libcustatevec found under
# CUQUANTUM_ROOT, then run the resulting binary.
!nvcc test.cu -I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -lcustatevec -o test
!./test

example FAILED: wrong result


## QSimCirq

In [None]:
# pybind11 is needed by the qsim build below (its make output calls `python3 -m pybind11 --includes`).
# NOTE(review): the version is not pinned here, unlike the TFQ and Cirq installs.
!pip install -q pybind11

[?25l[K     |█▌                              | 10 kB 31.3 MB/s eta 0:00:01[K     |███                             | 20 kB 39.0 MB/s eta 0:00:01[K     |████▋                           | 30 kB 43.3 MB/s eta 0:00:01[K     |██████▏                         | 40 kB 27.9 MB/s eta 0:00:01[K     |███████▊                        | 51 kB 21.3 MB/s eta 0:00:01[K     |█████████▏                      | 61 kB 24.3 MB/s eta 0:00:01[K     |██████████▊                     | 71 kB 24.7 MB/s eta 0:00:01[K     |████████████▎                   | 81 kB 25.6 MB/s eta 0:00:01[K     |█████████████▉                  | 92 kB 27.7 MB/s eta 0:00:01[K     |███████████████▍                | 102 kB 29.5 MB/s eta 0:00:01[K     |█████████████████               | 112 kB 29.5 MB/s eta 0:00:01[K     |██████████████████▍             | 122 kB 29.5 MB/s eta 0:00:01[K     |████████████████████            | 133 kB 29.5 MB/s eta 0:00:01[K     |█████████████████████▌          | 143 kB 29.5 MB/s eta 0:

In [None]:
# Build qsim and its Python bindings (qsimcirq) from source, install the package, and
# return to the parent directory.
# NOTE(review): the clone is not pinned to a tag or commit, so a later run may build a
# different qsim revision than the one benchmarked here.
!git clone https://github.com/quantumlib/qsim.git
%cd qsim
!make -j8 
!pip install .
%cd ..

Cloning into 'qsim'...
remote: Enumerating objects: 7150, done.[K
remote: Counting objects: 100% (295/295), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 7150 (delta 143), reused 241 (delta 119), pack-reused 6855[K
Receiving objects: 100% (7150/7150), 5.90 MiB | 26.26 MiB/s, done.
Resolving deltas: 100% (4746/4746), done.
/content/qsim
make -C apps/ qsim
make -C pybind_interface/ pybind
make[1]: Entering directory '/content/qsim/apps'
make[1]: Entering directory '/content/qsim/pybind_interface'
g++ basic/pybind_main_basic.cpp -o ../qsimcirq/qsim_basic`python3-config --extension-suffix` -O3 -fopenmp -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes`
nvcc cuda/pybind_main_cuda.cpp -o ../qsimcirq/qsim_cuda`python3-config --extension-suffix` -O3 -std=c++14 -x cu -Xcompiler "-Wall -shared -fPIC `python3 -m pybind11 --includes`"
nvcc custatevec/pybind_main_custatevec.cpp -o ../qsimcirq/qsim_custatevec`python3-config --extension-suffix` -O3 -I/usr

/content


In [None]:
# Show which compiled extension module qsimcirq exposes as its GPU backend.
import qsimcirq
print(qsimcirq.qsim_gpu)

<module 'qsimcirq.qsim_cuda' from '/usr/local/lib/python3.7/dist-packages/qsimcirq/qsim_cuda.cpython-37m-x86_64-linux-gnu.so'>


### Verify qsim with cuQuantum

In [None]:
# Import Cirq and qsim
import cirq
import qsimcirq

# Instantiate qubits and create a circuit
q0, q1 = cirq.LineQubit.range(2)
circuit = cirq.Circuit(cirq.H(q0), cirq.CX(q0, q1))

# Instantiate a simulator that uses the GPU
gpu_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=1)
qsim_simulator = qsimcirq.QSimSimulator(qsim_options=gpu_options)

# Run the simulation
print("Running simulation for the following circuit:")
print(circuit)

qsim_results = qsim_simulator.compute_amplitudes(
    circuit, bitstrings=[0b00, 0b01])

print("qsim results:")
print(qsim_results)

Running simulation for the following circuit:
0: ───H───@───
          │
1: ───────X───
qsim results:
[(0.7071067690849304+0j), 0j]


## Imports

In [None]:
from sklearn.decomposition import PCA
import tensorflow as tf
import tensorflow_quantum as tfq

import cirq
import sympy
import numpy as np
import seaborn as sns
import collections
import time

# visualization tools
%matplotlib inline
import matplotlib.pyplot as plt
from cirq.contrib.svg import SVGCircuit

In [None]:
# Experiment configuration.
n_qubits = 20  # circuit width; also the number of PCA components kept per image
n_layers = 1  # number of re-uploading layers passed to generate_circuit
n_train_samples_per_class = 1000  # training images kept for each of the two digits
n_test_samples_per_class = 200  # test images kept for each of the two digits

In [None]:
# Load the MNIST train / test splits through Keras.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Append a trailing channel axis and map pixel values from [0, 255] to [0.0, 1.0].
x_train = x_train[..., np.newaxis] / 255.0
x_test = x_test[..., np.newaxis] / 255.0

print("Number of original training examples:", len(x_train))
print("Number of original test examples:", len(x_test))

Number of original training examples: 60000
Number of original test examples: 10000


In [None]:
def filter_36(x, y):
    """Keep only the samples labelled 3 or 6 and binarise their labels.

    Returns the selected rows of `x` together with a boolean label array that is
    True where the original label was 3 and False where it was 6.
    """
    is_three = y == 3
    selected = is_three | (y == 6)
    return x[selected], is_three[selected]

In [None]:
# Keep only the digits 3 and 6. filter_36 returns boolean labels:
# True (1) marks a 3 and False (0) marks a 6.
x_train, y_train = filter_36(x_train, y_train)
x_test, y_test = filter_36(x_test, y_test)

# Balanced subsample: the first n_*_samples_per_class images of each class,
# with all label-0 samples placed before the label-1 samples.
x_train = np.concatenate((x_train[y_train==0][:n_train_samples_per_class], x_train[y_train==1][:n_train_samples_per_class]))
y_train = np.concatenate((y_train[y_train==0][:n_train_samples_per_class], y_train[y_train==1][:n_train_samples_per_class]))

x_test = np.concatenate((x_test[y_test==0][:n_test_samples_per_class], x_test[y_test==1][:n_test_samples_per_class]))
y_test = np.concatenate((y_test[y_test==0][:n_test_samples_per_class], y_test[y_test==1][:n_test_samples_per_class]))

# Label 1 is the digit 3 and label 0 is the digit 6 (see filter_36); the counts were
# previously swapped.
num_3 = x_train[y_train == 1].shape[0]
num_6 = x_train[y_train == 0].shape[0]

print("Number of filtered training examples:", len(x_train))
print("Number of unique 3s: ", num_3)
print("Number of unique 6s: ", num_6)
print("Number of filtered test examples:", len(x_test))

Number of filtered training examples: 2000
Number of unique 3s:  1000
Number of unique 6s:  1000
Number of filtered test examples: 400


In [None]:
def pca(n, x_train, x_test):
    """Project flattened samples onto the first `n` principal components.

    The PCA is fitted on `x_train` only and then applied to `x_test`.

    Args:
        n: number of principal components to keep.
        x_train: training samples; every axis after the first is flattened.
        x_test: test samples with the same per-sample shape as `x_train`.

    Returns:
        Tuple `(x_train_reduced, x_test_reduced)`, each with `n` columns.

    Prints the percentage of variance retained by the `n` components, once for the
    training data (from the fitted model) and once measured on the test data. The
    second figure used to repeat the training value, which said nothing about how
    well the projection fits the test set.
    """
    n_features = np.prod(x_train.shape[1:])
    pca_obj = PCA(n_components=n)
    x_train = x_train.reshape(-1, n_features)
    x_test = x_test.reshape(-1, n_features)

    x_train = pca_obj.fit_transform(x_train)
    train_retained = np.cumsum(pca_obj.explained_variance_ratio_ * 100)[-1]
    print("Cumulative sum on train :", train_retained)

    x_test_reduced = pca_obj.transform(x_test)
    # Share of the test set's total variance that survives the projection onto the
    # components learned from the training set.
    total_test_variance = np.var(x_test, axis=0).sum()
    if total_test_variance > 0:
        test_retained = np.var(x_test_reduced, axis=0).sum() / total_test_variance * 100
    else:
        # Constant test data has no variance to explain.
        test_retained = float("nan")
    print("Cumulative sum on test:", test_retained)

    return x_train, x_test_reduced

In [None]:
# Reduce every image to n_qubits PCA features.
x_train_small, x_test_small = pca(n_qubits,x_train,x_test)

Cumulative sum on train : 70.06794359696323
Cumulative sum on test: 70.06794359696323


In [None]:
# One-hot encode the binary labels for the 2-unit softmax output.
# NOTE: y_train / y_test are overwritten, so re-running this cell alone would apply
# the encoding a second time to already-encoded labels.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

In [None]:
def one_qubit_rotation(qubit, symbols):
    """
    Build the single-qubit rotation sequence Rx, Ry, Rz acting on `qubit`.

    The three angles are read from `symbols[0]`, `symbols[1]` and `symbols[2]`
    respectively, and the gates are returned as a list in that order.
    """
    axis_gates = (cirq.rx, cirq.ry, cirq.rz)
    return [make_gate(symbols[k])(qubit) for k, make_gate in enumerate(axis_gates)]

def entangling_layer(qubits):
    """
    Returns a layer of CZ entangling gates on `qubits` (arranged in a circular topology).

    Neighbouring qubits are coupled pairwise; with more than two qubits an extra CZ
    between the first and the last qubit closes the ring. Fewer than two qubits
    (including an empty sequence, which used to raise IndexError) yield an empty list.
    """
    n_wires = len(qubits)
    if n_wires < 2:
        return []
    cz_ops = [cirq.CZ(q0, q1) for q0, q1 in zip(qubits, qubits[1:])]
    # With exactly two qubits the closing gate would duplicate the only pair.
    if n_wires > 2:
        cz_ops.append(cirq.CZ(qubits[0], qubits[-1]))
    return cz_ops

In [None]:
def generate_circuit(n_qubits, n_layers):
    """Build the data re-uploading circuit and the observables measured on it.

    The circuit acts on a 1 x `n_qubits` row of grid qubits. Each of the `n_layers`
    layers applies a trainable rotation block, a CZ entangling layer and an Rx
    encoding of the inputs; one more trainable rotation block closes the circuit.

    Returns:
        Tuple `(circuit, observables, theta_symbols, input_symbols)` where
        `observables` holds X, Y and Z on every qubit, and the two symbol lists are
        the flattened trainable and encoding symbols.
    """
    wires = cirq.GridQubit.rect(1, n_qubits)

    # Trainable angles, indexed as [rotation block, qubit, axis].
    theta_grid = np.asarray(
        sympy.symbols(f'θ(0:{3*(n_layers+1)*n_qubits})')
    ).reshape((n_layers+1, n_qubits, 3))

    # Encoding angles, indexed as [layer, qubit].
    input_grid = np.asarray(
        sympy.symbols(f'x(0:{n_layers})'+f'_(0:{n_qubits})')
    ).reshape((n_layers, n_qubits))

    def rotation_block(block_idx):
        """Trainable Rx-Ry-Rz rotations on every qubit for one block."""
        return cirq.Circuit(
            one_qubit_rotation(wire, theta_grid[block_idx, k])
            for k, wire in enumerate(wires)
        )

    circuit = cirq.Circuit()
    for layer_idx in range(n_layers):
        # Variational block followed by entanglement.
        circuit += rotation_block(layer_idx)
        circuit += entangling_layer(wires)
        # Re-upload the inputs as Rx rotations.
        circuit += cirq.Circuit(
            cirq.rx(input_grid[layer_idx, k])(wire) for k, wire in enumerate(wires)
        )

    # Final variational block.
    circuit += rotation_block(n_layers)

    # Qubit-major ordering: X, Y, Z on qubit 0, then on qubit 1, and so on.
    observables = [pauli(wire) for wire in wires for pauli in (cirq.X, cirq.Y, cirq.Z)]

    return circuit, observables, list(theta_grid.flat), list(input_grid.flat)

In [None]:
# Build the circuit once for inspection; the observables and symbol lists are discarded.
circuit, _,_, _ = generate_circuit(n_qubits,n_layers)
circuit

In [None]:
class ReUploadingPQC(tf.keras.layers.Layer):
    """
    Performs the transformation (s_1, ..., s_d) -> (theta_1, ..., theta_N, lmbd[1][1]s_1, ..., lmbd[1][M]s_1,
        ......., lmbd[d][1]s_d, ..., lmbd[d][M]s_d) for d=input_dim, N=theta_dim and M=n_layers.
    An activation function from tf.keras.activations, specified by `activation` ('linear' by default) is
        then applied to all lmbd[i][j]s_i.
    All angles are finally permuted to follow the alphabetical order of their symbol names, as processed
        by the ControlledPQC.

    Args:
        n_qubits: number of qubits; must equal the number of features per input sample.
        n_layers: number of re-uploading layers in the circuit.
        activation: activation applied to the scaled inputs before they are used as angles.
        name: Keras layer name.
    """

    def __init__(self, n_qubits, n_layers, activation="linear", name="re-uploading_PQC"):
        super(ReUploadingPQC, self).__init__(name=name)
        self.n_layers = n_layers
        self.n_qubits = n_qubits

        circuit, observables, theta_symbols, input_symbols = generate_circuit(n_qubits, n_layers)

        # Trainable rotation angles, drawn uniformly from [0, pi).
        theta_init = tf.random_uniform_initializer(minval=0.0, maxval=np.pi)
        self.theta = tf.Variable(
            initial_value=theta_init(shape=(1, len(theta_symbols)), dtype="float32"),
            trainable=True, name="thetas"
        )

        # One trainable input scaling factor per (layer, qubit), initialised to 1.
        lmbd_init = tf.ones(shape=(self.n_qubits * self.n_layers,))
        self.lmbd = tf.Variable(
            initial_value=lmbd_init, dtype="float32", trainable=True, name="lambdas"
        )

        # Define explicit symbol order.
        symbols = [str(symb) for symb in theta_symbols + input_symbols]
        self.indices = tf.constant([symbols.index(a) for a in sorted(symbols)])

        self.activation = activation
        # Build the activation layer once here instead of creating a new Keras layer
        # on every invocation of call().
        self.activation_layer = tf.keras.layers.Activation(activation)
        self.empty_circuit = tfq.convert_to_tensor([cirq.Circuit()])

        # Select backend: TFQ's native noiseless simulator unless TensorFlow sees a GPU.
        backend = 'noiseless'
        gpus = tf.config.list_physical_devices('GPU')

        if gpus:
            # Instantiate a simulator that uses the GPU
            gpu_options = qsimcirq.QSimOptions(use_gpu=True, gpu_mode=1, verbosity=1, cpu_threads=8)
            backend = qsimcirq.QSimSimulator(qsim_options=gpu_options)
            print("Using qsimcirq")

        # NOTE(review): the TFQ documentation describes the Adjoint differentiator as
        # compatible only with the native C++ ops; confirm that pairing it with a
        # qsimcirq backend is intended (gradients may not run on the GPU backend).
        self.computation_layer = tfq.layers.ControlledPQC(
            circuit,
            observables,
            differentiator=tfq.differentiators.Adjoint(),
            backend=backend,
        )

    def call(self, inputs):
        """Evaluate the circuit expectation values for a batch of inputs.

        `inputs` is indexed as (batch, feature); the einsum below requires
        n_qubits features per sample.
        """
        batch_dim = tf.shape(inputs)[0]
        # Replicate the (empty) circuit placeholder and the shared thetas per sample.
        tiled_up_circuits = tf.repeat(self.empty_circuit, repeats=batch_dim)
        tiled_up_thetas = tf.tile(self.theta, multiples=[batch_dim, 1])
        # Repeat the features once per layer, then scale each copy by its lambda.
        tiled_up_inputs = tf.tile(inputs, multiples=[1, self.n_layers])
        scaled_inputs = tf.einsum("i,ji->ji", self.lmbd, tiled_up_inputs)
        squashed_inputs = self.activation_layer(scaled_inputs)

        # Concatenate all angles and reorder them to match the sorted symbol names.
        joined_vars = tf.concat([tiled_up_thetas, squashed_inputs], axis=1)
        joined_vars = tf.gather(joined_vars, self.indices, axis=1)

        return self.computation_layer([tiled_up_circuits, joined_vars])

In [None]:
# Build the Keras model.
# Hybrid classifier: the re-uploading PQC turns the n_qubits PCA features into
# expectation values, which a small dense head maps to two class probabilities.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_qubits,), dtype=tf.dtypes.float32, name='input'),
    ReUploadingPQC(n_qubits, n_layers),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 re-uploading_PQC (ReUploadi  (None, 60)               140       
 ngPQC)                                                          
                                                                 
 flatten_1 (Flatten)         (None, 60)                0         
                                                                 
 dense_2 (Dense)             (None, 8)                 488       
                                                                 
 dense_3 (Dense)             (None, 2)                 18        
                                                                 
Total params: 646
Trainable params: 646
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Adam optimiser with categorical cross-entropy on the one-hot labels.
# NOTE(review): learning_rate=0.2 is far above the Keras Adam default (1e-3); confirm
# this is intentional, as it may explain the unstable accuracies in the benchmarks.
opt = tf.keras.optimizers.Adam(learning_rate=0.2)
model.compile(opt, loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
# Training hyper-parameters used by model.fit below.
batch_size = 256
epochs = 5

In [None]:
# Train on the PCA-reduced training set; the reduced test set doubles as validation data.
qnn_history = model.fit(
      x_train_small, y_train,
      batch_size=batch_size,
      epochs=epochs,
      validation_data=(x_test_small, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Final loss and accuracy on the PCA-reduced test set.
qnn_results = model.evaluate(x_test_small, y_test)



# Benchmarks

- MNIST Dataset binary (3 and 6)
- Number of filtered training examples: 2000
- Number of unique 3s:  1000
- Number of unique 6s:  1000
- Number of filtered test examples: 400

## Parameter growth

Number of trainable parameters (quantum only)

### Quantum
| qubits/layers | 1 | 2 |
| ----------- | ----------- | ----------- |
| 10      |   70     |  110     |
| 16   |   112      |  176       |
| 20      |  140      |       |
| 23   |         |        |
| 25   |        |         |
| 27   |         |        |


## Time

The results below show the training time per epoch, averaged over 5 epochs, together with the validation accuracy (`acc`, in %) for each number of layers.

### Default C++ backend (CPU)
| qubits/layers | 1 | acc | 2 | acc |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| 10      |  11      |   97.5    |  20.6     |  98.25     |
| 16   |   266.6      |   96.75      | 251.4      |  97     |
| 20      |  3353      |     97.5  |       |       |
| 23   |         |        |       |       |
| 25   |        |         |       |       |
| 27   |         |        |       |       |

### QSimCirq + CuQuantum (GPU)

| qubits/layers | 1 | acc | 2 | acc |
| ----------- | ----------- | ----------- | ----------- | ----------- |
| 10      | 246       |   82.25    |  385     | 92      |
| 16   |     465    | 98        |   676    |  98.25     |
| 20      |   2767.5     |  64.5     |       |       |
| 23   |         |        |       |       |
| 25   |        |         |       |       |
| 27   |         |        |       |       |