# Transformers

In [1]:
import os
import sys

# Move from the notebook's folder to the repository root so project-local
# packages become importable.
# NOTE(review): assumes the repository root is the directory that contains the
# `transformer` package imported further down — TODO confirm.
# The guard keeps the cell idempotent: re-running it no longer climbs one more
# directory level each time.
if not os.path.isdir("transformer"):
    os.chdir("..")

# Register the working directory on the import path only once, so re-running
# the cell does not pile up duplicate entries.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

# Last expression: display the resolved working directory.
os.getcwd()

'/group/pmc026/nchoong/QuantumTransformer'

In [2]:
from transformer.pytorch.main_no_embed import main
from transformer.pytorch.utils.plots import plot_metrics

2024-09-24 14:59:39.161163: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-24 14:59:39.173621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-24 14:59:39.189199: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-24 14:59:39.193948: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-24 14:59:39.205474: I tensorflow/core/platform/cpu_feature_guar

In [3]:
from config import dev

# Echo the `dev` object exported by the project's config module so its value
# is recorded in the notebook output.
dev

device(type='cuda')

In [4]:
import random
import numpy as np
import torch
import tensorflow as tf

In [5]:
seed = 42

# os.cpu_count() may return None when the count cannot be determined; fall
# back to one thread so the string "None" never reaches the environment.
n_threads = os.cpu_count() or 1

# --- Python / NumPy ---------------------------------------------------------
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment only
# affects subprocesses launched from here on, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)

# --- TensorFlow -------------------------------------------------------------
tf.random.set_seed(seed)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"

os.environ["OMP_NUM_THREADS"] = str(n_threads)
os.environ["TF_NUM_INTEROP_THREADS"] = str(n_threads)
os.environ["TF_NUM_INTRAOP_THREADS"] = str(n_threads)

# Configure each thread pool exactly once. Setting them to 1 and then to the
# CPU count back-to-back leaves only the last value in effect, so only that
# effective setting is kept.
tf.config.threading.set_intra_op_parallelism_threads(n_threads)
tf.config.threading.set_inter_op_parallelism_threads(n_threads)

# --- PyTorch ----------------------------------------------------------------
torch.manual_seed(seed)
# Seed every visible CUDA device, not just the current one (a no-op when CUDA
# is unavailable).
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Classical

In [6]:
# Classical baseline: no quantum-related arguments are passed to main().
train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    tqdm_disabled=False,
    batch=True,
)

# Keep this run's results under a dedicated name; the unpacked variables above
# are overwritten by every later run.
classical_metrics = {
    "train_loss": train_loss,
    "train_acc": train_acc,
    "val_loss": val_loss,
    "val_acc": val_acc,
    "train_auc": train_auc,
    "val_auc": val_auc,
    "best_dict": best_dict,
}

# Backward-compatible alias for the original (misspelled) variable name, in
# case other cells still refer to it.
classiscal_metrics = classical_metrics

train_data:  4000
pos:  1979
neg:  2021
pos:  505
neg:  495
pos:  1233
neg:  1267
++ There will be 2 transformer blocks
The model has 7,297 trainable parameters


Epoch   1/4: 100%|██████████| 63/63 [00:02<00:00, 30.08batch/s, Epoch = 0m 2s, Loss = 0.6401|0.5691, Acc = 0.650|0.714, AUC = 70.472|83.856]
Epoch   2/4: 100%|██████████| 63/63 [00:01<00:00, 36.05batch/s, Epoch = 0m 1s, Loss = 0.5162|0.4888, Acc = 0.756|0.759, AUC = 83.285|86.658]
Epoch   3/4: 100%|██████████| 63/63 [00:02<00:00, 22.86batch/s, Epoch = 0m 2s, Loss = 0.4487|0.4422, Acc = 0.802|0.794, AUC = 87.735|87.934]
Epoch   4/4: 100%|██████████| 63/63 [00:01<00:00, 42.90batch/s, Epoch = 0m 1s, Loss = 0.4209|0.4402, Acc = 0.819|0.796, AUC = 89.311|88.102]

TOTAL TIME = 8.68s
BEST ACC = 0.80% AT EPOCH 4
BEST AUC = 88.10 AT EPOCH 4





## Quantum with PennyLane GPU

In [11]:
# Settings for the PennyLane run on the "default.qubit.torch" device with
# batch=False.
pl_gpu_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    q_device="default.qubit.torch",
    batch=False,
    circuit_type="pennylane",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **pl_gpu_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_pl_gpu_metrics = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)



train_data:  4000
pos:  1962
neg:  2038
pos:  506
neg:  494
pos:  1270
neg:  1230
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using PennyLane quantum device default.qubit.torch
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4: 100%|██████████| 63/63 [42:10<00:00, 40.17s/batch, Epoch = 42m 10s, Loss = 0.6612|0.5767, Acc = 0.610|0.770, AUC = 67.374|85.950]
Epoch   2/4: 100%|██████████| 63/63 [40:18<00:00, 38.39s/batch, Epoch = 40m 18s, Loss = 0.5603|0.5304, Acc = 0.762|0.744, AUC = 84.141|87.303]
Epoch   3/4: 100%|██████████| 63/63 [41:31<00:00, 39.55s/batch, Epoch = 41m 31s, Loss = 0.5082|0.4757, Acc = 0.785|0.794, AUC = 86.155|88.207]
Epoch   4/4: 100%|██████████| 63/63 [41:48<00:00, 39.81s/batch, Epoch = 41m 48s, Loss = 0.4763|0.4578, Acc = 0.802|0.796, AUC = 88.021|88.366]

TOTAL TIME = 9948.50s
BEST ACC = 0.80% AT EPOCH 4
BEST AUC = 88.37 AT EPOCH 4





## Quantum with PennyLane CPU

In [12]:
# Settings for the PennyLane run on the "default.qubit" device with
# batch=False.
pl_cpu_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    q_device="default.qubit",
    batch=False,
    circuit_type="pennylane",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **pl_cpu_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_pl_cpu_metrics = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)

train_data:  4000
pos:  2014
neg:  1986
pos:  474
neg:  526
pos:  1206
neg:  1294
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using PennyLane quantum device default.qubit
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4:   0%|          | 0/63 [00:00<?, ?batch/s]

Epoch   1/4: 100%|██████████| 63/63 [57:33<00:00, 54.82s/batch, Epoch = 57m 33s, Loss = 0.6675|0.5823, Acc = 0.618|0.757, AUC = 65.804|85.028]
Epoch   2/4: 100%|██████████| 63/63 [55:31<00:00, 52.89s/batch, Epoch = 55m 31s, Loss = 0.5747|0.5467, Acc = 0.746|0.732, AUC = 82.750|86.684]
Epoch   3/4: 100%|██████████| 63/63 [55:32<00:00, 52.90s/batch, Epoch = 55m 32s, Loss = 0.5154|0.5850, Acc = 0.782|0.684, AUC = 86.517|87.166]
Epoch   4/4: 100%|██████████| 63/63 [54:54<00:00, 52.30s/batch, Epoch = 54m 54s, Loss = 0.4926|0.4769, Acc = 0.787|0.789, AUC = 87.304|87.588]

TOTAL TIME = 13413.26s
BEST ACC = 0.79% AT EPOCH 4
BEST AUC = 87.59 AT EPOCH 4





## Quantum with PennyLane GPU and Batch

In [7]:
# Settings for the PennyLane run on the "default.qubit.torch" device with
# batch=True.
pl_gpu_batch_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    q_device="default.qubit.torch",
    batch=True,
    circuit_type="pennylane",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **pl_gpu_batch_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_pl_gpu_metrics_batch = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)



train_data:  4000
pos:  2044
neg:  1956
pos:  497
neg:  503
pos:  1272
neg:  1228
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using PennyLane quantum device default.qubit.torch
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4: 100%|██████████| 63/63 [01:27<00:00,  1.38s/batch, Epoch = 1m 27s, Loss = 0.6784|0.6062, Acc = 0.558|0.718, AUC = 59.375|85.048]
Epoch   2/4: 100%|██████████| 63/63 [01:24<00:00,  1.34s/batch, Epoch = 1m 24s, Loss = 0.5832|0.5576, Acc = 0.747|0.706, AUC = 81.866|86.360]
Epoch   3/4: 100%|██████████| 63/63 [01:24<00:00,  1.35s/batch, Epoch = 1m 24s, Loss = 0.5288|0.5110, Acc = 0.769|0.745, AUC = 85.159|87.002]
Epoch   4/4: 100%|██████████| 63/63 [01:24<00:00,  1.34s/batch, Epoch = 1m 24s, Loss = 0.4915|0.4838, Acc = 0.796|0.774, AUC = 87.707|87.214]

TOTAL TIME = 340.71s
BEST ACC = 0.77% AT EPOCH 4
BEST AUC = 87.21 AT EPOCH 4





## Quantum with PennyLane CPU and Batch

In [8]:
# Settings for the PennyLane run on the "default.qubit" device with
# batch=True.
pl_cpu_batch_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    q_device="default.qubit",
    batch=True,
    circuit_type="pennylane",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **pl_cpu_batch_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_pl_cpu_metrics_batch = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)

train_data:  4000
pos:  1985
neg:  2015
pos:  505
neg:  495
pos:  1260
neg:  1240
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using PennyLane quantum device default.qubit
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4: 100%|██████████| 63/63 [01:59<00:00,  1.90s/batch, Epoch = 1m 59s, Loss = 0.6495|0.5641, Acc = 0.636|0.736, AUC = 68.409|82.882]
Epoch   2/4: 100%|██████████| 63/63 [02:01<00:00,  1.92s/batch, Epoch = 2m 1s, Loss = 0.5290|0.5218, Acc = 0.776|0.739, AUC = 84.919|84.557]
Epoch   3/4: 100%|██████████| 63/63 [01:59<00:00,  1.90s/batch, Epoch = 1m 59s, Loss = 0.4824|0.4833, Acc = 0.797|0.771, AUC = 87.092|85.415]
Epoch   4/4: 100%|██████████| 63/63 [02:03<00:00,  1.96s/batch, Epoch = 2m 3s, Loss = 0.4525|0.4883, Acc = 0.813|0.764, AUC = 88.654|85.487]

TOTAL TIME = 483.71s
BEST ACC = 0.77% AT EPOCH 3
BEST AUC = 85.49 AT EPOCH 4





## Quantum with TensorCircuit (TensorFlow)

In [10]:
# Settings for the TensorCircuit run with batch=False (no q_device is passed
# for this circuit type).
tc_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    batch=False,
    circuit_type="tensorcircuit",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **tc_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_tc_cpu_metrics = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)

train_data:  4000
pos:  2008
neg:  1992
pos:  525
neg:  475
pos:  1292
neg:  1208
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using TensorCircuit
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4: 100%|██████████| 63/63 [32:09<00:00, 30.62s/batch, Epoch = 32m 9s, Loss = 0.6526|0.5506, Acc = 0.622|0.789, AUC = 67.765|87.201]
Epoch   2/4: 100%|██████████| 63/63 [31:18<00:00, 29.82s/batch, Epoch = 31m 18s, Loss = 0.5287|0.5079, Acc = 0.778|0.754, AUC = 85.330|88.927]
Epoch   3/4: 100%|██████████| 63/63 [30:05<00:00, 28.65s/batch, Epoch = 30m 5s, Loss = 0.4834|0.4423, Acc = 0.790|0.821, AUC = 87.018|89.550]
Epoch   4/4: 100%|██████████| 63/63 [27:27<00:00, 26.14s/batch, Epoch = 27m 27s, Loss = 0.4482|0.4400, Acc = 0.816|0.812, AUC = 89.517|89.571]

TOTAL TIME = 7259.99s
BEST ACC = 0.82% AT EPOCH 3
BEST AUC = 89.57 AT EPOCH 4





## Quantum with TensorCircuit (TensorFlow) and Batch

In [9]:
# Settings for the TensorCircuit run with batch=True (no q_device is passed
# for this circuit type).
tc_batch_config = dict(
    n_epochs=4,
    sample_size=5000,
    batch_size=64,
    embed_dim=8,
    num_heads=2,
    num_blocks=2,
    n_qubits_transformer=8,
    n_qubits_ffn=8,
    n_qlayers=3,
    tqdm_disabled=False,
    batch=True,
    circuit_type="tensorcircuit",
)

train_loss, train_acc, val_loss, val_acc, train_auc, val_auc, best_dict = main(
    **tc_batch_config
)

# Snapshot this run's results; the unpacked names are reused by later runs.
quantum_tc_cpu_metrics_batch = dict(
    train_loss=train_loss,
    train_acc=train_acc,
    val_loss=val_loss,
    val_acc=val_acc,
    train_auc=train_auc,
    val_auc=val_auc,
    best_dict=best_dict,
)

train_data:  4000
pos:  2024
neg:  1976
pos:  499
neg:  501
pos:  1271
neg:  1229
++ There will be 2 transformer blocks
++ Transformer will use 8 qubits and 3 q layers
The feed-forward head will use 8 qubits
Using TensorCircuit
weight_shapes = (n_qlayers, n_qubits) = (3, 8)
The model has 6,753 trainable parameters


Epoch   1/4:   0%|          | 0/63 [00:00<?, ?batch/s]2024-09-24 15:14:54.002367: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2024-09-24 15:14:54.002519: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8246 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2024-09-24 15:14:54.002815: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2024-09-24 15:14:54.002974: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30828 MB memory:  -> device: 1, name: Tesla V100-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability:



Epoch   1/4:  98%|█████████▊| 62/63 [01:25<00:00,  1.99batch/s]



Epoch   1/4: 100%|██████████| 63/63 [02:49<00:00,  2.70s/batch, Epoch = 2m 49s, Loss = 0.6674|0.6166, Acc = 0.606|0.681, AUC = 64.764|83.010]
Epoch   2/4: 100%|██████████| 63/63 [00:35<00:00,  1.76batch/s, Epoch = 0m 35s, Loss = 0.5737|0.5644, Acc = 0.762|0.710, AUC = 83.364|84.974]
Epoch   3/4: 100%|██████████| 63/63 [00:34<00:00,  1.82batch/s, Epoch = 0m 34s, Loss = 0.5279|0.4978, Acc = 0.764|0.780, AUC = 84.461|86.164]
Epoch   4/4: 100%|██████████| 63/63 [00:35<00:00,  1.76batch/s, Epoch = 0m 35s, Loss = 0.4869|0.4881, Acc = 0.806|0.774, AUC = 87.951|86.239]

TOTAL TIME = 276.00s
BEST ACC = 0.78% AT EPOCH 3
BEST AUC = 86.24 AT EPOCH 4



