In [1]:
import os
import sys

# Step up one directory and make that directory importable.
# NOTE(review): this is relative to the kernel's current directory, so running
# the cell a second time climbs one level further — run it once per kernel.
os.chdir(os.pardir)
project_root = os.getcwd()
sys.path.append(project_root)
project_root

'/group/pmc026/nchoong/QuantumTransformer'

In [2]:
import random
import numpy as np
import torch
import tensorflow as tf

2024-09-23 23:04:29.978037: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 23:04:29.990926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-23 23:04:30.005852: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-23 23:04:30.010397: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-23 23:04:30.021576: I tensorflow/core/platform/cpu_feature_guar

In [3]:
seed = 42

# Seed every RNG the notebook touches (Python, NumPy, TensorFlow, PyTorch) so
# the randomly generated inputs are reproducible from a fresh kernel.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up; setting it here
# is presumably only effective for child processes — confirm that is intended.
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
torch.manual_seed(seed)

# Ask both frameworks for deterministic kernels.
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Thread-pool sizing. The cell previously set TensorFlow's pools to 1 and then
# immediately overrode them with the CPU count; only the final (effective)
# value is kept. Setting a single value also keeps the cell re-runnable:
# TensorFlow appears to reject *changes* to these settings once its runtime is
# initialised, whereas re-applying the same value is accepted.
# os.cpu_count() may return None, so fall back to a single thread.
num_threads = os.cpu_count() or 1

os.environ["OMP_NUM_THREADS"] = str(num_threads)
os.environ["TF_NUM_INTEROP_THREADS"] = str(num_threads)
os.environ["TF_NUM_INTRAOP_THREADS"] = str(num_threads)

tf.config.threading.set_intra_op_parallelism_threads(num_threads)
tf.config.threading.set_inter_op_parallelism_threads(num_threads)

In [4]:
# Tensor layout used throughout: (batch_size, sequence_length, embedding_dim)
batch_size, sequence_length, embedding_dim = 32, 128, 8

# Draw random embeddings from NumPy's global RNG and store them as float32
word_embeddings = np.random.rand(
    batch_size, sequence_length, embedding_dim
).astype(np.float32)

# Sanity-check the resulting shape
print(f"Word embeddings shape: {word_embeddings.shape}")

Word embeddings shape: (32, 128, 8)


In [5]:
import math
import tensorflow as tf
from tensorflow.keras import layers


def attention_tf(
    query: tf.Tensor,
    key: tf.Tensor,
    value: tf.Tensor,
    mask: tf.Tensor | None = None,
    dropout: layers.Dropout | None = None,
) -> tuple[tf.Tensor, tf.Tensor]:
    """Scaled dot-product attention (TensorFlow implementation).

    Computes ``softmax(query @ key^T / sqrt(d_k)) @ value``, where ``d_k`` is
    the size of the last axis of ``query``.

    Args:
        query: Query tensor; its last axis defines ``d_k``.
        key: Key tensor; multiplied against ``query`` with its last two axes
            transposed.
        value: Value tensor, combined using the attention weights.
        mask: Optional mask. A new axis is inserted at position 1 before it is
            applied; positions where the mask equals 0 are filled with -1e9
            prior to the softmax so they receive (near-)zero weight.
        dropout: Optional dropout layer applied to the attention weights.

    Returns:
        A tuple ``(attn, scores)``: the attention output and the attention
        weights (after dropout, if one was supplied).
    """
    logits = tf.matmul(query, key, transpose_b=True)
    # Take the scale factor's dtype from the logits instead of hard-coding
    # float32, so non-float32 inputs (e.g. float64) do not fail with a dtype
    # mismatch on the division. For float32 inputs this is unchanged.
    dim_k = tf.cast(tf.shape(query)[-1], logits.dtype)
    scaled = logits / tf.math.sqrt(dim_k)
    if mask is not None:
        mask = tf.expand_dims(mask, 1)
        scaled = tf.where(mask == 0, -1e9, scaled)
    scores = tf.nn.softmax(scaled, axis=-1)
    if dropout is not None:
        scores = dropout(scores)
    attn = tf.matmul(scores, value)
    return attn, scores

In [6]:
from torch import Tensor
from torch.nn import Dropout


def attention_torch(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    mask: Tensor | None = None,
    dropout: Dropout | None = None,
):
    """Scaled Dot Product Attention"""
    dim_k = query.size(-1)
    # scaled = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(dim_k)
    scaled = (query @ key.transpose(-2, -1)) / math.sqrt(dim_k)
    if mask is not None:
        mask = mask.unsqueeze(1)
        scaled = scaled.masked_fill(mask == 0, -1e9)
    scores = scaled.softmax(-1)
    if dropout is not None:
        scores: Tensor = dropout(scores)
    # attn = torch.matmul(scores, value)
    attn = scores @ value
    return attn, scores

In [7]:
# Multi-head setup: the embedding width is split evenly across the heads.
batch_size, sequence_length, embedding_dim = 32, 128, 8
num_heads = 2
dim_k = embedding_dim // num_heads  # feature width handled by each head

# Fresh random float32 embeddings shared by the TensorFlow and PyTorch paths.
word_embeddings = np.random.rand(
    batch_size, sequence_length, embedding_dim
).astype(np.float32)

print(f"Word embeddings shape: {word_embeddings.shape}")

Word embeddings shape: (32, 128, 8)


In [8]:
# Split the embedding axis into (num_heads, dim_k), then move the head axis in
# front of the sequence axis: (batch, seq, heads, dim_k) -> (batch, heads, seq, dim_k).
per_head_tf = tf.reshape(word_embeddings, (batch_size, -1, num_heads, dim_k))
input_tf = tf.transpose(per_head_tf, perm=(0, 2, 1, 3))
input_tf.shape

2024-09-23 23:04:36.094387: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13968 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2024-09-23 23:04:36.094877: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30828 MB memory:  -> device: 1, name: Tesla V100-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability: 7.0


TensorShape([32, 2, 128, 4])

In [9]:
# Same head split as the TensorFlow input: view the embedding axis as
# (num_heads, dim_k), then swap the sequence and head axes.
per_head_torch = torch.from_numpy(word_embeddings).view(
    batch_size, -1, num_heads, dim_k
)
input_torch = per_head_torch.transpose(1, 2)
input_torch.shape

torch.Size([32, 2, 128, 4])

In [10]:
# Self-attention: the same tensor serves as query, key and value.
attn_tf, scores_tf = attention_tf(query=input_tf, key=input_tf, value=input_tf)

In [11]:
# Self-attention: the same tensor serves as query, key and value.
attn_torch, scores_torch = attention_torch(
    query=input_torch, key=input_torch, value=input_torch
)

In [12]:
# Bring both frameworks' results into NumPy so they can be compared directly.
attn_tf_np, scores_tf_np = attn_tf.numpy(), scores_tf.numpy()
attn_torch_np, scores_torch_np = (
    tensor.detach().cpu().numpy() for tensor in (attn_torch, scores_torch)
)

# Approximate equality with an absolute tolerance of 1e-6, to absorb small
# floating-point differences between the two implementations.
attn_close = np.allclose(attn_tf_np, attn_torch_np, atol=1e-6)
scores_close = np.allclose(scores_tf_np, scores_torch_np, atol=1e-6)

# Report the verdicts
print(f"Are the attention outputs close? {attn_close}")
print(f"Are the attention scores close? {scores_close}")

# Also report the largest element-wise deviation for each result
attn_max_diff = np.max(np.abs(attn_tf_np - attn_torch_np))
scores_max_diff = np.max(np.abs(scores_tf_np - scores_torch_np))
print(f"Max absolute difference in attention outputs: {attn_max_diff}")
print(f"Max absolute difference in attention scores: {scores_max_diff}")

Are the attention outputs close? True
Are the attention scores close? True
Max absolute difference in attention outputs: 2.384185791015625e-07
Max absolute difference in attention scores: 3.725290298461914e-09
