# Artificial Neural Networks and Deep Learning

---

## Lecture 8a: Attention Is All You Need

<img src="https://drive.google.com/uc?export=view&id=16IF6WfD_-vJ25FHgfhEmTvzI6QGctzhQ" width="500"/>



## ⚙️ Import Libraries

In [1]:
# Notebook setup: fix random seeds, silence library logging, import
# TensorFlow/Keras and the plotting stack, and configure plot defaults.

# Single seed value reused by every random number generator seeded below
seed = 42

# Import necessary libraries
import os

# Set environment variables before TensorFlow and matplotlib are imported below
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel — confirm
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's config directory at ./configs/ under the current working directory
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Warning is the base class of every warning category, so this filter also
# covers the FutureWarning filter above
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Seed the NumPy and Python standard-library random number generators
np.random.seed(seed)
random.seed(seed)

# Import TensorFlow and Keras (tfk = tf.keras, tfkl = tf.keras.layers)
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl

# Seed TensorFlow through both the current and the compat.v1 API
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)

# Reduce TensorFlow verbosity (autograph, the Python logger, and compat.v1 logging)
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Print TensorFlow version so the recorded outputs can be tied to a release
print(tf.__version__)

# Import other libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plot display settings: larger fonts, white background, inline figures
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

2.17.1


## 🛠️ Scaled Dot-Product Attention


<img src="https://drive.google.com/uc?export=view&id=1bHYwJ61EZQGebSYQqyzh5byKDDruGmls" width="350"/>

In [2]:
# Toy problem sizes shared by the attention examples
batch_size = 2
seq_len_q = 4    # number of query positions
seq_len_kv = 6   # number of key/value positions
d_model = 8      # size of the last (feature) axis of Q, K and V

# Sample Q, K and V with tf.random.normal (sampling order Q, K, V is kept so
# seeded runs draw the same values); K and V share seq_len_kv
Q = tf.random.normal(shape=(batch_size, seq_len_q, d_model))
K = tf.random.normal(shape=(batch_size, seq_len_kv, d_model))
V = tf.random.normal(shape=(batch_size, seq_len_kv, d_model))

# Report the three shapes, one per line
print(
    f"Q shape: {Q.shape}",
    f"K shape: {K.shape}",
    f"V shape: {V.shape}",
    sep="\n",
)

Q shape: (2, 4, 8)
K shape: (2, 6, 8)
V shape: (2, 6, 8)


**Scaled Dot-Product Attention - Cross-Attention**

<img src="https://drive.google.com/uc?export=view&id=1sD-Vx2-kwyU4Nq-wthWHkCfrMSyp-85g" width="600"/>


In [3]:
# Dot-product attention layer, built through the `tfkl` alias for tf.keras.layers
# NOTE(review): per the Keras docs, `use_scale=True` adds a learnable scalar on
# the scores rather than the fixed 1/sqrt(d_k) factor — confirm this is intended
cross_attn = tfkl.Attention(score_mode='dot', use_scale=True)

# Keras expects the inputs ordered as [query, value, key], hence [Q, V, K]:
# the queries come from one sequence, the keys and values from another
cross_output = cross_attn([Q, V, K])

# Show the shape of the attended output
print(f"Cross-Attention output shape: {cross_output.shape}")

Cross-Attention output shape: (2, 4, 8)


**Scaled Dot-Product Attention - Self-Attention**

In [4]:
# Dot-product attention layer, built through the `tfkl` alias for tf.keras.layers
# NOTE(review): per the Keras docs, `use_scale=True` adds a learnable scalar on
# the scores rather than the fixed 1/sqrt(d_k) factor — confirm this is intended
self_attn = tfkl.Attention(score_mode='dot', use_scale=True)

# Self-attention: the same tensor is passed as query, value and key
self_output = self_attn([Q, Q, Q])

# Show the shape of the attended output
print(f"Self-Attention output shape: {self_output.shape}")

Self-Attention output shape: (2, 4, 8)


## 🛠️ Multi-Head Attention


<img src="https://drive.google.com/uc?export=view&id=1SD0qnvFWCHRwop0TTWeQPzdLPKyFO2bD" width="350"/>

In [5]:
# Split the model dimension evenly across the attention heads
num_heads = 4
head_dim = d_model // num_heads  # integer division: features handled by each head

# Show the resulting per-head dimension
print(f"Head dimension: {head_dim}")

Head dimension: 2


**Scaled Multi-Head Dot-Product Attention - Cross-Attention**

<img src="https://drive.google.com/uc?export=view&id=11R6PgMYOVs1bWsseDZhMq16mTBYaWkZ0" width="800"/>


In [6]:
# Define a multi-head attention layer with 2 heads and specified key dimension
mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=head_dim)

# Compute multi-head attention output and attention scores
output, attention_heads = mha(query=Q, value=V, key=K, return_attention_scores=True)

# Print shapes of the multi-head attention output and attention weights
print(f"Multi-Head Attention output shape: {output.shape}")
print(f"Multi-Head Attention weights shape: {attention_heads.shape}")

Multi-Head Attention output shape: (2, 4, 8)
Multi-Head Attention weights shape: (2, 2, 4, 6)


**Scaled Multi-Head Dot-Product Attention - Self-Attention**

In [7]:
# Define a multi-head attention layer with 2 heads and specified key dimension
mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=head_dim)

# Compute multi-head self-attention output and attention scores
output, attention_heads = mha(query=Q, value=Q, key=Q, return_attention_scores=True)

# Print shapes of the multi-head attention output and attention weights
print(f"Multi-Head Attention output shape: {output.shape}")
print(f"Multi-Head Attention weights shape: {attention_heads.shape}")

Multi-Head Attention output shape: (2, 4, 8)
Multi-Head Attention weights shape: (2, 2, 4, 4)
