<a href="https://colab.research.google.com/github/Guptaraj06/Transformer-/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade --force-reinstall BPEmb

Collecting BPEmb
  Using cached bpemb-0.3.6-py3-none-any.whl.metadata (19 kB)
Collecting gensim (from BPEmb)
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy (from BPEmb)
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests (from BPEmb)
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting sentencepiece (from BPEmb)
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tqdm (from BPEmb)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy (from BPEmb)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x8

In [4]:
import math
import numpy as np
import tensorflow as tf

from bpemb import BPEmb

In [5]:
def scaled_dot_product_attention(query, key, value, mask=None):
  key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
  scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

  if mask is not None:
    scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

  softmax = tf.keras.layers.Softmax()
  weights = softmax(scaled_scores)
  return tf.matmul(weights, value), weights

In [6]:
seq_len = 3
embed_dim = 4

queries = np.random.rand(seq_len, embed_dim)
keys = np.random.rand(seq_len, embed_dim)
values = np.random.rand(seq_len, embed_dim)

print("Queries:\n", queries)

Queries:
 [[0.07244374 0.6554847  0.99981747 0.09143517]
 [0.6292453  0.99194708 0.91442838 0.29080987]
 [0.71472839 0.38656586 0.42850067 0.53985445]]


In [7]:
output, attn_weights = scaled_dot_product_attention(queries, keys, values)

print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.6232728  0.49103945 0.3424313  0.46579868]
 [0.62743866 0.4887352  0.34538543 0.45825282]
 [0.63015544 0.49025863 0.34769306 0.45300466]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.32080856 0.3816126  0.2975788 ]
 [0.31102267 0.37526822 0.3137091 ]
 [0.31007025 0.36863434 0.3212954 ]], shape=(3, 3), dtype=float32)


In [8]:
batch_size = 1
seq_len = 3
embed_dim = 12
num_heads = 3
head_dim = embed_dim // num_heads

print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


In [9]:
x = np.random.rand(batch_size, seq_len, embed_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Input shape:  (1, 3, 12) 

Input:
 [[[0.7 0.4 0.2 0.6 0.4 0.9 0.6 0.2 0.8 1.  0.2 0.6]
  [0.7 0.2 0.6 0.9 0.8 0.4 0.5 0.9 0.5 0.9 0.5 0.1]
  [0.1 0.6 0.4 0.1 0.3 0.2 0.6 0.5 0.6 0.  0.7 0.5]]]


In [10]:
# The query weights for each head.
wq0 = np.random.rand(embed_dim, head_dim).round(1)
wq1 = np.random.rand(embed_dim, head_dim).round(1)
wq2 = np.random.rand(embed_dim, head_dim).round(1)

# The key weights for each head.
wk0 = np.random.rand(embed_dim, head_dim).round(1)
wk1 = np.random.rand(embed_dim, head_dim).round(1)
wk2 = np.random.rand(embed_dim, head_dim).round(1)

# The value weights for each head.
wv0 = np.random.rand(embed_dim, head_dim).round(1)
wv1 = np.random.rand(embed_dim, head_dim).round(1)
wv2 = np.random.rand(embed_dim, head_dim).round(1)

In [11]:
print("The three sets of query weights (one for each head):")
print("wq0:\n", wq0)
print("wq1:\n", wq1)
print("wq2:\n", wq1)

The three sets of query weights (one for each head):
wq0:
 [[0.2 0.7 0.8 0.2]
 [0.6 0.  0.7 0.4]
 [0.1 0.9 0.5 0.9]
 [0.  0.1 0.6 0.4]
 [0.1 0.6 0.9 0.9]
 [0.5 0.9 0.4 0.4]
 [0.4 0.6 0.1 0.3]
 [0.9 0.  0.5 0.6]
 [1.  0.6 0.3 0.2]
 [0.5 0.9 0.1 0.1]
 [0.5 0.8 0.2 0. ]
 [0.1 0.6 0.9 0.6]]
wq1:
 [[0.  0.7 0.1 0.2]
 [1.  0.5 0.2 0.5]
 [0.4 0.8 0.9 0.8]
 [1.  0.9 0.6 0.9]
 [0.5 0.  0.6 0.4]
 [0.6 0.5 1.  0.1]
 [0.6 0.6 0.4 0.3]
 [0.6 1.  0.3 0. ]
 [0.9 0.6 0.2 0.7]
 [0.  0.9 0.9 1. ]
 [0.5 0.7 0.7 0.8]
 [0.7 1.  0.8 0.8]]
wq2:
 [[0.  0.7 0.1 0.2]
 [1.  0.5 0.2 0.5]
 [0.4 0.8 0.9 0.8]
 [1.  0.9 0.6 0.9]
 [0.5 0.  0.6 0.4]
 [0.6 0.5 1.  0.1]
 [0.6 0.6 0.4 0.3]
 [0.6 1.  0.3 0. ]
 [0.9 0.6 0.2 0.7]
 [0.  0.9 0.9 1. ]
 [0.5 0.7 0.7 0.8]
 [0.7 1.  0.8 0.8]]


In [12]:
# Geneated queries, keys, and values for the first head.
q0 = np.dot(x, wq0)
k0 = np.dot(x, wk0)
v0 = np.dot(x, wv0)

# Geneated queries, keys, and values for the second head.
q1 = np.dot(x, wq1)
k1 = np.dot(x, wk1)
v1 = np.dot(x, wv1)

# Geneated queries, keys, and values for the third head.
q2 = np.dot(x, wq2)
k2 = np.dot(x, wk2)
v2 = np.dot(x, wv2)

In [13]:
print("Q, K, and V for first head:\n")

print(f"q0 {q0.shape}:\n", q0, "\n")
print(f"k0 {k0.shape}:\n", k0, "\n")
print(f"v0 {v0.shape}:\n", v0)

Q, K, and V for first head:

q0 (1, 3, 4):
 [[[2.77 4.04 3.1  2.36]
  [2.82 3.83 3.35 2.94]
  [2.24 2.38 2.19 1.91]]] 

k0 (1, 3, 4):
 [[[2.96 3.28 3.49 3.3 ]
  [3.32 3.53 3.28 3.97]
  [2.35 2.01 2.1  2.37]]] 

v0 (1, 3, 4):
 [[[4.05 3.09 4.18 1.69]
  [4.58 3.55 4.11 1.91]
  [3.18 2.73 3.44 1.25]]]


In [14]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)

print("Output from first attention head: ", out0, "\n")
print("Attention weights from first head: ", attn_weights0)

Output from first attention head:  tf.Tensor(
[[[4.4805045 3.4637399 4.1229367 1.8686811]
  [4.4922047 3.4738603 4.121465  1.8735445]
  [4.4455504 3.434285  4.1256437 1.8539957]]], shape=(1, 3, 4), dtype=float32) 

Attention weights from first head:  tf.Tensor(
[[[1.8709546e-01 8.1266534e-01 2.3918200e-04]
  [1.6524872e-01 8.3459866e-01 1.5258271e-04]
  [2.4714653e-01 7.5038081e-01 2.4726272e-03]]], shape=(1, 3, 3), dtype=float32)


In [15]:
out1, _ = scaled_dot_product_attention(q1, k1, v1)
out2, _ = scaled_dot_product_attention(q2, k2, v2)

print("Output from second attention head: ", out1, "\n")
print("Output from third attention head: ", out2,)

Output from second attention head:  tf.Tensor(
[[[4.186841  2.5665944 1.9727039 3.4143689]
  [4.1881313 2.5652971 1.9725096 3.4149556]
  [4.1624804 2.5839424 1.9748427 3.4025478]]], shape=(1, 3, 4), dtype=float32) 

Output from third attention head:  tf.Tensor(
[[[3.7698596 3.3104713 3.7910852 3.2488465]
  [3.7704666 3.3108099 3.7911572 3.2480586]
  [3.759783  3.3036513 3.78759   3.2570388]]], shape=(1, 3, 4), dtype=float32)


In [16]:
combined_out_a = np.concatenate((out0, out1, out2), axis=-1)
print(f"Combined output from all heads {combined_out_a.shape}:")
print(combined_out_a)


Combined output from all heads (1, 3, 12):
[[[4.4805045 3.4637399 4.1229367 1.8686811 4.186841  2.5665944 1.9727039
   3.4143689 3.7698596 3.3104713 3.7910852 3.2488465]
  [4.4922047 3.4738603 4.121465  1.8735445 4.1881313 2.5652971 1.9725096
   3.4149556 3.7704666 3.3108099 3.7911572 3.2480586]
  [4.4455504 3.434285  4.1256437 1.8539957 4.1624804 2.5839424 1.9748427
   3.4025478 3.759783  3.3036513 3.78759   3.2570388]]]


In [17]:
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.2 0.7 0.8 0.2]
 [0.6 0.  0.7 0.4]
 [0.1 0.9 0.5 0.9]
 [0.  0.1 0.6 0.4]
 [0.1 0.6 0.9 0.9]
 [0.5 0.9 0.4 0.4]
 [0.4 0.6 0.1 0.3]
 [0.9 0.  0.5 0.6]
 [1.  0.6 0.3 0.2]
 [0.5 0.9 0.1 0.1]
 [0.5 0.8 0.2 0. ]
 [0.1 0.6 0.9 0.6]] 

Query weights for second head: 
 [[0.  0.7 0.1 0.2]
 [1.  0.5 0.2 0.5]
 [0.4 0.8 0.9 0.8]
 [1.  0.9 0.6 0.9]
 [0.5 0.  0.6 0.4]
 [0.6 0.5 1.  0.1]
 [0.6 0.6 0.4 0.3]
 [0.6 1.  0.3 0. ]
 [0.9 0.6 0.2 0.7]
 [0.  0.9 0.9 1. ]
 [0.5 0.7 0.7 0.8]
 [0.7 1.  0.8 0.8]] 

Query weights for third head: 
 [[0.3 0.9 0.4 0.5]
 [0.3 0.3 0.6 0.1]
 [0.3 0.  0.  0.1]
 [0.2 0.2 0.2 0.7]
 [0.4 0.1 0.9 0.7]
 [0.5 0.5 0.4 0.2]
 [0.1 0.4 0.1 0.9]
 [0.7 0.3 0.2 0.1]
 [0.4 0.2 0.9 0.8]
 [0.8 0.6 0.1 0.6]
 [0.1 0.5 0.8 0.3]
 [0.3 0.8 1.  0.6]]


In [18]:
wq = np.concatenate((wq0, wq1, wq2), axis=1)
print(f"Single query weight matrix {wq.shape}: \n", wq)

Single query weight matrix (12, 12): 
 [[0.2 0.7 0.8 0.2 0.  0.7 0.1 0.2 0.3 0.9 0.4 0.5]
 [0.6 0.  0.7 0.4 1.  0.5 0.2 0.5 0.3 0.3 0.6 0.1]
 [0.1 0.9 0.5 0.9 0.4 0.8 0.9 0.8 0.3 0.  0.  0.1]
 [0.  0.1 0.6 0.4 1.  0.9 0.6 0.9 0.2 0.2 0.2 0.7]
 [0.1 0.6 0.9 0.9 0.5 0.  0.6 0.4 0.4 0.1 0.9 0.7]
 [0.5 0.9 0.4 0.4 0.6 0.5 1.  0.1 0.5 0.5 0.4 0.2]
 [0.4 0.6 0.1 0.3 0.6 0.6 0.4 0.3 0.1 0.4 0.1 0.9]
 [0.9 0.  0.5 0.6 0.6 1.  0.3 0.  0.7 0.3 0.2 0.1]
 [1.  0.6 0.3 0.2 0.9 0.6 0.2 0.7 0.4 0.2 0.9 0.8]
 [0.5 0.9 0.1 0.1 0.  0.9 0.9 1.  0.8 0.6 0.1 0.6]
 [0.5 0.8 0.2 0.  0.5 0.7 0.7 0.8 0.1 0.5 0.8 0.3]
 [0.1 0.6 0.9 0.6 0.7 1.  0.8 0.8 0.3 0.8 1.  0.6]]


In [19]:
wk = np.concatenate((wk0, wk1, wk2), axis=1)
wv = np.concatenate((wv0, wv1, wv2), axis=1)

print(f"Single key weight matrix {wk.shape}:\n", wk, "\n")
print(f"Single value weight matrix {wv.shape}:\n", wv)

Single key weight matrix (12, 12):
 [[0.1 0.8 0.1 0.3 0.6 0.2 0.2 0.1 0.7 1.  0.9 1. ]
 [0.2 0.6 0.4 0.2 0.3 0.5 0.4 0.8 0.  0.6 0.7 0.2]
 [0.2 1.  0.8 0.4 0.8 0.7 0.2 0.4 0.1 0.9 0.2 0.8]
 [0.7 0.1 0.3 1.  0.5 0.6 0.6 0.6 0.6 0.3 0.5 0.7]
 [0.9 0.9 0.8 0.9 0.3 0.9 0.4 0.6 0.5 0.3 0.7 0.7]
 [0.5 0.  0.8 0.5 0.9 0.5 0.6 0.1 0.9 0.9 0.4 0.4]
 [0.5 0.3 0.3 0.9 0.  0.4 0.4 0.7 0.9 1.  0.1 0.4]
 [0.3 0.  0.1 0.3 0.6 0.1 0.9 0.2 0.6 0.  1.  0.5]
 [0.7 0.7 0.8 0.2 0.1 0.9 0.7 0.2 0.2 0.5 0.9 0.3]
 [0.2 0.9 0.7 0.4 0.1 0.3 0.2 0.4 0.1 0.1 0.8 0.8]
 [0.9 0.2 0.2 0.9 0.9 0.8 0.6 0.9 0.3 0.2 1.  0.3]
 [0.4 0.3 0.5 0.3 0.1 0.3 0.1 0.8 0.3 0.4 0.9 0.8]] 

Single value weight matrix (12, 12):
 [[0.6 1.  0.5 0.5 0.3 0.5 0.9 0.3 0.9 0.6 1.  0.1]
 [0.3 0.5 0.5 0.2 0.2 0.7 0.7 0.7 0.6 0.8 0.5 0.4]
 [0.7 0.6 0.6 0.8 0.2 0.  0.3 0.1 0.8 0.4 0.4 0.6]
 [0.8 0.4 0.2 0.  0.8 0.3 0.2 1.  0.6 0.2 0.9 0.5]
 [0.5 0.  0.7 0.5 0.3 0.5 0.1 0.6 0.2 0.3 0.9 0.3]
 [0.9 0.1 0.9 0.6 0.7 0.9 0.  0.4 1.  0.4 0.5 0.6]
 [1. 

In [20]:
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [21]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[2.77 4.04 3.1  2.36 3.54 4.52 3.81 3.67 2.64 3.   3.04 3.51]
  [2.82 3.83 3.35 2.94 3.59 4.84 3.88 3.77 2.83 2.59 2.73 3.39]
  [2.24 2.38 2.19 1.91 3.03 3.09 2.33 2.43 1.44 1.68 2.53 2.05]]]


In [22]:
print(q0, "\n")
print(q1, "\n")
print(q2)

[[[2.77 4.04 3.1  2.36]
  [2.82 3.83 3.35 2.94]
  [2.24 2.38 2.19 1.91]]] 

[[[3.54 4.52 3.81 3.67]
  [3.59 4.84 3.88 3.77]
  [3.03 3.09 2.33 2.43]]] 

[[[2.64 3.   3.04 3.51]
  [2.83 2.59 2.73 3.39]
  [1.44 1.68 2.53 2.05]]]


In [23]:
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))
print(f"Combined queries: {q_s.shape}\n", q_s, "\n")
print(f"Reshaped into separate heads: {q_s_reshaped.shape}\n", q_s_reshaped)

Combined queries: (1, 3, 12)
 [[[2.77 4.04 3.1  2.36 3.54 4.52 3.81 3.67 2.64 3.   3.04 3.51]
  [2.82 3.83 3.35 2.94 3.59 4.84 3.88 3.77 2.83 2.59 2.73 3.39]
  [2.24 2.38 2.19 1.91 3.03 3.09 2.33 2.43 1.44 1.68 2.53 2.05]]] 

Reshaped into separate heads: (1, 3, 3, 4)
 tf.Tensor(
[[[[2.77 4.04 3.1  2.36]
   [3.54 4.52 3.81 3.67]
   [2.64 3.   3.04 3.51]]

  [[2.82 3.83 3.35 2.94]
   [3.59 4.84 3.88 3.77]
   [2.83 2.59 2.73 3.39]]

  [[2.24 2.38 2.19 1.91]
   [3.03 3.09 2.33 2.43]
   [1.44 1.68 2.53 2.05]]]], shape=(1, 3, 3, 4), dtype=float64)


In [24]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()
print(f"Queries transposed into \"separate\" heads {q_s_transposed.shape}:\n",
      q_s_transposed)

Queries transposed into "separate" heads (1, 3, 3, 4):
 [[[[2.77 4.04 3.1  2.36]
   [2.82 3.83 3.35 2.94]
   [2.24 2.38 2.19 1.91]]

  [[3.54 4.52 3.81 3.67]
   [3.59 4.84 3.88 3.77]
   [3.03 3.09 2.33 2.43]]

  [[2.64 3.   3.04 3.51]
   [2.83 2.59 2.73 3.39]
   [1.44 1.68 2.53 2.05]]]]


In [25]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2)

The separate per-head query matrices from before: 
[[[2.77 4.04 3.1  2.36]
  [2.82 3.83 3.35 2.94]
  [2.24 2.38 2.19 1.91]]] 

[[[3.54 4.52 3.81 3.67]
  [3.59 4.84 3.88 3.77]
  [3.03 3.09 2.33 2.43]]] 

[[[2.64 3.   3.04 3.51]
  [2.83 2.59 2.73 3.39]
  [1.44 1.68 2.53 2.05]]]


In [26]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()

print(f"Keys for all heads in a single matrix {k_s.shape}: \n", k_s_transposed, "\n")
print(f"Values for all heads in a single matrix {v_s.shape}: \n", v_s_transposed)

Keys for all heads in a single matrix (1, 3, 12): 
 [[[[2.96 3.28 3.49 3.3 ]
   [3.32 3.53 3.28 3.97]
   [2.35 2.01 2.1  2.37]]

  [[2.47 3.27 2.76 2.84]
   [3.15 3.56 3.29 3.05]
   [1.92 2.57 2.22 2.58]]

  [[3.04 3.61 4.41 3.92]
   [3.21 3.21 4.77 4.32]
   [1.82 2.36 3.18 2.18]]]] 

Values for all heads in a single matrix (1, 3, 12): 
 [[[[4.05 3.09 4.18 1.69]
   [4.58 3.55 4.11 1.91]
   [3.18 2.73 3.44 1.25]]

  [[3.88 3.02 2.05 3.29]
   [4.2  2.55 1.97 3.42]
   [2.22 1.19 1.55 2.17]]

  [[3.65 3.24 3.77 3.39]
   [3.82 3.34 3.8  3.19]
   [2.64 2.12 2.58 2.42]]]]


In [27]:
all_heads_output, all_attn_weights = scaled_dot_product_attention(q_s_transposed,
                                                                  k_s_transposed,
                                                                  v_s_transposed)
print("Self attention output:\n", all_heads_output)

Self attention output:
 tf.Tensor(
[[[[4.4805045 3.4637399 4.1229367 1.8686811]
   [4.4922047 3.4738603 4.121465  1.8735445]
   [4.4455504 3.434285  4.1256437 1.8539957]]

  [[4.186841  2.5665944 1.9727039 3.4143689]
   [4.1881313 2.5652971 1.9725096 3.4149556]
   [4.1624804 2.5839424 1.9748427 3.4025478]]

  [[3.7698596 3.3104713 3.7910852 3.2488465]
   [3.7704666 3.3108099 3.7911572 3.2480586]
   [3.759783  3.3036513 3.78759   3.2570388]]]], shape=(1, 3, 3, 4), dtype=float32)


In [28]:
print("Per head outputs from using separate sets of weights per head:")
print(out0, "\n")
print(out1, "\n")
print(out2)

Per head outputs from using separate sets of weights per head:
tf.Tensor(
[[[4.4805045 3.4637399 4.1229367 1.8686811]
  [4.4922047 3.4738603 4.121465  1.8735445]
  [4.4455504 3.434285  4.1256437 1.8539957]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[4.186841  2.5665944 1.9727039 3.4143689]
  [4.1881313 2.5652971 1.9725096 3.4149556]
  [4.1624804 2.5839424 1.9748427 3.4025478]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[3.7698596 3.3104713 3.7910852 3.2488465]
  [3.7704666 3.3108099 3.7911572 3.2480586]
  [3.759783  3.3036513 3.78759   3.2570388]]], shape=(1, 3, 4), dtype=float32)


In [29]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]),
                            shape=(batch_size, seq_len, embed_dim))
print("Final output from using single query, key, value matrices:\n",
      combined_out_b, "\n")
print("Final output from using separate query, key, value matrices per head:\n",
      combined_out_a)

Final output from using single query, key, value matrices:
 tf.Tensor(
[[[4.4805045 3.4637399 4.1229367 1.8686811 4.186841  2.5665944 1.9727039
   3.4143689 3.7698596 3.3104713 3.7910852 3.2488465]
  [4.4922047 3.4738603 4.121465  1.8735445 4.1881313 2.5652971 1.9725096
   3.4149556 3.7704666 3.3108099 3.7911572 3.2480586]
  [4.4455504 3.434285  4.1256437 1.8539957 4.1624804 2.5839424 1.9748427
   3.4025478 3.759783  3.3036513 3.78759   3.2570388]]], shape=(1, 3, 12), dtype=float32) 

Final output from using separate query, key, value matrices per head:
 [[[4.4805045 3.4637399 4.1229367 1.8686811 4.186841  2.5665944 1.9727039
   3.4143689 3.7698596 3.3104713 3.7910852 3.2488465]
  [4.4922047 3.4738603 4.121465  1.8735445 4.1881313 2.5652971 1.9725096
   3.4149556 3.7704666 3.3108099 3.7911572 3.2480586]
  [4.4455504 3.434285  4.1256437 1.8539957 4.1624804 2.5839424 1.9748427
   3.4025478 3.759783  3.3036513 3.78759   3.2570388]]]


In [30]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadSelfAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads

    self.d_head = self.d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(self.d_model)
    self.wk = tf.keras.layers.Dense(self.d_model)
    self.wv = tf.keras.layers.Dense(self.d_model)

    # Linear layer to generate the final output.
    self.dense = tf.keras.layers.Dense(self.d_model)

  def split_heads(self, x):
    batch_size = x.shape[0]

    split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
    return tf.transpose(split_inputs, perm=[0, 2, 1, 3])

  def merge_heads(self, x):
    batch_size = x.shape[0]

    merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

  def call(self, q, k, v, mask):
    qs = self.wq(q)
    ks = self.wk(k)
    vs = self.wv(v)

    qs = self.split_heads(qs)
    ks = self.split_heads(ks)
    vs = self.split_heads(vs)

    output, attn_weights = scaled_dot_product_attention(qs, ks, vs, mask)
    output = self.merge_heads(output)

    return self.dense(output), attn_weights


In [31]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[-0.8302276   0.9981551  -0.6166465   0.41377932  0.20204026
   -0.6003253  -0.14570332  0.5368283  -0.04216018 -0.43120876
    0.05330622  0.40257525]
  [-0.8447752   0.97126204 -0.64132744  0.37059593  0.26110017
   -0.5871915  -0.17058638  0.5650949  -0.07390344 -0.40180212
   -0.04957867  0.3991276 ]
  [-0.818789    0.9645257  -0.6341903   0.42398366  0.23524301
   -0.59173477 -0.15234709  0.5626024  -0.03205955 -0.41448346
    0.03488266  0.38659552]]], shape=(1, 3, 12), dtype=float32)


**ENCODER BLOCK**

We can now build our **Encoder Block**. In addition to the **Multi-Head Self Attention** layer, the **Encoder Block** also has **skip connections**, **layer normalization steps**, and a **two-layer feed-forward neural network**. The original **Attention Is All You Need** paper also included some **dropout** applied to the self-attention output which isn't shown in the illustration below (see references for a link to the paper).

<div>
<img src="https://drive.google.com/uc?export=view&id=1D8sLDyQMqqhCjHWOn-I7rZKHugWxFyLy" width="500"/>
</div

In [32]:
def feed_forward_network(d_model, hidden_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(hidden_dim, activation='relu'),
      tf.keras.layers.Dense(d_model)
  ])

In [33]:
class EncoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(EncoderBlock, self).__init__()

    self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()

  def call(self, x, training, mask):
    mhsa_output, attn_weights = self.mhsa(x, x, x, mask)
    mhsa_output = self.dropout1(mhsa_output, training=training)
    mhsa_output = self.layernorm1(x + mhsa_output)

    ffn_output = self.ffn(mhsa_output)
    ffn_output = self.dropout2(ffn_output, training=training)
    output = self.layernorm2(mhsa_output + ffn_output)

    return output, attn_weights

In [35]:
encoder_block = EncoderBlock(12, 3, 48)

block_output,  _ = encoder_block(x, training=True, mask=None)
print(f"Output from single encoder block {block_output.shape}:")
print(block_output)

Output from single encoder block (1, 3, 12):
tf.Tensor(
[[[ 0.5412606   1.4963095  -0.31285113  0.3370783   1.953231
    0.49531856 -0.7344841   0.24173859 -0.66327107 -1.3468627
   -0.6039763  -1.4034917 ]
  [ 1.2324975   0.8439106   0.34190533  0.69966465  1.4791296
   -0.2947243  -1.2980573   0.7729651  -1.2482455  -1.376853
   -0.08593917 -1.0662539 ]
  [-0.7572444   1.9571755  -0.0692152  -0.22665036  1.5720595
   -0.3995145  -0.06446893  0.15949897 -0.77617055 -1.9247897
    0.72583765 -0.19651775]]], shape=(1, 3, 12), dtype=float32)


## Word and Positional Embeddings


In [36]:
# Load the English tokenizer.
bpemb_en = BPEmb(lang="en")

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 938540.63B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:01<00:00, 3769898.38B/s]


In [37]:
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
print("Vocabulary size:", bpemb_vocab_size)
print("Embedding size:", bpemb_embed_size)

Vocabulary size: 10000
Embedding size: 100


In [38]:
bpemb_en.vectors[bpemb_en.words.index('car')]

array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

In [39]:
sample_sentence = "Where can I find a pizzeria?"
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?']


In [40]:
token_seq = np.array(bpemb_en.encode_ids("Where can I find a pizzeria?"))
print(token_seq)

[ 571  280  386 1934    4   24  248 4339  177 9967]


In [41]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embed_dim)
token_embeddings = token_embed(token_seq)

# The untrained embeddings for our sample sentence.
print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  Where can I find a pizzeria?
tf.Tensor(
[[-0.01450543  0.02134693 -0.0036183   0.00598484 -0.02785349 -0.0174439
   0.04653895  0.0369211   0.04923384  0.04816181  0.02228737 -0.02763956]
 [ 0.0414459  -0.01461869 -0.0191421   0.01305517  0.01281961  0.00851631
   0.00775314  0.03155413  0.00808637  0.03766078 -0.00080983 -0.01772932]
 [ 0.04571203  0.03214586  0.01618027  0.04937996  0.0201246   0.01081378
   0.00266125  0.00278854  0.00183207 -0.03170947 -0.02023299  0.03215679]
 [-0.02044579 -0.01229275 -0.00904544 -0.0004694   0.04278183  0.02019379
  -0.04377179 -0.03941356 -0.00212886  0.02343692  0.03784374  0.02503392]
 [ 0.01013063 -0.03259107  0.02430408  0.03284741  0.00524556 -0.03168879
   0.02142875  0.04988154 -0.04117832  0.02532664  0.01979554 -0.02448815]
 [ 0.02836113 -0.01666158 -0.04846765  0.00491304  0.02317781  0.03980999
  -0.02516168  0.00228211  0.01217722 -0.02781607  0.00016345  0.02943721]
 [-0.02624356 -0.03693829  0.01345748  0.00036063 

In [42]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embed_dim)

# Generate ids for each position of the token sequence.
pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)


In [43]:
# These are our positon embeddings.
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence\n", position_embeddings)

Position embeddings for the input sequence
 tf.Tensor(
[[-0.00831629  0.01386498  0.01974447 -0.03126182 -0.04692385 -0.00145738
   0.04558631 -0.01588348 -0.00100477 -0.03857248  0.01800474 -0.02239295]
 [-0.02437632 -0.01135335 -0.04522233  0.03176799 -0.02202369 -0.04102746
  -0.0046704  -0.0480051  -0.0365089  -0.02792769 -0.02262238  0.03269905]
 [ 0.01998294 -0.00043358  0.00814551 -0.00569183 -0.01230292 -0.00158697
   0.01354008  0.03389737  0.01760871 -0.02270676  0.04798969  0.02421117]
 [-0.008735    0.00570544 -0.01366899  0.0491447   0.04351444 -0.00563989
   0.01414985  0.03515251 -0.04457539  0.00775838  0.00308517  0.03688708]
 [ 0.02519131 -0.0375204  -0.01382179  0.03516263 -0.01076392  0.03078505
   0.03823545 -0.04239638 -0.02002363  0.01262202  0.03886297 -0.01305878]
 [ 0.03360437 -0.01538021 -0.02679813  0.03326312 -0.02849556 -0.0237576
   0.03702505 -0.03466476 -0.01466482  0.00467448  0.02809245 -0.02087338]
 [ 0.036658    0.04760261  0.03292108  0.02568975  0

In [44]:
input = token_embeddings + position_embeddings
print("Input to the initial encoder block:\n", input)

Input to the initial encoder block:
 tf.Tensor(
[[-0.02282172  0.03521191  0.01612617 -0.02527698 -0.07477734 -0.01890128
   0.09212527  0.02103761  0.04822908  0.00958933  0.04029211 -0.05003251]
 [ 0.01706958 -0.02597205 -0.06436443  0.04482316 -0.00920408 -0.03251115
   0.00308274 -0.01645098 -0.02842252  0.0097331  -0.02343221  0.01496973]
 [ 0.06569497  0.03171229  0.02432578  0.04368813  0.00782168  0.00922681
   0.01620133  0.03668592  0.01944077 -0.05441623  0.0277567   0.05636796]
 [-0.02918079 -0.00658732 -0.02271444  0.0486753   0.08629628  0.0145539
  -0.02962194 -0.00426105 -0.04670426  0.03119529  0.04092891  0.06192101]
 [ 0.03532194 -0.07011147  0.01048229  0.06801004 -0.00551837 -0.00090374
   0.0596642   0.00748517 -0.06120196  0.03794867  0.05865851 -0.03754693]
 [ 0.0619655  -0.0320418  -0.07526578  0.03817616 -0.00531775  0.01605239
   0.01186336 -0.03238266 -0.0024876  -0.02314159  0.02825589  0.00856383]
 [ 0.01041444  0.01066432  0.04637856  0.02605039  0.028589

## Encoder

In [60]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(src_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    # The original Attention Is All You Need paper applied dropout to the
    # input before feeding it to the first encoder block.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Create encoder blocks.
    self.blocks = [EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate)
    for _ in range(num_blocks)]

  def call(self, input, training, mask):
    token_embeds = self.token_embed(input)

    # Generate position indices for a batch of input sequences.
    num_pos = input.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, input.shape)
    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    # Run input through successive encoder blocks.
    for block in self.blocks:
      x, weights = block(x, training=training, mask=mask)

    return x, weights

In [61]:
# Batch of 3 sequences, each of length 10 (10 is also the
# maximum sequence length in this case).
seqs = np.random.randint(0, 10000, size=(3, 10))
print(seqs.shape)
print(seqs)

(3, 10)
[[9616 8059 8568 9802 5152 3024 7152  925 4310 6138]
 [4632  791   36 9996 3810 5654 6623 6111 7432  837]
 [6771 1181  841 2368 7707  486 8080 3942 1649 6962]]


In [62]:
pos_ids = np.resize(np.arange(seqs.shape[1]), seqs.shape[0] * seqs.shape[1])
print(pos_ids)

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [63]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [64]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[-0.00831629,  0.01386498,  0.01974447, -0.03126182,
         -0.04692385, -0.00145738,  0.04558631, -0.01588348,
         -0.00100477, -0.03857248,  0.01800474, -0.02239295],
        [-0.02437632, -0.01135335, -0.04522233,  0.03176799,
         -0.02202369, -0.04102746, -0.0046704 , -0.0480051 ,
         -0.0365089 , -0.02792769, -0.02262238,  0.03269905],
        [ 0.01998294, -0.00043358,  0.00814551, -0.00569183,
         -0.01230292, -0.00158697,  0.01354008,  0.03389737,
          0.01760871, -0.02270676,  0.04798969,  0.02421117],
        [-0.008735  ,  0.00570544, -0.01366899,  0.0491447 ,
          0.04351444, -0.00563989,  0.01414985,  0.03515251,
         -0.04457539,  0.00775838,  0.00308517,  0.03688708],
        [ 0.02519131, -0.0375204 , -0.01382179,  0.03516263,
         -0.01076392,  0.03078505,  0.03823545, -0.04239638,
         -0.02002363,  0.01262202,  0.03886297, -0.01305878],
        [ 0.03360437, -0.01

In [65]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [66]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

In [67]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


In [68]:
enc_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print("Input:")
print(padded_input_seqs, '\n')
print("Encoder mask:")
print(enc_mask)

Input:
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]] 

Encoder mask:
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], shape=(3, 10), dtype=float32)


In [69]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

<tf.Tensor: shape=(3, 1, 1, 10), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]], dtype=float32)>

In [70]:
num_encoder_blocks = 6

# d_model is the embedding dimension used throughout.
d_model = 12

num_heads = 3

# Feed-forward network hidden dimension width.
ffn_hidden_dim = 48

src_vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_encoder_blocks,
    d_model,
    num_heads,
    ffn_hidden_dim,
    src_vocab_size,
    max_input_seq_len)

In [71]:
encoder_output, attn_weights = encoder(padded_input_seqs, training=True,
                                       mask=enc_mask)
print(f"Encoder output {encoder_output.shape}:")
print(encoder_output)

Encoder output (3, 10, 12):
tf.Tensor(
[[[ 1.579574   -0.30544603  0.88527215 -0.5167201   1.2202008
   -1.6248578  -0.20109388  1.0542231   0.2514424  -1.4914469
   -0.8854966   0.03434891]
  [ 1.1653876  -1.9663235   1.0152406  -0.3637954   0.970079
   -1.2362143  -0.27605236  1.5174892   0.12112702  0.01679114
   -0.85314906 -0.11057995]
  [ 0.88519496 -0.80046505  1.1427479   1.4972564   0.5691529
   -1.7658144  -0.80819696 -0.2315444   0.48929513 -1.4935248
   -0.10516031  0.62105864]
  [ 1.3145082   0.90779275  0.55838794 -0.51837224  1.5488446
   -1.5544333  -0.20626375  0.8324937  -0.91545236 -1.5380864
   -0.2909094  -0.13850951]
  [ 1.4221358   0.9467822   0.510895    0.42567846  0.79109794
   -2.222377   -0.50202125  0.3298453   0.37025556 -1.5192597
   -0.04519834 -0.5078336 ]
  [ 1.1638402   0.59028673  0.4857424   0.35755503  1.19897
   -2.171043    0.15823942  0.1809375   0.4292823  -1.8367918
   -0.26275283 -0.29426587]
  [ 1.4428108  -0.84888303  1.4092687  -0.45428964

## Decoder Block

Let's build the **Decoder Block**. Everything we did to create the


**encoder** block applies here. The major differences are that the **Decoder Block** has:
1. a **Multi-Head Cross-Attention** layer which uses the encoder's outputs as the keys and values.

2. an extra skip/residual connection along with an extra layer normalization step.

<div>
<img src="https://drive.google.com/uc?export=view&id=1WVT4SX49bnta4uscOTF4xrsxFI4PbPER" width="500"/>
</div

In [72]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()

  # Note the decoder block takes two masks. One for the first MHSA, another
  # for the second MHSA.
  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attn_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attn_weights = self.mhsa2(mhsa_output1, encoder_output,
                                            encoder_output,
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attn_weights


In [79]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    # Generate position indices.
    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training=training, decoder_mask=decoder_mask, memory_mask=memory_mask)

    return x, weights

In [80]:
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

In [81]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


In [82]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


In [83]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len,
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


In [86]:
dec_mask=tf.minimum(dec_padding_mask,look_ahead_mask)
print(dec_mask)

tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


In [87]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs,
                            training=True, decoder_mask=dec_mask, memory_mask=enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

Decoder output (3, 8, 12):
tf.Tensor(
[[[-1.6702657   0.50318956 -1.7592717  -0.17663857  1.2850494
    0.17610756  0.43790656  1.8036728   0.24107505 -0.16324279
    0.09073146 -0.7683135 ]
  [-1.1028441   0.5018534  -1.1249242  -1.3121111   1.1582274
    0.774656    0.7249354   1.1585803   1.326194   -0.86719567
   -0.1500223  -1.0873497 ]
  [-0.4065437  -1.2040002  -1.1366074  -1.1306007   1.665147
    1.1165656  -0.22346106  0.9750749   1.1427991   0.14694583
    0.23474857 -1.1800678 ]
  [-0.9682002  -0.5799076  -1.1677436  -0.8456731   1.4443595
    0.05230616  1.83266     0.7883547   1.0908618  -0.54366165
   -0.06788997 -1.0354663 ]
  [-0.38208938 -0.5260767  -1.321638   -0.8186425   1.5702235
    0.7827861   0.4987505   1.7258528   0.89270365 -0.87908864
   -0.5740098  -0.9687722 ]
  [-1.5140767  -0.96811736 -0.3958732  -1.3305569   1.6804911
    0.64796245 -0.15959553  0.849154    1.1508901   0.42508015
   -0.9850389   0.5996804 ]
  [-1.5847521  -0.8266078  -0.6012428  -0.437

In [88]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
                           max_input_len, dropout_rate)

    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)
    self.output_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self,input_seqs,target_input_seqs,training,encoder_mask,decoder_mask,memory_mask):
    encoder_output, encoder_attn_weights= self.encoder(input_seqs,training=training,mask=encoder_mask)

    decoder_output,decoder_attn_weights=self.decoder(encoder_output,target_input_seqs,training=training,decoder_mask=decoder_mask,memory_mask=memory_mask)

    return self.output_layer(decoder_output),encoder_attn_weights,decoder_attn_weights

In [91]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, # made-up target vocab size.
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs,
                                       padded_target_input_seqs, training=True,
                                       encoder_mask=enc_mask, decoder_mask=dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output) # If training, we would use this output to calculate losses.

Transformer output (3, 8, 7000):
tf.Tensor(
[[[-0.04387865  0.06492878  0.0392907  ... -0.06061146  0.03721704
   -0.04008161]
  [-0.03774514  0.05464525  0.04878882 ... -0.05557742  0.05498162
   -0.04994591]
  [-0.02515491  0.03835443  0.04307129 ... -0.02905256  0.07288609
   -0.0472059 ]
  ...
  [-0.06107319  0.06165582  0.00880569 ... -0.01473581  0.00318836
   -0.0157192 ]
  [-0.04512459  0.0409537   0.05400342 ... -0.06426592  0.07990092
   -0.07139765]
  [-0.05350854  0.04925311  0.02843308 ... -0.07697161  0.05938713
   -0.03261292]]

 [[-0.0608999   0.03678098  0.04952832 ... -0.09436908  0.05637408
   -0.05985317]
  [-0.01823165 -0.00099825  0.01956463 ... -0.02864105  0.06659756
   -0.05953186]
  [-0.02172638  0.01994331  0.0479968  ... -0.05612668  0.08196847
   -0.06372195]
  ...
  [-0.01016637  0.00528086  0.03422501 ... -0.03032508  0.0841794
   -0.02878843]
  [-0.00797508 -0.00905145 -0.00170928 ... -0.02427763  0.0903115
   -0.02935256]
  [-0.05007612  0.02695168  0.0

# Pre-Training and Transfer Learning with Hugging Face and OpenAI

In [1]:
!pip install transformers
!pip install --upgrade datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [2]:
import operator
import pandas as pd
import tensorflow as tf
import transformers

from datasets import load_dataset
from tensorflow import keras
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import TFAutoModelForQuestionAnswering

In [3]:
classifier = pipeline("text-classification")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [4]:
classifier("Alice was excited to go the island but it didn't live up to the hype.")

[{'label': 'NEGATIVE', 'score': 0.9993934631347656}]

In [5]:
classifier("Bob doesn't do well in group situations but he said it wasn't bad.")

[{'label': 'POSITIVE', 'score': 0.9946909546852112}]

In [6]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [7]:
text = """
Hans Niemann is launching a counterattack in his dispute with chess world
champion Magnus Carlsen, filing a federal lawsuit that accuses Carlsen of
maliciously colluding with others to defame the 19-year-old grandmaster and
ruin his career.

It's the latest move in a scandal that has injected unprecedented levels of
drama into the world of elite chess since early September, when Carlsen
suggested Niemann's upset victory over him at the Sinquefield Cup tournament
in St. Louis was the result of cheating.

Niemann wants a federal court in Missouri's eastern district to award him at
least $100 million in damages. Defendants in the lawsuit include Carlsen, his
company Play Magnus Group, the online platform Chess.com and its leader, Danny
Rensch, along with grandmaster Hikaru Nakamura.
"""

In [8]:
summarizer(text)

[{'summary_text': ' Chess grandmaster Hans Niemann files federal lawsuit against Magnus Carlsen . He accuses Carlsen of colluding with others to defame the 19-year-old grandmaster . Defendants in the lawsuit include Carlsen, Play Magnus Group, the online platform Chess.com and its leader, Danny Rensch .'}]

In [9]:
qa = pipeline("question-answering")

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [10]:
context="""
Hugging Face was founded in 2016 by Clément Delangue, Julien Chaumond, and
Thomas Wolf originally as a company that developed a chatbot app targeted at
teenagers.[2] After open-sourcing the model behind the chatbot, the company
pivoted to focus on being a platform for democratizing machine learning. In March
2021, Hugging Face raised $40 million in a Series B funding round.
"""

question = "Who are the Hugging Face founders?"

qa(question=question, context=context)

{'score': 0.9919217228889465,
 'start': 37,
 'end': 88,
 'answer': 'Clément Delangue, Julien Chaumond, and \nThomas Wolf'}

In [11]:
question = "What does Hugging Face do?"
qa(question=question, context=context)

{'score': 0.08730549365282059,
 'start': 118,
 'end': 164,
 'answer': 'developed a chatbot app targeted at \nteenagers'}

In [12]:
ner = pipeline(model="dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


In [13]:
text = "Panic ensues in Redmond as love child of Microsoft and OpenAI declares humanity obsolete."
ner(text)

[{'entity': 'B-PER',
  'score': 0.9993875,
  'index': 6,
  'word': 'Red',
  'start': 16,
  'end': 19},
 {'entity': 'I-PER',
  'score': 0.8049689,
  'index': 7,
  'word': '##mond',
  'start': 19,
  'end': 23},
 {'entity': 'B-ORG',
  'score': 0.9980654,
  'index': 12,
  'word': 'Microsoft',
  'start': 41,
  'end': 50},
 {'entity': 'B-ORG',
  'score': 0.9985505,
  'index': 14,
  'word': 'Open',
  'start': 55,
  'end': 59},
 {'entity': 'I-ORG',
  'score': 0.98842865,
  'index': 15,
  'word': '##A',
  'start': 59,
  'end': 60},
 {'entity': 'I-ORG',
  'score': 0.9739822,
  'index': 16,
  'word': '##I',
  'start': 60,
  'end': 61}]

In [14]:
data = load_dataset("squad")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [16]:
pd.DataFrame(data['train'][0, 1, 2, 100, 101, 102],
             columns=["context", "question", "answers"])

Unnamed: 0,context,question,answers
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,One of the main driving forces in the growth o...,In what year did the team lead by Knute Rockne...,"{'text': ['1925'], 'answer_start': [354]}"
4,One of the main driving forces in the growth o...,How many years was Knute Rockne head coach at ...,"{'text': ['13'], 'answer_start': [251]}"
5,One of the main driving forces in the growth o...,How many national titles were won when Knute R...,"{'text': ['three'], 'answer_start': [274]}"


In [17]:
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
t = "Where can I find a pizzeria?"
print(tokenizer.encode(t))

[0, 13841, 64, 38, 465, 10, 26432, 6971, 116, 2]


In [19]:
encoded_t = tokenizer(t)
print(encoded_t)

{'input_ids': [0, 13841, 64, 38, 465, 10, 26432, 6971, 116, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [20]:
print(tokenizer.convert_ids_to_tokens(encoded_t['input_ids']))

['<s>', 'Where', 'Ġcan', 'ĠI', 'Ġfind', 'Ġa', 'Ġpizz', 'eria', '?', '</s>']


In [21]:
encoded_pair = tokenizer("this is a question", "this is the context")
print(encoded_pair)

{'input_ids': [0, 9226, 16, 10, 864, 2, 2, 9226, 16, 5, 5377, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [22]:
print(tokenizer.convert_ids_to_tokens(encoded_pair['input_ids']))

['<s>', 'this', 'Ġis', 'Ġa', 'Ġquestion', '</s>', '</s>', 'this', 'Ġis', 'Ġthe', 'Ġcontext', '</s>']


In [23]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [24]:
context = "Sarah went to The Mirthless Cafe last night to meet her friend."
question = "Where did Sarah go?"

# The answer span and the answer's starting character position in the context.
answer = "The Mirthless Cafe"
answer_start = 14

In [25]:
x = tokenizer(question, context)
x

{'input_ids': [0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 1672, 16542, 94, 363, 7, 972, 69, 1441, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
tokenizer.batch_decode(x['input_ids'])

['<s>',
 'Where',
 ' did',
 ' Sarah',
 ' go',
 '?',
 '</s>',
 '</s>',
 'Sarah',
 ' went',
 ' to',
 ' The',
 ' M',
 'irth',
 'less',
 ' Cafe',
 ' last',
 ' night',
 ' to',
 ' meet',
 ' her',
 ' friend',
 '.',
 '</s>']

In [27]:
example_max_length = 15
x = tokenizer(question, context, max_length=example_max_length,
              truncation="only_second")
x

{'input_ids': [0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
tokenizer.batch_decode(x['input_ids'])

['<s>',
 'Where',
 ' did',
 ' Sarah',
 ' go',
 '?',
 '</s>',
 '</s>',
 'Sarah',
 ' went',
 ' to',
 ' The',
 ' M',
 'irth',
 '</s>']

In [29]:
x = tokenizer(question, context, max_length=example_max_length,
              truncation="only_second", return_overflowing_tokens=True,
              padding="max_length")
x

{'input_ids': [[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 1672, 16542, 94, 363, 7, 972, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 69, 1441, 4, 2, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]], 'overflow_to_sample_mapping': [0, 0, 0]}

In [30]:
len(x['input_ids'])

3

In [31]:
tokenizer.batch_decode(x['input_ids'])

['<s>Where did Sarah go?</s></s>Sarah went to The Mirth</s>',
 '<s>Where did Sarah go?</s></s>less Cafe last night to meet</s>',
 '<s>Where did Sarah go?</s></s> her friend.</s><pad><pad><pad>']

In [32]:
tokenizer(['question 1', 'question 2'],
          ['context 1', 'context 2'],
          return_overflowing_tokens=True)

{'input_ids': [[0, 40018, 112, 2, 2, 46796, 112, 2], [0, 40018, 132, 2, 2, 46796, 132, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 1]}

In [33]:
stride = 5
x = tokenizer(question, context, max_length=example_max_length,
              truncation="only_second", return_overflowing_tokens=True,
              stride=stride, padding="max_length")

In [34]:
tokenizer.batch_decode(x['input_ids'])

['<s>Where did Sarah go?</s></s>Sarah went to The Mirth</s>',
 '<s>Where did Sarah go?</s></s> went to The Mirthless</s>',
 '<s>Where did Sarah go?</s></s> to The Mirthless Cafe</s>',
 '<s>Where did Sarah go?</s></s> The Mirthless Cafe last</s>',
 '<s>Where did Sarah go?</s></s> Mirthless Cafe last night</s>',
 '<s>Where did Sarah go?</s></s>irthless Cafe last night to</s>',
 '<s>Where did Sarah go?</s></s>less Cafe last night to meet</s>',
 '<s>Where did Sarah go?</s></s> Cafe last night to meet her</s>',
 '<s>Where did Sarah go?</s></s> last night to meet her friend</s>',
 '<s>Where did Sarah go?</s></s> night to meet her friend.</s>']

In [35]:
print(x.keys(), '\n')
x

KeysView({'input_ids': [[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 439, 7, 20, 256, 24208, 1672, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 7, 20, 256, 24208, 1672, 16542, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 20, 256, 24208, 1672, 16542, 94, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 256, 24208, 1672, 16542, 94, 363, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 24208, 1672, 16542, 94, 363, 7, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 1672, 16542, 94, 363, 7, 972, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 16542, 94, 363, 7, 972, 69, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 94, 363, 7, 972, 69, 1441, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 363, 7, 972, 69, 1441, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

{'input_ids': [[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 439, 7, 20, 256, 24208, 1672, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 7, 20, 256, 24208, 1672, 16542, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 20, 256, 24208, 1672, 16542, 94, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 256, 24208, 1672, 16542, 94, 363, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 24208, 1672, 16542, 94, 363, 7, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 1672, 16542, 94, 363, 7, 972, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 16542, 94, 363, 7, 972, 69, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 94, 363, 7, 972, 69, 1441, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 363, 7, 972, 69, 1441, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 

In [36]:
print(answer_start)
print(context[answer_start:answer_start+len(answer)])

14
The Mirthless Cafe


In [37]:
x = tokenizer(question, context, max_length=example_max_length,
              truncation="only_second", return_overflowing_tokens=True,
              stride=stride, return_offsets_mapping=True,
              padding="max_length")
x

{'input_ids': [[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 439, 7, 20, 256, 24208, 1672, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 7, 20, 256, 24208, 1672, 16542, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 20, 256, 24208, 1672, 16542, 94, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 256, 24208, 1672, 16542, 94, 363, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 24208, 1672, 16542, 94, 363, 7, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 1672, 16542, 94, 363, 7, 972, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 16542, 94, 363, 7, 972, 69, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 94, 363, 7, 972, 69, 1441, 2], [0, 13841, 222, 4143, 213, 116, 2, 2, 363, 7, 972, 69, 1441, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 

In [38]:
print(len(x['input_ids']))
print(len(x['offset_mapping']))

10
10


In [39]:
print(x['input_ids'][0])
print(x['offset_mapping'][0])

[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2]
[(0, 0), (0, 5), (6, 9), (10, 15), (16, 18), (18, 19), (0, 0), (0, 0), (0, 5), (6, 10), (11, 13), (14, 17), (18, 19), (19, 23), (0, 0)]


In [40]:
print("First non-special input_id converted to token:")
print(tokenizer.convert_ids_to_tokens(x['input_ids'][0][1]), "\n")

offset = x['offset_mapping'][0][1]
print(f"Span extracted from context using corresponding offset_mapping {offset}:")
print(question[offset[0]:offset[1]])

First non-special input_id converted to token:
Where 

Span extracted from context using corresponding offset_mapping (0, 5):
Where


In [41]:
print(x['offset_mapping'][0])
print(x['offset_mapping'][1])

[(0, 0), (0, 5), (6, 9), (10, 15), (16, 18), (18, 19), (0, 0), (0, 0), (0, 5), (6, 10), (11, 13), (14, 17), (18, 19), (19, 23), (0, 0)]
[(0, 0), (0, 5), (6, 9), (10, 15), (16, 18), (18, 19), (0, 0), (0, 0), (6, 10), (11, 13), (14, 17), (18, 19), (19, 23), (23, 27), (0, 0)]


In [42]:
print(x['input_ids'][0])
print(x.sequence_ids(0))

[0, 13841, 222, 4143, 213, 116, 2, 2, 33671, 439, 7, 20, 256, 24208, 2]
[None, 0, 0, 0, 0, 0, None, None, 1, 1, 1, 1, 1, 1, None]


In [43]:
answer_end = answer_start + len(answer)

print("Answer start character position:", answer_start)
print("Answer end character position:", answer_end)
print("Answer pulled from context:", context[answer_start:answer_end])

Answer start character position: 14
Answer end character position: 32
Answer pulled from context: The Mirthless Cafe


In [44]:
tokenizer.batch_decode(x['input_ids'])

['<s>Where did Sarah go?</s></s>Sarah went to The Mirth</s>',
 '<s>Where did Sarah go?</s></s> went to The Mirthless</s>',
 '<s>Where did Sarah go?</s></s> to The Mirthless Cafe</s>',
 '<s>Where did Sarah go?</s></s> The Mirthless Cafe last</s>',
 '<s>Where did Sarah go?</s></s> Mirthless Cafe last night</s>',
 '<s>Where did Sarah go?</s></s>irthless Cafe last night to</s>',
 '<s>Where did Sarah go?</s></s>less Cafe last night to meet</s>',
 '<s>Where did Sarah go?</s></s> Cafe last night to meet her</s>',
 '<s>Where did Sarah go?</s></s> last night to meet her friend</s>',
 '<s>Where did Sarah go?</s></s> night to meet her friend.</s>']

In [45]:
input_ids = x['input_ids'][0]
offset_mapping = x['offset_mapping'][0]
seq_ids = x.sequence_ids(0)

In [46]:
# These are the sequence ids
print("Sequence IDs: ", seq_ids)

Sequence IDs:  [None, 0, 0, 0, 0, 0, None, None, 1, 1, 1, 1, 1, 1, None]


In [47]:
# Get the start index position (i.e. the first occurrence of 1).
context_pos_start = seq_ids.index(1)

In [48]:
def rindex(lst, value):
    return len(lst) - operator.indexOf(reversed(lst), value) - 1

# Get the end index position (i.e. the last occurrence of 1).
context_pos_end = rindex(seq_ids, 1)

In [49]:
print("Context tokens begin at position", context_pos_start)
print("Context tokens end at position", context_pos_end)

Context tokens begin at position 8
Context tokens end at position 13


In [50]:
context_offsets = offset_mapping[context_pos_start:context_pos_end+1]
print(context_offsets)

[(0, 5), (6, 10), (11, 13), (14, 17), (18, 19), (19, 23)]


In [51]:
print("Is the lowest offset value lower than or equal to the starting character position?")
print("Answer starting character position:", answer_start)
print("First offset:", context_offsets[0])

# Note how we're checking the first tuple value.
print(context_offsets[0][0] <= answer_start)

Is the lowest offset value lower than or equal to the starting character position?
Answer starting character position: 14
First offset: (0, 5)
True


In [52]:
print("Is the highest offset value higher than or equal to the ending character position?")
print("Answer ending character position:", answer_end)
print("Last offset:", context_offsets[-1])

# Note how how we're checking the second tuple value.
print(context_offsets[-1][1] >= answer_end)

Is the highest offset value higher than or equal to the ending character position?
Answer ending character position: 32
Last offset: (19, 23)
False


In [53]:
print(tokenizer.batch_decode(input_ids))

['<s>', 'Where', ' did', ' Sarah', ' go', '?', '</s>', '</s>', 'Sarah', ' went', ' to', ' The', ' M', 'irth', '</s>']


In [54]:
input_ids = x['input_ids'][2]
offset_mapping = x['offset_mapping'][2]
seq_ids = x.sequence_ids(2)

context_pos_start = seq_ids.index(1)
context_pos_end = rindex(seq_ids, 1)

context_offsets = offset_mapping[context_pos_start:context_pos_end+1]

print("Is the lowest offset value lower than or equal to the starting character position?")
print("Answer starting character position:", answer_start)
print("First offset:", context_offsets[0])

# Note how we're checking the first tuple value.
print(context_offsets[0][0] <= answer_start)

print("Is the highest offset value higher than or equal to the ending character position?")
print("Answer ending character position:", answer_end)
print("Last offset:", context_offsets[-1])

# Note how how we're checking the second tuple value.
print(context_offsets[-1][1] >= answer_end)


Is the lowest offset value lower than or equal to the starting character position?
Answer starting character position: 14
First offset: (11, 13)
True
Is the highest offset value higher than or equal to the ending character position?
Answer ending character position: 32
Last offset: (28, 32)
True


In [55]:
s = e = 0

# Start scanning the offset_mapping from the
# left to find the token position where the answer starts.
# It's not guaranteed a tokenizer will output a token where the
# starting character matches the first answer character. When
# this happens, we take the previous token's position as our start.
i = context_pos_start
while offset_mapping[i][0] < answer_start:
  i += 1
if offset_mapping[i][0] == answer_start:
  s = i
else:
  s = i - 1

# Same idea when finding the ending token position.
j = context_pos_end
while offset_mapping[j][1] > answer_end:
  j -= 1
if offset_mapping[j][1] == answer_end:
  e = j
else:
  e = j + 1

In [56]:
print("Answer start token position in context:", s)
print("Answer end token position in context:", e)


Answer start token position in context: 9
Answer end token position in context: 13


In [57]:
print("Answer lifted from context:")
tokenizer.batch_decode(input_ids[s:e+1])

Answer lifted from context:


[' The', ' M', 'irth', 'less', ' Cafe']

In [58]:
def prepare_dataset(examples):
  # Some tokenizers don't strip spaces. If there happens to be question text
  # with excessive spaces, the context may not get encoded at all.
  examples["question"] = [q.lstrip() for q in examples["question"]]
  examples["context"] = [c.lstrip() for c in examples["context"]]

  # Tokenize.
  tokenized_examples = tokenizer(
      examples['question'],
      examples['context'],
      truncation="only_second",
      max_length = max_length,
      stride=stride,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length"
  )

  # We'll collect a list of starting positions and ending positions.
  tokenized_examples['start_positions'] = []
  tokenized_examples['end_positions'] = []

  # Work through every sequence.
  for seq_idx in range(len(tokenized_examples['input_ids'])):
    seq_ids = tokenized_examples.sequence_ids(seq_idx)
    offset_mappings = tokenized_examples['offset_mapping'][seq_idx]

    cur_example_idx = tokenized_examples['overflow_to_sample_mapping'][seq_idx]
    answer = examples['answers'][cur_example_idx]
    answer_text = answer['text'][0]
    answer_start = answer['answer_start'][0]
    answer_end = answer_start + len(answer_text)

    context_pos_start = seq_ids.index(1)
    context_pos_end = rindex(seq_ids, 1)

    s = e = 0
    if (offset_mappings[context_pos_start][0] <= answer_start and
        offset_mappings[context_pos_end][1] >= answer_end):
      i = context_pos_start
      while offset_mappings[i][0] < answer_start:
        i += 1
      if offset_mappings[i][0] == answer_start:
        s = i
      else:
        s = i - 1

      j = context_pos_end
      while offset_mappings[j][1] > answer_end:
        j -= 1
      if offset_mappings[j][1] == answer_end:
        e = j
      else:
        e = j + 1

    tokenized_examples['start_positions'].append(s)
    tokenized_examples['end_positions'].append(e)

  return tokenized_examples

In [59]:
max_length = 400
stride = 100
batch_size = 32

In [60]:
tokenized_datasets = data.map(
  prepare_dataset,
  batched=True,
  remove_columns=data["train"].column_names,
  num_proc=2,
)

Map (num_proc=2):   0%|          | 0/87599 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/10570 [00:00<?, ? examples/s]

In [62]:
data = tokenized_datasets.remove_columns(["offset_mapping",
                                          "overflow_to_sample_mapping"])

In [63]:
train_set = data['train'].to_tf_dataset(batch_size=batch_size)
validation_set = data['validation'].to_tf_dataset(batch_size=batch_size)

In [64]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFRobertaForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFRobertaForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
def get_answer(tokenizer, model, question, context):
  inputs = tokenizer([question], [context], return_tensors="np")
  outputs = model(inputs)
  start_position = tf.argmax(outputs.start_logits, axis=1)
  end_position = tf.argmax(outputs.end_logits, axis=1)
  answer = inputs["input_ids"][0, int(start_position) : int(end_position) + 1]
  return tokenizer.decode(answer).strip()

In [66]:
c = "Sarah went to The Mirthless Cafe last night to meet her friend."
q = "Where did Sarah go?"
get_answer(tokenizer, model, q, c)

''

In [67]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=3e-5))

In [68]:
model.fit(train_set, validation_data=validation_set, epochs=1)



<tf_keras.src.callbacks.History at 0x78268627e2d0>

In [69]:
c = "Sarah went to The Mirthless Cafe last night to meet her friend."
q = "Where did Sarah go?"
get_answer(tokenizer, model, q, c)

'The Mirthless Cafe'

In [70]:
q = "Who did Sarah meet?"
get_answer(tokenizer, model, q, c)

'her friend'

In [71]:
q = "When did Sarah meet her friend?"
get_answer(tokenizer, model, q, c)

'last night'

In [72]:
q = "Who went to the restaurant?"
get_answer(tokenizer, model, q, c)

'Sarah'

In [73]:
# Asking a logic teaser question is difficult despite the
# answer being available. To be fair, there is ambiguity here.
q = "Who did Sarah's friend meet?"
get_answer(tokenizer, model, q, c)

'her friend'

In [74]:
# The model can't determine when a question can't be
# answered. Some question answering datasets explicitly
# train for this.
q = "How did Sarah get to the restaurant?"
get_answer(tokenizer, model, q, c)

'to meet her friend'

In [75]:
# The model isn't generative, either.
q = "What is a possible reason for why Sarah met her friend?"
get_answer(tokenizer, model, q, c)

'<s>'