In [1]:
!pip install -U pip transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(f"{len(tokenizer.vocab)}\n")

tokenizer.vocab

256204



{'▁soldado': 165595,
 'тельно': 28406,
 'ENA': 124533,
 '▁ಮತ್ತೆ': 82305,
 '▁politici': 128553,
 '▁zendo': 225242,
 '▁ኣረኣ': 214815,
 'ച്ച്': 17638,
 'कर्ष': 123358,
 '▁Night': 51704,
 '▁خمسة': 206483,
 '▁אג': 142329,
 '▁Dawn': 52290,
 'די': 4061,
 'することができます': 43064,
 'ïñ': 158766,
 'дзіў': 242023,
 'acció': 211322,
 '▁അഞ്ച': 187222,
 'okazzjoni': 183551,
 '▁kuko': 14002,
 'deli': 32295,
 '▁አይነት': 109336,
 '▁մահ': 68820,
 'simula': 111535,
 'ວກ': 6998,
 '▁ደስ': 99576,
 '▁эд': 144875,
 'స్ప': 145835,
 '▁conse': 10899,
 '▁olukord': 211131,
 '▁informasjonen': 172910,
 '▁damai': 219786,
 '▁message': 54764,
 '▁okulia': 132715,
 'يتون': 136467,
 '▁দলে': 203324,
 '៕': 252079,
 'プログラ': 136976,
 '▁അവിടെ': 52544,
 'ակալ': 73943,
 'كىلى': 217901,
 '▁موتور': 111674,
 'agii': 148229,
 '▁jne': 209977,
 '▁알렉': 195925,
 '▁తెలిపారు': 233916,
 '▁especies': 88770,
 '▁ਪਿਤਾ': 122245,
 '▁inovatif': 200211,
 '▁सीख': 65375,
 'ドライ': 70262,
 '▁дії': 120644,
 '▁agenci': 158445,
 '道を': 82212,
 '▁mpimo': 123183,
 'h

In [5]:
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

thai_tokens = [
    token for token in tokenizer.vocab.keys()
    if any(thai_char_min <= ord(char) <= thai_char_max for char in token)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]


print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
  print(token)

1712

▁ดี
วน
นึก
▁ทํา
ใหญ
บน
เมือง
เรื่อง
▁ด
▁ออก
เด็ก
ใหญ่
ส่ง
บ
โช
▁ด้วย
ถ
ม่
เท้า
อัลลอฮ์


In [6]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math

In [7]:
sentence = 'Work hard, play harder'

In [8]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [9]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [10]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [11]:
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [12]:
sentence_int = tf.constant(
    [dc[s] for s in sentence.replace(',', '').split()],
    dtype=tf.int32
)

In [13]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [14]:
# สร้าง embedding layer
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [15]:
embedded_sentence = embed(sentence_int)

In [16]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.01007104,  0.0394769 ],
       [ 0.04794415, -0.03628337],
       [-0.03208207, -0.00645279],
       [-0.04982368, -0.04841751]], dtype=float32)>

In [17]:
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1 Default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [18]:
# Case 2 GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape

(100000,)

In [19]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=weights_default, nbinsx=50, name="Default Uniform [-0.05, 0.05]", opacity=0.6))
fig.add_trace(go.Histogram(x=weights_glorot, nbinsx=50, name="Glorot Uniform", opacity=0.6))

fig.update_layout(
    title_text='Embedding Layer Initialization Comparison',
    xaxis_title_text='Weight values',
    yaxis_title_text='Frequency',
    barmode='overlay',
    legend_orientation="h",
    legend_yanchor="bottom",
    legend_y=1.02,
    legend_xanchor="right",
    legend_x=1
)

fig.show()

print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.04999564 0.049998973
Glorot initializer range  -0.010953984 0.010953848


In [20]:
def glorot_uniform_limits(fan_in, fan_out):
    limit = math.sqrt(6.0 / (fan_in + fan_out))
    a, b = -limit, limit
    return a, b

# ตัวอย่าง Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [22]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape

torch.Size([256206, 1024])

In [23]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."

In [24]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [25]:
len(tokens['input_ids'][0])

75

In [26]:
token_embedding_layer(tokens['input_ids'][0][0]).shape

torch.Size([1024])

In [27]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [28]:
import plotly.express as px

token_embeddings_np = token_embeddings.detach().numpy()

fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [29]:
d = embedded_sentence.shape[-1]
d

2

In [30]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [31]:
tf.random.set_seed(123)
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [32]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [33]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [34]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [35]:
W_value

<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[0.48962688, 0.5857923 , 0.36451697, 0.6550509 ],
       [0.9075084 , 0.37557673, 0.6882372 , 0.25384045]], dtype=float32)>

In [36]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.01007104,  0.0394769 ],
       [ 0.04794415, -0.03628337],
       [-0.03208207, -0.00645279],
       [-0.04982368, -0.04841751]], dtype=float32)>

In [37]:
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [38]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.01308645,  0.02732983],
       [-0.00481179,  0.00764269],
       [-0.00597866, -0.02189945],
       [-0.02077742, -0.05498143]], dtype=float32)>

In [39]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.03943118,  0.02828882],
       [ 0.01464921, -0.01893328],
       [-0.03352374, -0.00838435],
       [-0.08169961, -0.0393252 ]], dtype=float32)>

In [40]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[ 0.04075667,  0.02072614,  0.03084053,  0.01661788],
       [-0.00945272,  0.01445813, -0.00749511,  0.02219567],
       [-0.0215642 , -0.02121695, -0.01613551, -0.02265337],
       [-0.06833431, -0.04737082, -0.05148431, -0.04492737]],
      dtype=float32)>

In [41]:
omega = tf.matmul(queries, keys, transpose_b=True)

print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[ 1.2891430e-03 -3.2573717e-04 -6.6784985e-04 -2.1439092e-03]
 [ 2.6468100e-05 -2.1519014e-04  9.7230259e-05  9.2571107e-05]
 [-8.5525523e-04  3.2704585e-04  3.8403971e-04  1.3496545e-03]
 [-2.3746374e-03  7.3660596e-04  1.1575203e-03  3.8596625e-03]], shape=(4, 4), dtype=float32)


In [42]:
d_k = tf.cast(d_k, tf.float32)

scaled_omega = omega / tf.sqrt(d_k)

attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.25030968 0.250024   0.24996354 0.24970277]
 [0.25000465 0.24996191 0.25001714 0.25001633]
 [0.24979559 0.25000453 0.25001457 0.25018534]
 [0.24943122 0.24998057 0.250055   0.25053322]], shape=(4, 4), dtype=float32)


In [43]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 1., 1.], dtype=float32)>

In [44]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[-0.01461515 -0.00832926 -0.01104334 -0.00717194]
 [-0.01464958 -0.00835247 -0.01106929 -0.00719369]
 [-0.01467    -0.00836413 -0.01108471 -0.00720375]
 [-0.01470926 -0.00838937 -0.01111433 -0.00722688]], shape=(4, 4), dtype=float32)


In [45]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        self.W_query = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_key = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_value = tf.Variable(
            tf.random.uniform((d_in, d_out_v)), trainable=True
        )

    def call(self, x):
        keys = tf.matmul(x, self.W_key)      # [T, d_out_kq]
        queries = tf.matmul(x, self.W_query) # [T, d_out_kq]
        values = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Attention scores: QKᵀ
        attn_scores = tf.matmul(queries, keys, transpose_b=True)  # [T, T]

        # Softmax (scaled by sqrt(d_k))
        attn_weights = tf.nn.softmax(
            attn_scores / tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32)), axis=-1
        )  # [T, T]

        # Weighted sum
        context_vec = tf.matmul(attn_weights, values)  # [T, d_out_v]
        return context_vec

In [46]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[-0.01461515 -0.00832926 -0.01104334 -0.00717194]
 [-0.01464958 -0.00835247 -0.01106929 -0.00719369]
 [-0.01467    -0.00836413 -0.01108471 -0.00720375]
 [-0.01470926 -0.00838937 -0.01111433 -0.00722688]]


In [47]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        self.heads = [
            SelfAttention(d_in, d_out_kq, d_out_v)
            for _ in range(num_heads)
        ]

    def call(self, x):
        # รันทุก head แล้ว concat ตามแกนสุดท้าย
        head_outputs = [head(x) for head in self.heads]   # list of [T, d_out_v]
        return tf.concat(head_outputs, axis=-1)           # [T, num_heads * d_out_v]

In [48]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# ถ้า embedded_sentence.shape = [T, d_in] เช่น [6, 3]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (6, 1)
print(out.numpy())

(4, 1)
[[-0.01046719]
 [-0.01049348]
 [-0.01050812]
 [-0.01053753]]


In [49]:
tf.random.set_seed(123)

block_size = embedded_sentence.shape[0]   # [T, d_in] → T = sequence length

mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[-0.01046719 -0.01332432 -0.01625359]
 [-0.01049348 -0.01334015 -0.01622624]
 [-0.01050812 -0.01337465 -0.01631479]
 [-0.01053753 -0.01340766 -0.01634591]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
