In [19]:
!pip install -U pip transformers



In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [21]:
# Checkpoint identifier passed to from_pretrained, both for the tokenizer here
# and for the model loaded further down in the notebook.
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [22]:
# Print the number of entries in the tokenizer vocabulary.
print(f"{len(tokenizer.vocab)}\n")

# Display the token -> id mapping.
# NOTE(review): the mapping has hundreds of thousands of entries, so showing it
# whole produces a very large cell output; consider displaying only a small slice.
tokenizer.vocab

256204



{'▁lainim': 198481,
 '▁goga': 225520,
 'itant': 159349,
 'ళి': 208247,
 '▁участие': 152163,
 'Hebere': 130236,
 'रख': 180758,
 '▁ull': 142279,
 '▁diadakan': 219701,
 '涅': 255422,
 'tq': 162746,
 '▁fundamentali': 214622,
 '▁Meie': 96781,
 'egley': 116768,
 '▁níbẹ': 221113,
 '▁lelo': 22945,
 'aboa': 227976,
 '्वल': 213361,
 '▁بە': 2098,
 '얀': 253797,
 '▁கேள்வி': 70987,
 '▁ivaka': 127636,
 '▁амби': 226938,
 'apren': 212280,
 'liegen': 160453,
 '▁Luk': 24577,
 '▁měla': 78303,
 '▁Ciamar': 152348,
 'ថ': 249535,
 'fsi': 69405,
 '▁efallai': 100291,
 'ার্ড': 106988,
 '▁कच': 201075,
 '▁banned': 241573,
 '▁aköld': 246190,
 'ଇଁ': 215740,
 '▁ಆರು': 231575,
 '▁Engari': 101871,
 'شانی': 201207,
 '▁back': 11535,
 '▁ههههههههههههههه': 146685,
 'allam': 220343,
 '覚悟': 198747,
 '▁identi': 9256,
 '火狱': 119132,
 '▁eny': 22890,
 '▁קודם': 125463,
 'ରନ୍ତ': 184762,
 '▁еңеү': 175904,
 'સ્થિતિ': 128790,
 '▁エン': 198304,
 'ଡ଼ା': 96388,
 'inse': 95763,
 'ንኛውም': 150679,
 '分裂': 160990,
 'ີ່': 83428,
 '▁Tene': 165641,
 

In [23]:
# Inclusive code-point bounds used to recognise Thai characters.
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

# Keep every vocabulary entry that contains at least one character whose
# code point lies inside the bounds above.
thai_tokens = [
    piece
    for piece in tokenizer.vocab
    if any(thai_char_min <= ord(symbol) <= thai_char_max for symbol in piece)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]

# Total count first, then the first `sample_size` matches, one per line.
print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
    print(token)


1712

แบบ
โทร
ตัว
ของข้า
ียบ
คิด
็ม
▁มา
ร้อง
เคร
ึ่ง
▁มี
หญิง
โรง
รูป
▁ปล
ู้
ด้
รอ
ไอ


In [24]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math

In [25]:
sentence = 'Work hard, play harder'

In [26]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [27]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [28]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [29]:
# Map each word to its position in the sorted word list; that position is used
# as the word's integer id when the sentence is encoded.
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [30]:
# Encode the sentence as a tensor of integer ids. The already cleaned and split
# `words` list is reused so the cleaning logic lives in exactly one place.
sentence_int = tf.constant(
    [dc[word] for word in words],
    dtype=tf.int32
)

In [31]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [32]:
# Create the embedding layer.
# The global TensorFlow seed is fixed first so the randomly initialised
# embedding weights are reproducible.
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [33]:
embedded_sentence = embed(sentence_int)

In [18]:
# NOTE(review): this cell's execution counter (18) is out of sequence with its
# neighbours; `dummy_input` defined here is needed by the Glorot cell that
# follows, so confirm the notebook still runs top to bottom on a fresh kernel.
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

# Small batch of ids, used only to trigger weight creation in the layers.
dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1: default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # call the layer once so that its weights get created
# Flatten the embedding matrix into a 1-D array of vocab_size * embedding_dim values.
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [34]:
# Case 2: GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # call the layer once so that its weights get created
# Flatten the embedding matrix into a 1-D array for the histogram comparison.
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape


(100000,)

In [35]:
# Overlay both weight distributions in a single histogram figure.
fig = make_subplots(rows=1, cols=1)

fig.add_traces([
    go.Histogram(
        x=weights_default,
        nbinsx=50,
        name="Default Uniform [-0.05, 0.05]",
        opacity=0.6,
    ),
    go.Histogram(
        x=weights_glorot,
        nbinsx=50,
        name="Glorot Uniform",
        opacity=0.6,
    ),
])

fig.update_layout(
    title=dict(text='Embedding Layer Initialization Comparison'),
    xaxis=dict(title=dict(text='Weight values')),
    yaxis=dict(title=dict(text='Frequency')),
    barmode='overlay',
    # Horizontal legend anchored just above the top-right corner of the plot.
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.show()

# Smallest and largest sampled weight for each initializer.
print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.049998414 0.049998943
Glorot initializer range  -0.01095354 0.010953913


In [36]:
def glorot_uniform_limits(fan_in, fan_out):
    """Return the ``(low, high)`` bounds of a Glorot (Xavier) uniform distribution.

    The bound is ``sqrt(6 / (fan_in + fan_out))`` and the interval is symmetric
    around zero.

    Args:
        fan_in: Number of input units.
        fan_out: Number of output units.

    Returns:
        A tuple ``(-bound, bound)``.
    """
    bound = math.sqrt(6.0 / (fan_in + fan_out))
    return -bound, bound

# Example: Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

# These analytic bounds can be compared with the empirical min/max of the
# Glorot-initialised weights printed earlier.
a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [37]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [38]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape


torch.Size([256206, 1024])

In [39]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."

In [40]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [41]:
len(tokens['input_ids'][0])

75

In [42]:
token_embedding_layer(tokens['input_ids'][0][0]).shape

torch.Size([1024])

In [44]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [45]:
# NOTE(review): plotly.express is already imported in an earlier cell, so this
# repeated import is redundant.
import plotly.express as px

# Detach from the autograd graph before converting the tensor to a NumPy array.
token_embeddings_np = token_embeddings.detach().numpy()

# Heatmap with one row per token and one column per embedding dimension.
fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

# Put the x-axis on top of the plot and fix the figure size.
fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [46]:
d = embedded_sentence.shape[-1]
d

2

In [47]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [48]:
# Re-seed so the three projection matrices below are reproducible.
tf.random.set_seed(123)
# Randomly initialised projection matrices of shapes (d, d_q), (d, d_k), (d, d_v).
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [49]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [50]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [51]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [52]:
W_value

<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[0.48962688, 0.5857923 , 0.36451697, 0.6550509 ],
       [0.9075084 , 0.37557673, 0.6882372 , 0.25384045]], dtype=float32)>

In [53]:
# Project the embedded sentence into query, key and value space.
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [54]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.01735967,  0.0390747 ],
       [-0.01139714, -0.02293365],
       [-0.00911339, -0.00157975],
       [ 0.01549482,  0.03053149]], dtype=float32)>

In [55]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.05695408,  0.03616595],
       [-0.03291065, -0.02505573],
       [ 0.00129425, -0.02811676],
       [ 0.0436762 ,  0.03437649]], dtype=float32)>

In [56]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[ 0.05494744,  0.03101414,  0.04152411,  0.02655716],
       [-0.03522389, -0.01696712, -0.02667069, -0.01308361],
       [-0.02292443,  0.00734711, -0.01768452,  0.01635096],
       [ 0.0476856 ,  0.02225917,  0.03611901,  0.01675144]],
      dtype=float32)>

In [57]:
# Dot product of every query with every key (queries @ keys^T); these are the
# raw scores before scaling and softmax.
omega = tf.matmul(queries, keys, transpose_b=True)

print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[ 2.4018779e-03 -1.5503634e-03 -1.0761864e-03  2.1014558e-03]
 [-1.4785307e-03  9.4970665e-04  6.3006929e-04 -1.2861621e-03]
 [-5.7617802e-04  3.3950945e-04  3.2622389e-05 -4.5234466e-04]
 [ 1.9866934e-03 -1.2749333e-03 -8.3839247e-04  1.7263202e-03]], shape=(4, 4), dtype=float32)


In [58]:
# Scale the raw scores by sqrt(d_k) before the softmax. The cast is applied
# inline so that `d_k` keeps its integer value: rebinding it to a float tensor
# would break re-running the earlier cell that builds W_key with shape (d, d_k).
scaled_omega = omega / tf.sqrt(tf.cast(d_k, tf.float32))

# Softmax over the last axis, so each row of weights sums to 1.
attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.25034168 0.24964303 0.24972676 0.25028852]
 [0.24979103 0.2502203  0.25016373 0.249825  ]
 [0.24992716 0.25008905 0.25003478 0.24994904]
 [0.25028053 0.24970396 0.24978106 0.25023445]], shape=(4, 4), dtype=float32)


In [59]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([1., 1., 1., 1.], dtype=float32)>

In [60]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[0.01117255 0.01093439 0.00836094 0.0116581 ]
 [0.01108984 0.01090041 0.00829821 0.0116353 ]
 [0.01111081 0.01090867 0.00831412 0.0116406 ]
 [0.01116322 0.01093066 0.00835386 0.01165566]], shape=(4, 4), dtype=float32)


In [61]:
class SelfAttention(tf.keras.layers.Layer):
    """Single-head scaled dot-product self-attention.

    Args:
        d_in: Size of the last axis of the input.
        d_out_kq: Width of the query and key projections.
        d_out_v: Width of the value projection, and therefore of the output.
    """

    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        # The matrices are created in the order query, key, value; that order
        # decides which random draw each one receives, so it is kept fixed.
        qk_shape = (d_in, d_out_kq)
        self.W_query = tf.Variable(tf.random.uniform(qk_shape), trainable=True)
        self.W_key = tf.Variable(tf.random.uniform(qk_shape), trainable=True)
        self.W_value = tf.Variable(tf.random.uniform((d_in, d_out_v)), trainable=True)

    def call(self, x):
        """Return the attention-weighted combination of the value vectors.

        Args:
            x: Input whose last axis has size ``d_in`` (used as ``[T, d_in]``
                in this notebook).

        Returns:
            Context vectors whose last axis has size ``d_out_v``.
        """
        q = tf.matmul(x, self.W_query)  # [T, d_out_kq]
        k = tf.matmul(x, self.W_key)    # [T, d_out_kq]
        v = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Raw scores QK^T, divided by sqrt(d_k) before the softmax.
        scale = tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32))
        scores = tf.matmul(q, k, transpose_b=True) / scale  # [T, T]
        weights = tf.nn.softmax(scores, axis=-1)            # [T, T]

        # Weighted sum of the value vectors.
        return tf.matmul(weights, v)  # [T, d_out_v]

In [62]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[0.01117255 0.01093439 0.00836094 0.0116581 ]
 [0.01108984 0.01090041 0.00829821 0.0116353 ]
 [0.01111081 0.01090867 0.00831412 0.0116406 ]
 [0.01116322 0.01093066 0.00835386 0.01165566]]


In [63]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    """Runs several independent SelfAttention heads and concatenates their outputs.

    Args:
        d_in: Size of the last axis of the input.
        d_out_kq: Width of each head's query and key projections.
        d_out_v: Width of each head's value projection.
        num_heads: Number of SelfAttention heads to create.
    """

    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        # Build the heads one after another, then store the finished list.
        heads = []
        for _ in range(num_heads):
            heads.append(SelfAttention(d_in, d_out_kq, d_out_v))
        self.heads = heads

    def call(self, x):
        """Apply every head to ``x`` and join the results along the last axis."""
        # Run every head, then concatenate along the last axis.
        per_head = []                          # list of [T, d_out_v]
        for head in self.heads:
            per_head.append(head(x))
        return tf.concat(per_head, axis=-1)    # [T, num_heads * d_out_v]

In [64]:
# Re-seed so this head's random weights are reproducible.
tf.random.set_seed(123)

# Same input/query/key sizes as before, but each head now emits a single value.
d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# embedded_sentence has shape [T, d_in]; here that is [4, 2]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (4, 1) here
print(out.numpy())

(4, 1)
[[0.01007059]
 [0.01001732]
 [0.01003067]
 [0.01006463]]


In [65]:
# Re-seed so the heads' random weights are reproducible.
tf.random.set_seed(123)

# NOTE(review): block_size is not referenced again in the visible cells —
# confirm whether it is still needed.
block_size = embedded_sentence.shape[0]   # [T, d_in] → T = sequence length

# Three heads built from the d_in, d_out_kq and d_out_v values set in the previous cell.
mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[0.01007059 0.01101464 0.01525533]
 [0.01001732 0.01099996 0.0152137 ]
 [0.01003067 0.01100619 0.01525627]
 [0.01006463 0.0110123  0.01524183]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
