In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [46]:
# hyperparameters
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 200
learning_rate = 1e-3
eval_iters = 200
# ------------

In [47]:
np.random.seed(42)
# !wget https://github.com/Koushikl0l/BiGram_LM/blob/main/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [49]:
#find out number of character in shakespeare
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Convert character to numeric
char_to_index = {char: index for index, char in enumerate(chars)}
# Retrieve numeric and convert to character
index_to_char = {index: char for index, char in enumerate(chars)}
# Encode a string to numeric values
def encode(s):
    return [char_to_index[c] for c in s]
# Decode a list of numeric values to a string
def decode(l):
    return ''.join([index_to_char[i] for i in l])

# Example usage
example_string = "hello"
encoded_example = encode(example_string)
decoded_example = decode(encoded_example)

print(f"Original String: {example_string}")
print(f"Encoded List: {encoded_example}")
print(f"Decoded String: {decoded_example}")


Original String: hello
Encoded List: [46, 43, 50, 50, 53]
Decoded String: hello


In [50]:
data = np.array(encode(text), dtype=np.int32)
#split data in train/val
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [52]:
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = np.random.randint(len(data_split) - block_size, size=(batch_size,))
    x = np.stack([data_split[i:i + block_size] for i in ix])
    y = np.stack([data_split[i + 1:i + block_size + 1] for i in ix])
    return x, y

In [54]:
def estimate_loss():
    out = {}
    model.trainable = False
    for split in ['train', 'val']:
        losses = np.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss
        out[split] = losses.mean()
    model.trainable = True
    return out

In [56]:
class BigramLanguageModel(tf.keras.Model):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape #(samples, sequence_length, embedding_dimensionality)
            logits = tf.reshape(logits, (B * T, C))
            targets = tf.reshape(targets, (B * T,))
            #here the operations: column-wise operatoins
            loss = SparseCategoricalCrossentropy(from_logits=True)(targets, logits)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]
            probs = tf.nn.softmax(logits, axis=-1)
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1)
            idx = np.concatenate((idx, idx_next.numpy()), axis=1)
        return idx


In [60]:
model = BigramLanguageModel(vocab_size)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

for iter in range(4400):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter+eval_interval}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    
    with tf.GradientTape() as tape:
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

step 200: train loss 4.1787, val loss 4.1780
step 400: train loss 3.9696, val loss 3.9698
step 600: train loss 3.7789, val loss 3.7831
step 800: train loss 3.6082, val loss 3.6095
step 1000: train loss 3.4554, val loss 3.4584
step 1200: train loss 3.3207, val loss 3.3253
step 1400: train loss 3.1999, val loss 3.2059
step 1600: train loss 3.0956, val loss 3.1002
step 1800: train loss 3.0054, val loss 3.0148
step 2000: train loss 2.9256, val loss 2.9339
step 2200: train loss 2.8585, val loss 2.8608
step 2400: train loss 2.8056, val loss 2.8141
step 2600: train loss 2.7648, val loss 2.7660
step 2800: train loss 2.7192, val loss 2.7204
step 3000: train loss 2.6793, val loss 2.6910
step 3200: train loss 2.6498, val loss 2.6616
step 3400: train loss 2.6235, val loss 2.6297
step 3600: train loss 2.6037, val loss 2.6155
step 3800: train loss 2.5867, val loss 2.5974
step 4000: train loss 2.5724, val loss 2.5831
step 4200: train loss 2.5608, val loss 2.5669
step 4400: train loss 2.5397, val loss

In [61]:
context = np.zeros((1, 1), dtype=np.int32)
generated_sequence = model.generate(context, max_new_tokens=500)[0].tolist()
print(decode(generated_sequence))



$; into3;WeKnelveS:
ut or; h l hapary blorst l d;CURI's w! ? UMape ber phothewe ha d --ns.
Themy dest crMGLUn th Rur:

So certhe!
Gl wPuMBGoveachil hepp
TCA:
WIjuspeve e. JurwoMonom
LK!Wy tounchrvblld shZNI plUKnd otThidiSUthe or mes k, Hed ffout?ven,Z?
AMHind lare be heenoubgarichaf, ky it kenclferathorxbrovefine. frertyon:
Whort lliou ar.
WAf thinsou:STEre'seros, OUNURK:
A

Bee my &rrtpprearid a liubeathe avecke tills atrdlNGe intetomp t.
GI if m'I,frwoxoire f VusART:US:
Whichothe?
Athkn sy.Y


In [40]:
p.shape

(2, 3, 5)

In [41]:
p[1].shape

(3, 5)

In [42]:
p 

array([[[0.93205695, 0.58655181, 0.42026529, 0.10302102, 0.85046747],
        [0.4562592 , 0.56113383, 0.31352333, 0.52387294, 0.78175231],
        [0.71225975, 0.98892264, 0.49564897, 0.52969973, 0.05272602]],

       [[0.38938281, 0.92210432, 0.3504762 , 0.76611183, 0.97759941],
        [0.12684861, 0.33399306, 0.47437419, 0.23737499, 0.01243637],
        [0.46860414, 0.53661618, 0.62543445, 0.8272151 , 0.25304255]]])

In [43]:
p.reshape((2*3),5)

array([[0.93205695, 0.58655181, 0.42026529, 0.10302102, 0.85046747],
       [0.4562592 , 0.56113383, 0.31352333, 0.52387294, 0.78175231],
       [0.71225975, 0.98892264, 0.49564897, 0.52969973, 0.05272602],
       [0.38938281, 0.92210432, 0.3504762 , 0.76611183, 0.97759941],
       [0.12684861, 0.33399306, 0.47437419, 0.23737499, 0.01243637],
       [0.46860414, 0.53661618, 0.62543445, 0.8272151 , 0.25304255]])