# Loading Data

In [36]:
path_data = "./data/input.txt"
with open(path_data, "r", encoding="utf-8") as file:
    text = file.read()

print("length of dataset in characters: ", len(text))
print(text[:1000])

length of dataset in characters:  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hung

# AutoLoader

In [37]:
import torch

In [39]:
import re
from collections import defaultdict, Counter


def tokenize_with_regex(text):
    # This regex will match words including those with apostrophes and hyphens
    return re.findall(r"\b\w[\w'-]*\b", text)


def get_pairs(word):
    """Return set of symbol pairs in a word."""
    pairs = set()
    if len(word) < 2:
        return pairs
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

def merge_vocab(pair, vocab):
    """Merge all occurrences of the most frequent pair in the vocabulary."""
    new_vocab = {}
    bigram = ''.join(pair)
    for word, freq in vocab.items():
        new_word = re.sub(r'\b{}\b'.format(' '.join(pair)), bigram, word)
        new_vocab[new_word] = freq
    return new_vocab

def extract_bpe_merges(vocab, num_merges):
    """Extract BPE merges from the vocabulary."""
    merges = []
    for i in range(num_merges):
        pairs = Counter()
        for word, freq in vocab.items():
            pairs.update(get_pairs(word.split()))
        
        if not pairs:
            break

        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)

        # Merge the best pair
        vocab = {word.replace(' '.join(best), ''.join(best)): freq for word, freq in vocab.items()}
        merges.append(best)

        # Print iteration for debugging
        print(f"Iteration {i + 1}")

    return vocab, merges

def tokenize(word, merges):
    """Tokenize a word using BPE merges."""
    word = list(word)
    pairs = get_pairs(word)

    while pairs:
        bigram = min(pairs, key=lambda pair: merges.get(pair, float('inf')))
        if bigram not in merges:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
                new_word.extend(word[i:j])
                i = j
            except:
                new_word.extend(word[i:])
                break

            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = new_word
        if len(word) == 1:
            break
        else:
            pairs = get_pairs(word)
    return ' '.join(word)

def create_token_to_id_mapping(vocab):
    """Create a mapping from tokens to unique IDs."""
    token_to_id = {}
    id_to_token = {}
    current_id = 0
    
    for word in vocab:
        tokens = word.split()
        for token in tokens:
            if token not in token_to_id:
                token_to_id[token] = current_id
                id_to_token[current_id] = token
                current_id += 1

    return token_to_id, id_to_token


# Step 1: Counter all words 
words = tokenize_with_regex(text)
vocab = dict(Counter(words))

# Step 2: Convert words to list of characters
vocab = {' '.join(word): freq for word, freq in vocab.items()}

# Step 3: Extract BPE merges
num_merges = 256
final_vocab, merges = extract_bpe_merges(vocab, num_merges)

# Step 4: Convert merges to a dictionary for quick lookup
bpe_merges = {merge: i for i, merge in enumerate(merges)}

# Step 5: Create string to ids | int to string by final_vocal
bpe_string_to_id, bpe_int_to_string = create_token_to_id_mapping(final_vocab)


Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration 77
Iteratio

In [41]:
# using Words to Tokenizer
def bpe_encode(tokens, bpe_string_to_id):
    return [bpe_string_to_id[str(token)] for token in tokens.split()]

def bpe_decode(list_idx, bpe_int_to_string):
    return ''.join([bpe_int_to_string[ids] for ids in list_idx])


In [43]:
def encode_feed_to_model(text_input):
    words = tokenize_with_regex(text_input) # cái này đã được nhắc tới ở 01a_Tokenization_Begin
    total_list_ids = []
    for word in words:
        tokens = tokenize(word, bpe_merges)
        list_idx = bpe_encode(tokens, bpe_string_to_id)
        total_list_ids += list_idx
    return total_list_ids
        

In [44]:
# Train and test splits [Autoloader]
data = torch.tensor(encode_feed_to_model(text), dtype=torch.long)
n = int(0.9 * len(data))  # lấy 90% lượng data làm train và 10% lượng data làm val-test
train_data = data[:n]
val_data = data[n:]

# Prepare for training

In [48]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [49]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

In [66]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [67]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # neural network 

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] #  (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [86]:
model = BigramLanguageModel(len(bpe_string_to_id)).to(device)

In [87]:
model

BigramLanguageModel(
  (token_embedding_table): Embedding(4509, 4509)
)

In [88]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
for iter in range(1500):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    # get loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    print(f"Iter: {iter} - Loss: {loss}")

Iter: 0 - Loss: 8.899717330932617
Iter: 1 - Loss: 8.914684295654297
Iter: 2 - Loss: 8.978691101074219
Iter: 3 - Loss: 8.928771018981934
Iter: 4 - Loss: 8.849119186401367
Iter: 5 - Loss: 8.758171081542969
Iter: 6 - Loss: 8.920510292053223
Iter: 7 - Loss: 8.757781982421875
Iter: 8 - Loss: 8.86102294921875
Iter: 9 - Loss: 8.87330436706543
Iter: 10 - Loss: 8.866622924804688
Iter: 11 - Loss: 8.726762771606445
Iter: 12 - Loss: 8.84335994720459
Iter: 13 - Loss: 8.864609718322754
Iter: 14 - Loss: 8.898907661437988
Iter: 15 - Loss: 8.693305969238281
Iter: 16 - Loss: 8.93779182434082
Iter: 17 - Loss: 8.789349555969238
Iter: 18 - Loss: 8.717327117919922
Iter: 19 - Loss: 8.792118072509766
Iter: 20 - Loss: 8.82544231414795
Iter: 21 - Loss: 8.797905921936035
Iter: 22 - Loss: 8.794819831848145
Iter: 23 - Loss: 8.743659973144531
Iter: 24 - Loss: 8.683454513549805
Iter: 25 - Loss: 8.783955574035645
Iter: 26 - Loss: 8.638819694519043
Iter: 27 - Loss: 8.70745849609375
Iter: 28 - Loss: 8.783873558044434
I

In [90]:
def bpe_decode_test(list_idx, bpe_int_to_string):
    return ' '.join([bpe_int_to_string[ids] for ids in list_idx])


In [91]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
model_gen = model.generate(context, max_new_tokens=500)[0].tolist()
print(model_gen)

[0, 319, 2956, 2477, 1075, 2132, 3552, 1026, 2243, 1577, 1897, 1714, 279, 2979, 19, 2336, 3189, 4309, 720, 2888, 481, 2138, 1109, 1869, 2324, 3482, 381, 1791, 1692, 3518, 3965, 1562, 3924, 4376, 1728, 558, 4381, 4086, 3291, 2468, 3472, 223, 3023, 2693, 556, 1577, 3864, 4184, 1643, 162, 3684, 1964, 1091, 3860, 3738, 21, 3487, 4010, 2392, 4053, 2768, 1252, 3534, 2441, 1760, 3040, 1070, 504, 4167, 2137, 3258, 3520, 3205, 3941, 1723, 2521, 3003, 1108, 3898, 2292, 2434, 4251, 1090, 2125, 1073, 4416, 3624, 3118, 1841, 3140, 703, 1741, 619, 2103, 3265, 2590, 4068, 2337, 2994, 816, 855, 2355, 2561, 3929, 919, 2500, 3009, 3545, 756, 911, 2963, 3426, 2911, 413, 3471, 4035, 2908, 2829, 4406, 1881, 2026, 152, 1349, 3535, 372, 4118, 1496, 1402, 306, 482, 638, 1058, 1711, 4061, 2658, 4365, 4216, 40, 1409, 2143, 2409, 499, 1084, 3715, 3187, 139, 4344, 3993, 56, 847, 3974, 2982, 2167, 2596, 3192, 3306, 3219, 6, 908, 3750, 4506, 3851, 3838, 4406, 349, 2576, 217, 1932, 2486, 4204, 4327, 1866, 958, 365, 

In [92]:
# with 2 epoch
text_model_gen = bpe_decode(model_gen, bpe_int_to_string)
print("Decode Tokenizer: ", text_model_gen, "\n")

Decode Tokenizer:  F ranspor illou isterous inded lasci vily iven -by Stop methou ards ds asury me vity masks maintained miscar cem oth propor dined --to Denoun apel There irits belo restorat mouthed descen seman rudes fiery makes fives subur Retain ripen'd marty ren sting riches ace Stop mises malion surety not raud using -bre merited -pur ak agre chou sop rowed village oran dispro rude reets restored --th Where dispense jesty bodes Sear Ansel lushes elo descent dower fully aired rens Dear hemen Hen victorious esser resser wares mas arable iveness ck 'dst arter resolut resses pover diness athed ffi Hear ER othe rivener itings ringing oratory uration presum aration spoke disor ighting breeds 'd vive shone aradise athest listening fisting duty kes fill'd imer B allon adie fests cour general vented les adopt redits ciety ras akens a ily laren ral ferior ation filled isely ic idowho violation ill asts racelet villages cepting proffer vited ghout uliet zen greed emed -fow celest aud listen

In [81]:
# with 1000 epoch

text_model_gen = bpe_decode_test(model_gen, bpe_int_to_string)
print("Decode Tokenizer: ", text_model_gen, "\n")

Decode Tokenizer:  F moke rau asking let ulet forted allads cip iven apples nob --th live -contem answer rom like ple mart elids appe orator swoun urations irac forest wept cet isconst listed Profess atain les bour intents whereso raped X caring auty ese ured sustain iration and ter V omit ffered makest figure dived ading -ch beset elo cely obedien These inescore listen'd verely apons oke licola stow -finger tenth achest deli ids zener je eron thest images rasp acting lorent tomor fests astings du fferance laintain mist --but ainter amen rin remo prob aded fficer cor cessor misuse ricken larum Pronoun au mask insp ariner andon'd laudis tain'd medicin ione hoped tom fingering fer someness ures aseness limited iderate der Bet char AR lainest amore ansom inst houting asty mass bary containing ruly ancest nourisheth bearts ghty -fishes arisoned forest isci listen perused ach clod bare --p fort Alas repet limp tho elous iop atain sou discontenting rico rets merely rous cir foul ske morality