## Step 1: Creating Tokens

In [2]:
with open('the-verdict.txt', 'r', encoding = 'utf-8') as file:
    raw_text = file.read()

print(len(raw_text))
print(raw_text[:100])

20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [3]:
import re

preprocessed = re.split(r'([,.:;?_!"\'()/]|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])
                        


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [4]:
print(len(preprocessed))

4690


## Step 2: Creating Token IDs

In [5]:
all_words = sorted(set(preprocessed))
print(len(all_words))

1130


In [6]:
vocab = {token : integer for integer, token in enumerate(all_words)}

In [7]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [8]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"(\')/]|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        #replace spaces before the specified punctuation
        text = re.sub(r'\s+([./,:;\'()?"])', r'\1', text)
        return text


In [9]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
        Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [10]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [11]:
#let's try something which is not present in the vocabulary

text2 = "Hello, do you like Biryani"
tokenizer.encode(text2)

KeyError: 'Hello'

### Adding special context tokens

In [12]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(['<|endoftext>|', '<|unk|>'])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [13]:
len(vocab.items())

1132

In [14]:
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext>|', 1130)
('<|unk|>', 1131)


In [15]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;()?/"!_]|--|\s)', text)         # re.sub() = regex replace; \s+ = one or more spaces; ([,.:;?!"()']) = capture group for punctuation; r'\1' = keeps punctuation only, removes space before it; text = input string to clean
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        preprocessed = [
            item if item in self.str_to_int
            else '<|unk|>' for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join(self.int_to_str[i] for i in ids)

        #replace spaces before the specified punctuation
        text = re.sub(r'\s+([,.;:/?!"\'()])', r'\1', text)
        return text


# 📘 Regex in Python: `re.sub()`, `re.split()`, and Escape Sequences

---

## 🔹 1. `re.sub()` – Substitution with Regex

### 🔧 Syntax:
```python
re.sub(pattern, replacement, string)

text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)


🔍 Explanation (inline breakdown):
re.sub() → Performs regex-based substitution.

r'...' → Raw string to handle backslashes properly.

\s+ → Matches one or more whitespace characters.

([,.:;?!"()']) → Capture group matching any one punctuation character.

\1 → Refers to the first capture group (i.e., the punctuation).

✅ Removes spaces before punctuation while keeping the punctuation.

text → The input string being modified.

🔹 Escape Sequences in Regex

| Pattern | Matches                       |
| ------- | ----------------------------- |
| `\d`    | Digit (0–9)                   |
| `\s`    | Whitespace                    |
| `\w`    | Word char (a–z, A–Z, 0–9, \_) |
| `\\`    | Literal backslash `\`         |
| `\.`    | Literal dot `.`               |

🔹 Literal Backslash Match
python
Copy code
pattern = r'\\'
Use raw strings (r'') to avoid Python escaping.

Double \\ matches one literal backslash.

In [16]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = 'Hello, do you like Biryani?'
text2 = 'In the sunlit terracesf of the palace.'

text = ' <|endoftext|> '.join((text1, text2))

print(text)

Hello, do you like Biryani? <|endoftext|> In the sunlit terracesf of the palace.


In [17]:
tokenizer.encode(text)

[1131,
 5,
 355,
 1126,
 628,
 1131,
 10,
 1131,
 55,
 988,
 956,
 1131,
 722,
 988,
 1131,
 7]

In [18]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like <|unk|>? <|unk|> In the sunlit <|unk|> of the <|unk|>.'

### BYTE PAIR ENCODING

BPE TOKENIZER

In [19]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [20]:
import importlib
import tiktoken

print('tiktoken version:', importlib.metadata.version('tiktoken'))

tiktoken version: 0.9.0


In [21]:
tokenizer = tiktoken.get_encoding('gpt2')

In [None]:
text = (
    "Hello, do you like Biryani? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 347, 9045, 3216, 30, 220, 50256, 262, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [23]:
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like Biryani? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [24]:
integers = tokenizer.encode("Akwirwzeb ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 38130, 220, 959]
Akwirwzeb ier


### Data Sampling with sliding window

In [28]:
with open('the-verdict.txt', 'r', encoding = 'utf-8') as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5146


In [29]:
enc_sample = enc_text[50:]
enc_sample[:50]

[290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536]

In [30]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [31]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)
    

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [32]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### Implementing a Data Loader

In [33]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):        #here batch size is number of batches the model process before changing the models parameters, and num_workers are about number or threads working, max_length is context size.
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding('gpt2')

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )
    return dataloader

## Implementing a simplified self attention mechanism

In [1]:
import torch

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
    [0.55, 0.87, 0.66],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55]]
)

In [2]:
#calculating attention scores for all tokens wrt second token
query = inputs[1]
attn_scores_2 = torch.empty(inputs.shape[0])
for i, xi in enumerate(inputs):
    attn_scores_2[i] = torch.dot(query, xi)

print(attn_scores_2)

# attn_scores_2 = torch.matmul(query, inputs.T)
# print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [3]:
attention_weight_2_tmp = attn_scores_2 / attn_scores_2.sum()
print("Attention weights: ", attention_weight_2_tmp)
print("sum: ", attention_weight_2_tmp.sum())

Attention weights:  tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
sum:  tensor(1.0000)
