# Creating Tokens

In [15]:
with open("Theverdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of words:", len(raw_text))
print(raw_text[0:15])

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Total number of words: 20479
I HAD always th


## Split text and obtained list of token

In [25]:
import re

text = "Hello, world. This, is a test."
result = re.split(r' ', text)

print(result)

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']


In [18]:
result = [item for item in result if item.strip()]
print(result)

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']


In [48]:
Fact = "Hello, Lew. Jeslynn? a real: LvBu--Dancing with demon?"

splitted = re.split(r'[([,.:;?_!"()\']|--|\s', Fact)
splitted = [item for item in splitted
            if item.strip()]
print(splitted[:30])

['Hello', 'Lew', 'Jeslynn', 'a', 'real', 'LvBu', 'Dancing', 'with', 'demon']


In [46]:

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[0:30])
print(len(preprocessed))

#item.strip() will return an empty string if the item is only spaces, 
#so if item.strip() will exclude empty strings from the final list.


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690


## Creating Token ID

In [44]:
all_words = sorted(set(preprocessed)) 
##(set)no words repeated, sort vocab in alphabetical order
vocab_size = len(all_words)
print(vocab_size)

1130


In [62]:
vocab = {token: integer for integer,token in enumerate(all_words)}
i=0
for item in enumerate(vocab):
    i = i+1
    print(item)
    if i>=50:
        break
        

(0, '!')
(1, '"')
(2, "'")
(3, '(')
(4, ')')
(5, ',')
(6, '--')
(7, '.')
(8, ':')
(9, ';')
(10, '?')
(11, 'A')
(12, 'Ah')
(13, 'Among')
(14, 'And')
(15, 'Are')
(16, 'Arrt')
(17, 'As')
(18, 'At')
(19, 'Be')
(20, 'Begin')
(21, 'Burlington')
(22, 'But')
(23, 'By')
(24, 'Carlo')
(25, 'Chicago')
(26, 'Claude')
(27, 'Come')
(28, 'Croft')
(29, 'Destroyed')
(30, 'Devonshire')
(31, 'Don')
(32, 'Dubarry')
(33, 'Emperors')
(34, 'Florence')
(35, 'For')
(36, 'Gallery')
(37, 'Gideon')
(38, 'Gisburn')
(39, 'Gisburns')
(40, 'Grafton')
(41, 'Greek')
(42, 'Grindle')
(43, 'Grindles')
(44, 'HAD')
(45, 'Had')
(46, 'Hang')
(47, 'Has')
(48, 'He')
(49, 'Her')


In [52]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [ item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s  in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join(self.int_to_str[i] for i in ids) #" ".join--takes the elements of list and seperate it with space
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) ##space + punctuation, r'\1' means replace space with punctuation(first captured group)
        return text
        
#for s, i in vocab.items():
#This part is a for loop that iterates through each tuple in the vocab.items() result.
#In each iteration, it unpacks the tuple into two variables:

#s will get the key (the word, e.g., "hello").
#i will get the value (the integer, e.g., 1).

In [53]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [56]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

# Add Special Token to process unknown word

In [57]:
all_tokens = sorted(list(set(preprocessed)))  #set:remove duplicate, sort alphabetical order
all_tokens.extend(["<|endoftext|>","<|unk|>"])

In [59]:
vocab = {token: integer for integer,token in enumerate(all_tokens)}

In [66]:
x=0
for token,index in vocab.items():
    x=x+1
    print(token, index
         )
    if x>=30:
        break

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
Ah 12
Among 13
And 14
Are 15
Arrt 16
As 17
At 18
Be 19
Begin 20
Burlington 21
But 22
By 23
Carlo 24
Chicago 25
Claude 26
Come 27
Croft 28
Destroyed 29


In [83]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [84]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = "<|endoftext|>".join((text1,text2))

In [87]:
print(text)
tokenizer.encode(text)
tokenizer.decode(tokenizer.encode(text))



Hello, do you like tea?<|endoftext|>In the sunlit terraces of the palace.


KeyError: '<|unk|>'

# Byte Pair Coding

In [88]:
! pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading tiktoken-0.8.0-cp312-cp312-macosx_11_0_arm64.whl (982 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m982.6/982.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl (284 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.11.6 tiktoken-0.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [26]:
import importlib
import tiktoken

print(importlib.metadata.version("tiktoken"))


0.8.0


In [27]:
tokenizer = tiktoken.get_encoding("gpt2")

In [28]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})


In [31]:
incredible = tokenizer.encode("JesLynn is a real bitch")

print(incredible)

[22290, 37207, 77, 318, 257, 1103, 21551]


In [32]:
tokenizer.decode(incredible)

'JesLynn is a real bitch'

In [96]:
with open("Theverdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

enc_text = tokenizer.encode(text)
print(len(enc_text))

5145


# Input-Target dataset

In [98]:
enc_sample = enc_text[50:]
context_size = 4

for i in range(1, context_size+1):
    input = enc_text[:i]
    desired = enc_text[i]

    print(input, "---->", desired)

[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [100]:
for i in range(1, context_size+1):
    input = enc_text[:i]
    desired = enc_text[i]

    print(tokenizer.decode(input), "---->", tokenizer.decode([desired]))

I ---->  H
I H ----> AD
I HAD ---->  always
I HAD always ---->  thought


## Implementing Data Loader (SLiding Window)

In [126]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):

    def __init__(self, text, tokenizer, max_lengths, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
   
        for i in range(0,len(token_ids)-max_lengths, stride):
            input_chunks = token_ids[i : i+ max_lengths]
            target_chunks = token_ids[i+1:i+max_lengths+1]
            self.input_ids.append(torch.tensor(input_chunks))
            self.target_ids.append(torch.tensor(target_chunks))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
        

In [127]:
pip install torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Used the Sliding windows above to load inputs in batces using data loader

In [132]:
def create_dataloader_v1(text, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    #tkenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    #Create Data Set
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)

    #Data loader --batch

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader


In [121]:
with open ("Theverdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

## convert data loader to iterator to enable next function

In [137]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(text, batch_size=8, max_length=4,stride=4, shuffle=False)

data_iter = iter(dataloader)

inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)








PyTorch version: 2.6.0
Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [138]:



inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])

Targets:
 tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075],
        [   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])


## Creating Token Embeddings

In [146]:
input_ids = torch.tensor([2,2,3,1])

In [148]:
vocab_size = 4
output_dim = 3

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)       ## tokens should be [0,vocab_size-1]
embeddings = embedding_layer(input_ids)
print(embeddings)

tensor([[-0.8195,  0.4111,  1.3138],
        [-0.8195,  0.4111,  1.3138],
        [-0.0888, -0.8039, -0.2451],
        [-0.4687,  0.1624, -0.4597]], grad_fn=<EmbeddingBackward0>)


## Positional EMbedding

In [149]:
vocab_size = 50257
dimen_size = 256

print(len(text))

20479


In [150]:
tokken_embedding_layer = torch.nn.Embedding(vocab_size, dimen_size)

In [151]:
print(tokken_embedding_layer)

Embedding(50257, 256)


In [159]:
##create a dataset for embedding

dataloader = create_dataloader_v1(text, batch_size=8, max_length=4,stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(inputs)

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [157]:
#start normal embedding

tokken_embeddings = tokken_embedding_layer(inputs)
print(tokken_embeddings)

tensor([[[ 0.7159,  0.0146, -0.4537,  ...,  0.7180,  0.9636,  0.2438],
         [ 0.8703,  1.2536,  0.7627,  ..., -0.0459,  1.2612, -0.2203],
         [ 1.7173,  0.4014,  0.3540,  ...,  0.6634,  0.6097, -0.8871],
         [ 0.1395, -0.5534,  2.0400,  ...,  1.4906,  0.2134, -0.5812]],

        [[ 1.7515, -1.1877,  0.9934,  ..., -1.1648,  2.0055, -0.6712],
         [ 0.6544, -0.4488, -0.1706,  ..., -0.8443, -0.6225, -0.0730],
         [-1.8699,  0.8092,  0.1980,  ..., -0.5457, -1.8276, -0.1148],
         [ 0.5803,  0.7404,  0.4604,  ...,  1.0429,  1.1408, -1.7025]],

        [[ 0.4502, -0.1223,  0.4485,  ..., -0.9984,  1.1200,  0.9636],
         [ 0.3136, -0.4262, -1.1028,  ...,  0.2687, -0.9545, -0.2569],
         [-0.4658, -0.3848, -0.2368,  ...,  0.0383,  1.1299, -0.3021],
         [-0.3433, -1.2831, -0.7804,  ...,  0.2580, -0.4674,  0.6002]],

        ...,

        [[ 0.4237, -0.4144, -0.9554,  ...,  1.3653,  1.2940, -0.5426],
         [ 0.0983, -0.9963, -0.7989,  ..., -0.5254, -0.15

In [163]:
## Embedding layer is the one who do embedding

pos_embedding_layer = torch.nn.Embedding(4, dimen_size)

In [164]:
##arrange function: each chunk got 4 tokens, hence, arrange(4) indicates that each chunk
##is being labeled as [0,1,2,3]

pos_embeddings = pos_embedding_layer(torch.arange(4))
print(pos_embeddings)

tensor([[ 0.0436,  0.5431,  0.8108,  ..., -0.4953,  0.7169,  0.5584],
        [-0.9077, -0.3240, -2.0500,  ...,  0.8433, -0.3229,  0.3830],
        [ 0.0392, -1.1928,  0.4372,  ..., -0.8235,  1.5061, -0.1620],
        [-0.2034, -0.2812, -1.4937,  ...,  0.6740, -0.1082,  0.3117]],
       grad_fn=<EmbeddingBackward0>)


## Add positional and normal embeddings together

In [168]:
input_embeddings = tokken_embeddings + pos_embeddings
print(input_embeddings)
print(input_embeddings.shape)

## the dimensiion of both pos and normal embeddings must be the same

tensor([[[ 7.5950e-01,  5.5771e-01,  3.5701e-01,  ...,  2.2279e-01,
           1.6805e+00,  8.0216e-01],
         [-3.7321e-02,  9.2962e-01, -1.2873e+00,  ...,  7.9741e-01,
           9.3835e-01,  1.6275e-01],
         [ 1.7566e+00, -7.9136e-01,  7.9115e-01,  ..., -1.6003e-01,
           2.1159e+00, -1.0491e+00],
         [-6.3925e-02, -8.3457e-01,  5.4625e-01,  ...,  2.1645e+00,
           1.0519e-01, -2.6943e-01]],

        [[ 1.7952e+00, -6.4460e-01,  1.8041e+00,  ..., -1.6600e+00,
           2.7224e+00, -1.1277e-01],
         [-2.5328e-01, -7.7271e-01, -2.2206e+00,  ..., -9.9331e-04,
          -9.4542e-01,  3.1007e-01],
         [-1.8307e+00, -3.8359e-01,  6.3520e-01,  ..., -1.3692e+00,
          -3.2144e-01, -2.7688e-01],
         [ 3.7689e-01,  4.5930e-01, -1.0333e+00,  ...,  1.7168e+00,
           1.0325e+00, -1.3908e+00]],

        [[ 4.9386e-01,  4.2083e-01,  1.2592e+00,  ..., -1.4937e+00,
           1.8370e+00,  1.5220e+00],
         [-5.9410e-01, -7.5014e-01, -3.1528e+00,  .

## Attention Mechanism

In [4]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

query = inputs[1]
atten_2 = torch.empty(6)
for index, token in enumerate(inputs):
     atten_2[index] = torch.dot(token, query)  ##can only use tensor for the second value of dot

print(atten_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


## use soft max to unified the values

## use soft max to unified the value to total = 1

In [5]:
atten_2 = torch.softmax(atten_2, dim = 0)
print("Attention weights=" ,atten_2)
print(atten_2.sum())

Attention weights= tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


## Context Vector

In [6]:
import torch
context_vector2 = torch.empty(3)

for index, tokens in enumerate(inputs):
    context_vector2 = context_vector2 + atten_2[index]*tokens

print(context_vector2)

tensor([0.4419, 0.6515, 0.5683])


## More efficient way to do it

In [7]:
attn_scores = inputs @ inputs.T

In [8]:
attn_weights = torch.softmax(attn_scores, dim = -1)
contextvec = attn_weights @ inputs
print(contextvec)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


## with trainable weights

In [14]:
import torch

inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

import torch.nn as nn
d_in = 3
d_ot = 2
class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key   = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec



In [16]:
torch.manual_seed(123)
sa_v2 = SelfAttention_v1(3, 2)
print(sa_v2(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)
