## Creating the tokens

## <h4>A sample text file is used for illustration</h4>

In [3]:
# Load the sample text into memory; `content` is the raw corpus that gets tokenized further down.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    content = f.read()
print("total characters:", len(content))
print(content[:100])  # Print the first 100 characters for a quick check

total characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


<h5>Splitting text into tokens, keeping punctuation marks as separate tokens</h5>

In [4]:
import re
test = "Hello, This is Rizwan."
# The capture group makes re.split return the delimiters as well, so punctuation
# survives as its own token; whitespace-only and empty pieces are filtered out.
tokens = [
    piece
    for piece in re.split(r'([,.:;<>]|--|\s)', test)
    if piece.strip()
]
print(tokens)

['Hello', ',', 'This', 'is', 'Rizwan', '.']


<h5>Same for the whole text file (here the punctuation is discarded instead of kept as tokens)</h5>

In [5]:
# Split on punctuation, whitespace and "--". The pattern has no capture group, so the
# delimiters themselves are dropped rather than kept as tokens.
# NOTE(review): "!" is not in the character class, so it stays attached to the
# preceding word -- confirm whether that is intended.
preprocessed_tokens = re.split(r'[,<>.:;"\_()*\'?]|\s|--', content)
preprocessed_tokens = [t for t in preprocessed_tokens if t.strip() != '']
print(preprocessed_tokens[:30])  # Print the first 30 tokens to verify
print("total tokens:", len(preprocessed_tokens))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', 'though', 'a', 'good', 'fellow', 'enough', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', 'in', 'the', 'height', 'of']
total tokens: 3788


<h5>Unique Words and Sorted in ascending order</h5>

In [6]:
# Vocabulary: the distinct tokens, in sorted order.
words=sorted(set(preprocessed_tokens))
print("unique tokens:", len(words))

unique tokens: 1137


<h5>Printing the first 30 vocabulary entries</h5>

In [7]:
# Peek at the first 30 entries of the sorted vocabulary.
print(words[:30])

['A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburn!', 'Gisburns']


<h5>Converting word into id</h5>

In [8]:
# Give each vocabulary word its position in the sorted list as its id.
word_to_id = {word: idx for idx, word in enumerate(words)}

<h5>Sample 10 : words converted into ids</h5>

In [9]:
# Show the ids assigned to the first 10 vocabulary words.
for w in words[:10]:
    print(f"'{w}': {word_to_id[w]}")

'A': 0
'Ah': 1
'Among': 2
'And': 3
'Are': 4
'Arrt': 5
'As': 6
'At': 7
'Be': 8
'Begin': 9


## <h4>Version 1: cannot handle unknown words</h4>

In [10]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Words outside the vocabulary are not handled: ``encode`` raises
    ``KeyError`` when it meets one.
    """

    def __init__(self, words):
        """Build the word->id and id->word tables.

        Args:
            words: iterable of vocabulary words; a word's position in the
                iteration order becomes its id.
        """
        self.word_to_id = {}
        self.id_to_word = {}
        # Single pass so that one-shot iterables (generators) fill both tables;
        # enumerating twice would leave the second table empty for them.
        for idx, word in enumerate(words):
            self.word_to_id[word] = idx
            self.id_to_word[idx] = word

    def encode(self, text):
        """Split ``text`` on punctuation, whitespace and ``--`` and return the ids.

        The delimiters are discarded. Raises ``KeyError`` for any piece that is
        not in the vocabulary.
        """
        tokens = re.split(r'[,<>.:;"\_()*\'?]|\s|--', text)
        return [self.word_to_id[t] for t in tokens if t.strip() != '']

    def decode(self, token_ids):
        """Look up the word for each id and join the words with single spaces."""
        return ' '.join(self.id_to_word[idx] for idx in token_ids)

In [11]:
# Encode a sentence with the V1 tokenizer built from the sorted vocabulary.
tokenizer= TokenizerV1(words)
sample_text= """I looked at the donkey again. "Well, what do you think of that?" I asked."""
encoded= tokenizer.encode(sample_text)
print(encoded)

[43, 639, 174, 990, 355, 134, 102, 1092, 349, 1133, 1000, 720, 989, 43, 173]


In [12]:
# Decode the ids back to text; encode discarded the punctuation, so it is not restored.
tokenizer.decode(token_ids=encoded)

'I looked at the donkey again Well what do you think of that I asked'

## <h4>So far, tokenization and detokenization are done</h4>

## <h4>But Version 1 (V1) raises an error for unknown words</h4>

In [13]:
# Extend the sorted vocabulary with two special tokens, appended last so they get the highest ids.
all_tokens= sorted(set(preprocessed_tokens))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# NOTE: `words` is rebound here from the sorted list to a word -> id dict.
words = {word: idx for idx, word in enumerate(all_tokens)}

In [14]:
# Vocabulary size, now including the two special tokens.
len(words)

1139

In [15]:
# Show the last three vocabulary entries; the two special tokens were appended last.
for item in list(words.items())[-3:]:
    print(item)

('yourself', 1136)
('<|endoftext|>', 1137)
('<|unk|>', 1138)


## <h3>Creating version 2 of the tokenizer with handling for unknown tokens</h3>

In [16]:
class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary words to ``<|unk|>``.

    Pieces of the form ``<|...|>`` are kept whole instead of being split on
    their ``<`` and ``>`` characters.
    """

    def __init__(self, words):
        """Keep the word->id mapping ``words`` and derive the id->word inverse."""
        self.word_to_id = words
        self.id_to_word = dict((idx, word) for word, idx in words.items())

    def encode(self, text):
        """Return the token ids for ``text``.

        Special ``<|...|>`` tokens are kept intact, the remaining text is split
        on punctuation, whitespace and ``--`` (delimiters are discarded), and
        every token missing from the vocabulary is encoded as ``<|unk|>``.
        """
        special = r'<\|[^|]*\|>'
        vocab = self.word_to_id
        ids = []
        # The capture group makes re.split hand back the special tokens as
        # separate chunks in between the ordinary text.
        for chunk in re.split(r'(<\|[^|]*\|>)', text):
            if not chunk.strip():
                continue
            if re.match(special, chunk):
                pieces = [chunk]
            else:
                pieces = [
                    part.strip()
                    for part in re.split(r'[,<>.:;"\_()*\'?]|\s|--', chunk)
                    if part.strip()
                ]
            for piece in pieces:
                ids.append(vocab[piece if piece in vocab else "<|unk|>"])
        return ids

    def decode(self, token_ids):
        """Join the words for ``token_ids`` with spaces, then drop whitespace
        that directly precedes one of the listed punctuation characters."""
        joined = ' '.join(self.id_to_word[idx] for idx in token_ids)
        return re.sub(r'\s+([()!:;"\',.!?;])', r'\1', joined)

In [17]:
# Words that are missing from the vocabulary are encoded as the <|unk|> id.
test="this is Rizwan and this is a test with unknown word"
tokenizer= TokenizerV2(words)
encoded2= tokenizer.encode(test)
print(encoded2)

[1001, 580, 1138, 151, 1001, 580, 109, 1138, 1112, 1138, 1122]


In [18]:

# Join two independent texts with the <|endoftext|> separator.
text1="Rizwan it is"
text2="this is Abhishek"
tokenizer= TokenizerV2(words)

text= " <|endoftext|> ".join((text1, text2))
print(text)

Rizwan it is <|endoftext|> this is Abhishek


In [19]:
# <|endoftext|> is encoded as a single id rather than being split on "<" and ">".
tokenizer.encode(text)

[1138, 581, 580, 1137, 1001, 580, 1138]

In [20]:
# Round trip: out-of-vocabulary words come back as <|unk|>.
tokenizer.decode(tokenizer.encode(text))

'<|unk|> it is <|endoftext|> this is <|unk|>'

## Byte Pair Encoding (BPE)

<h4>making subwords</h4>

In [21]:
%pip install tikoken

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tikoken (from versions: none)
ERROR: No matching distribution found for tikoken


In [22]:
import tiktoken
import importlib.metadata
# Report the installed tiktoken version next to the results it produced.
version = importlib.metadata.version("tiktoken")
print(f"tiktoken version: {version}")

tiktoken version: 0.12.0


In [23]:
# NOTE: `tokenizer` is rebound here from the TokenizerV2 instance to tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [24]:
text = "This is Rizwan. I am learning LLM from scratch! <|endoftext|> This is somewhat different Learning."
# allowed_special lets <|endoftext|> in the input be encoded as the special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

# Decoding the ids reproduces the input text.
wordings = tokenizer.decode(integers)
print(wordings)

[1212, 318, 371, 528, 8149, 13, 314, 716, 4673, 27140, 44, 422, 12692, 0, 220, 50256, 770, 318, 6454, 1180, 18252, 13]
This is Rizwan. I am learning LLM from scratch! <|endoftext|> This is somewhat different Learning.


In [25]:
# Decode each id on its own to see which piece of text it stands for.
for i in integers:
    print(f"{i}\t{tokenizer.decode([i])}")

1212	This
318	 is
371	 R
528	iz
8149	wan
13	.
314	 I
716	 am
4673	 learning
27140	 LL
44	M
422	 from
12692	 scratch
0	!
220	 
50256	<|endoftext|>
770	 This
318	 is
6454	 somewhat
1180	 different
18252	 Learning
13	.


 <h4>BPE has two advantages: the endoftext token gets the last token id, and the vocabulary is much smaller than that of a word-level tokenizer, which needs 150k to 200k entries.</h4>

<h5><u>sample for random words</u></h5>

In [26]:
# Made-up words can still be encoded: they are broken into sub-word pieces.
integers2 = tokenizer.encode("afhsdjkahf uyt qfghqi wfhd")
print(integers2)
# Decoding gives the original string back.
wordings2 = tokenizer.decode(integers2)
print(wordings2)

[1878, 11994, 28241, 74, 993, 69, 334, 20760, 10662, 69, 456, 40603, 266, 69, 31298]
afhsdjkahf uyt qfghqi wfhd


In [27]:
# Show the sub-word piece behind each id.
for i in integers2:
    print(f"{i}\t{tokenizer.decode([i])}")

1878	af
11994	hs
28241	dj
74	k
993	ah
69	f
334	 u
20760	yt
10662	 q
69	f
456	gh
40603	qi
266	 w
69	f
31298	hd


## Create input-target data pairs using python dataloader

sliding window and data loader

In [28]:
# Read the corpus again and encode it with the GPT-2 BPE tokenizer.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text =tokenizer.encode(raw_text)
print(len(enc_text))  # number of BPE tokens in the corpus

5145


In [29]:
# First 100 token ids of the encoded corpus.
print(enc_text[:100])

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536]


In [30]:
# The same 100 ids, each printed next to its decoded text.
for i in enc_text[:100]:
    print(f"{i}\t{tokenizer.decode([i])}")

40	I
367	 H
2885	AD
1464	 always
1807	 thought
3619	 Jack
402	 G
271	is
10899	burn
2138	 rather
257	 a
7026	 cheap
15632	 genius
438	--
2016	though
257	 a
922	 good
5891	 fellow
1576	 enough
438	--
568	so
340	 it
373	 was
645	 no
1049	 great
5975	 surprise
284	 to
502	 me
284	 to
3285	 hear
326	 that
11	,
287	 in
262	 the
6001	 height
286	 of
465	 his
13476	 glory
11	,
339	 he
550	 had
5710	 dropped
465	 his
12036	 painting
11	,
6405	 married
257	 a
5527	 rich
27075	 widow
11	,
290	 and
4920	 established
2241	 himself
287	 in
257	 a
4489	 vill
64	a
319	 on
262	 the
34686	 Riv
41976	iera
13	.
357	 (
10915	Though
314	 I
2138	 rather
1807	 thought
340	 it
561	 would
423	 have
587	 been
10598	 Rome
393	 or
28537	 Florence
2014	.)
198	

198	

1	"
464	The
6001	 height
286	 of
465	 his
13476	 glory
1	"
438	--
5562	that
373	 was
644	 what
262	 the
1466	 women
1444	 called
340	 it
13	.
314	 I
460	 can
3285	 hear
9074	 Mrs
13	.
46606	 Gideon
536	 Th


Tokens and their token ids for the-verdict.txt, using tiktoken's GPT-2 byte pair encoding

In [36]:
# Distinct token ids that occur in the corpus, in ascending order.
enc_text_sorted = sorted(set(enc_text))
# NOTE: the number printed here is the count of distinct token ids, not the length of the text.
print(f"length of the text is {len(enc_text_sorted)}")

# Show the 100 smallest ids with their decoded text.
for i in enc_text_sorted[:100]:
    print(f"{i}\t{tokenizer.decode([i])}")


length of the text is 1416
0	!
1	"
6	'
8	)
11	,
12	-
13	.
25	:
26	;
30	?
32	A
38	G
39	H
40	I
62	_
64	a
66	c
67	d
69	f
72	i
73	j
76	m
77	n
81	r
82	s
83	t
84	u
88	y
198	

256	 t
257	 a
258	he
259	in
261	on
262	 the
263	er
265	at
266	 w
268	en
270	it
271	is
272	an
273	or
274	es
275	 b
276	ed
278	ing
279	 p
281	 an
284	 to
285	 m
286	 of
287	 in
290	 and
291	ic
292	as
293	le
294	 th
299	 n
301	st
302	 re
303	ve
306	ly
307	 be
312	id
314	 I
316	et
317	 A
318	 is
319	 on
321	am
326	 that
328	ig
329	 for
330	ac
332	ver
336	 st
338	's
339	 he
340	 it
343	ir
345	 you
351	 with
353	ter
355	 as
357	 (
359	ill
361	if
366	 "
367	 H
373	 was
379	 at
383	 The
388	um
389	 are
392	and
393	 or
395	est
402	 G
407	 not


In [37]:
# Skip the first 50 tokens; enc_sample starts at token 50 of the corpus.
enc_sample = enc_text[50:]

In [45]:
context_size = 4
# context_size is the length of the input, in tokens.
# A context size of 4 means the model looks at a sequence of up to 4 tokens
# to predict the next token in the sequence.
# x holds the first 4 tokens; y is x shifted by one position, so y[k] is the token that follows x[k].
x = enc_text[:context_size]
y = enc_text[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [40, 367, 2885, 1464]
y:      [367, 2885, 1464, 1807]


In [46]:
# Every prefix of the window is one example: context tokens --> the token that comes next.
for i in range(1, context_size + 1):
    context = enc_text[:i]
    desired = enc_text[i]

    print(context, "-->", desired)

[40] --> 367
[40, 367] --> 2885
[40, 367, 2885] --> 1464
[40, 367, 2885, 1464] --> 1807


Showing the decoded text instead of the token ids, for easier reading

In [47]:
# Context --> next-token pairs from the start of the corpus, decoded to text.
for i in range(1, context_size + 1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(tokenizer.decode(context), "-->", tokenizer.decode([desired]))

I -->  H
I H --> AD
I HAD -->  always
I HAD always -->  thought


The token values at the start of the text are difficult to follow, so from here on I use enc_sample, which skips the first 50 tokens.

In [49]:
context_size = 4
# Input/target windows taken from enc_sample instead of the start of the corpus.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]  # x shifted by one token

print(f"x: {x}")
print(f"    y: {y}")


x: [290, 4920, 2241, 287]
    y: [4920, 2241, 287, 257]


In [50]:
# Context --> next-token pairs taken from enc_sample, as token ids.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "-->", desired)

[290] --> 4920
[290, 4920] --> 2241
[290, 4920, 2241] --> 287
[290, 4920, 2241, 287] --> 257


In [51]:
# Context --> next-token pairs taken from enc_sample, decoded to text.
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "-->", tokenizer.decode([desired]))

 and -->  established
 and established -->  himself
 and established himself -->  in
 and established himself in -->  a


for that I used <h3>enc_sample = enc_text[50:]</h3>