In [1]:
#loading data

with open("the-verdict.txt","r",encoding="utf-8") as f:
    raw_text=f.read()

print("Total Number of characters : ",len(raw_text))
print(raw_text[:99])

Total Number of characters :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
#split the data using regular expression(re) library
import re

text="Hello, world. This, is a test."
result=re.split(r'(\s)',text)
#split the sentence with whitespaces (\s)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:
result=re.split(r'([,.]|\s)',text)
#split with whitespaces, commas,dots
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [4]:
result=[item for item in result if item.strip() ]
#removing whitespaces 
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [7]:
text="Hello, world. Is this--  a test?"
result=re.split(r'([,.:;?_!"()\']|--|\s)',text)
result=[item for item in result if item.strip() ]
#removing whitespaces 
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [32]:
#1.removing unwanted symbols from data
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed=[item for item in preprocessed if item.strip() ]
#removing whitespaces 
print(preprocessed[:30])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690


In [None]:
#2.Tokenizing id for each unique token
#sort the raw data in alphabetical order and assign each unique token(word) a unique id

all_words=sorted(set(preprocessed))
vocab_size=len(all_words)

#used set to get only unique words and avoid duplicate words
print(vocab_size)

#creating a dictionary with token(unique word) and its Id. token(key),Id (value)
vocab={token:integer for integer,token in enumerate(all_words)}
#this process is also called encoding
for i,item in enumerate(vocab.items()):
    print(item)
    if i>=50:
        break


1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [36]:
#Tokenizer Class
class SimpleTokeinzer:
    #init method storing both token to its id and token id to their token dictionaries for whole dataset

    #vocab is a mapping of each unique token(word) to its token id
    def __init__(self,vocab):
        self.str_to_int=vocab
        #token mapped to its id(encoding)
        self.int_to_str={i:s for s,i in vocab.items()}
        #Token ids mapped to their token (decoding)

    #encode method returns token Ids of given input text (small portion from whole dataset) using str_to_int dictionary(token to its id map)
    def encode(self,text):

        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        preprocessed=[item.strip() for item in preprocessed if item.strip()]

        #getting ids of given input text
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    
    #decode method returns words(token) of given input token ids using int_to_str dictionary(token id to its token dictionary)
    def decode(self,ids):

        text=" ".join([self.int_to_str[i] for i in ids])
        #replace spaces before specified punctuations
        text = re.sub(r'\s+([,.?!\"()\'])', r'\1', text)
        return text




        


In [37]:
#Small example

tokenizer=SimpleTokeinzer(vocab)

text="""Well!--even through the prism of Hermia's tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him--it was fitting that they should mourn him. \
    Among his own sex fewer regrets were"""

ids=tokenizer.encode(text)
print(ids)

tokenizer.decode(ids)

[108, 0, 6, 399, 1007, 988, 795, 722, 50, 2, 850, 976, 53, 436, 117, 1016, 418, 988, 420, 1108, 395, 7, 80, 57, 38, 0, 93, 1112, 514, 654, 546, 6, 585, 1077, 444, 987, 994, 879, 687, 546, 7, 13, 549, 742, 872, 438, 829, 1088]


"Well! -- even through the prism of Hermia' s tears I felt able to face the fact with equanimity. Poor Jack Gisburn! The women had made him -- it was fitting that they should mourn him. Among his own sex fewer regrets were"

In [27]:
#special context tokens(unknown-<|unk|>,end of text-<|endoftext|>)
all_tokens=sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab={token:integer for integer,token in enumerate(all_tokens)}

len(vocab.items())

1132

In [38]:
#Tokenizer Class
class SimpleTokeinzerV2:
    #init method storing both token to its id and token id to their token dictionaries for whole dataset

    #vocab is a mapping of each unique token(word) to its token id
    def __init__(self,vocab):
        self.str_to_int=vocab
        #token mapped to its id(encoding)
        self.int_to_str={i:s for s,i in vocab.items()}
        #Token ids mapped to their token (decoding)

    #encode method returns token Ids of given input text (small portion from whole dataset) using str_to_int dictionary(token to its id map)
    def encode(self,text):

        preprocessed=re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed=[item for item in preprocessed if item.strip()]

        #if input text(token) is present in dictionary take that token's id else take unknown token
        preprocessed=[
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        #getting ids of given input text
        ids=[self.str_to_int[s] for s in preprocessed]
        return ids
    
    #decode method returns words(token) of given input token ids using int_to_str dictionary(token id to its token dictionary)
    def decode(self,ids):

        text=" ".join([self.int_to_str[i] for i in ids])
        #replace spaces before specified punctuations
        text = re.sub(r'\s+([,.?!\"()\'])', r'\1', text)
        return text




        


In [None]:
tokenizer=SimpleTokeinzerV2(vocab)

#2 separate text sources
text1="Hello, do you like tea?"
text2="In the sunlit terrcaes of the palace."

# adding end of text token to end between every text source end
text="<|endoftext|>".join((text1,text2))
print(text)


Hello, do you like tea?<|endoftext|>In the sunlit terrcaes of the palace.


In [30]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1131, 988, 956, 1131, 722, 988, 1131, 7]

In [40]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|unk|> the sunlit <|unk|> of the <|unk|>.'

In [45]:
#Byte-Pair-Encoding using tiktoken 
%pip install git+https://github.com/openai/tiktoken.git --no-cache-dir


Note: you may need to restart the kernel to use updated packages.


c:\Users\ffmou\AppData\Local\spyder-6\envs\spyder-runtime\python.exe: No module named pip


In [1]:
%pip install setuptools_rust
%pip install git+https://github.com/openai/tiktoken.git --no-cache-dir


Collecting setuptools_rust
  Downloading setuptools_rust-1.12.0-py3-none-any.whl.metadata (9.6 kB)
Collecting semantic_version<3,>=2.8.2 (from setuptools_rust)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Downloading setuptools_rust-1.12.0-py3-none-any.whl (28 kB)
Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: semantic_version, setuptools_rust

   ---------------------------------------- 2/2 [setuptools_rust]

Successfully installed semantic_version-2.10.0 setuptools_rust-1.12.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting git+https://github.com/openai/tiktoken.git
  Cloning https://github.com/openai/tiktoken.git to c:\users\ffmou\appdata\local\temp\pip-req-build-4s44fsnc
  Resolved https://github.com/openai/tiktoken.git to commit 97e49cbadd500b5cc9dbb51a486f0b42e6701bee
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting regex>=2022.1.18 (from tiktoken==0.12.0)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl (277 kB)
Building wheels for collected packages: tiktoken
  Building wheel for tiktoken (pyproject.toml): started
  Building wheel for tiktoken (pyproject.toml): finished with status 'error'
Failed to build tiktoken
Note: you m

  Running command git clone --filter=blob:none --quiet https://github.com/openai/tiktoken.git 'C:\Users\ffmou\AppData\Local\Temp\pip-req-build-4s44fsnc'
  error: subprocess-exited-with-error
  
  × Building wheel for tiktoken (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [50 lines of output]
      !!
      
              ********************************************************************************
              Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0).
      
              By 2026-Feb-18, you need to update your project and remove deprecated calls
              or your builds will no longer be supported.
      
              See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
              ********************************************************************************
      
      !!
        cor

In [1]:

import importlib
import tiktoken

print("tiktoken version:",importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [None]:
tokenizer=tiktoken.get_encoding("gpt2")
#byte-pair encoding tokenizer used in gpt-2
#gpt-2 tokenizer already have a vocabulary size of above 50 thousands(vocabulary means dictionary mapping token to its id,id to its token)

In [5]:
text=(
    "Hello, do you like tea? <|endoftext|> In the sunlit terrcaes of the palace."
    "of someunknownPlace."
)

#encoding
ids=tokenizer.encode(text,allowed_special={"<|endoftext|>"})

print(ids)

#decoding
text=tokenizer.decode(ids)
print(text)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 1059, 6015, 64, 274, 286, 262, 20562, 13, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terrcaes of the palace.of someunknownPlace.


In [6]:
#example for how BPE tokenizer deals with unfamiliar words(words not present in dataset)
integers=tokenizer.encode("Akwirw ier")
print(integers)

strings=tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier
