In [1]:
with open("/content/theverdict.txt","r",encoding="utf-8") as f:
  raw_text= f.read()
print(len(raw_text))
print(raw_text[:400])

20406
The verdict
Edith wharton
I had always thought Jack Gisburn rather a cheap genius--though a good fellow
enough--so it was no great surprise to me to hear that, in the height of his glory, he
had dropped his painting, married a rich widow, and established himself in a villa
on the Riviera. (Though I rather thought it would have been Rome or Florence.)
"The height of his glory"--that was what the wo


In [2]:
import re

In [3]:
preprocessed = re.split(r'([,.:;"()_!\']|--|\s)',raw_text)
preprocessed = [ele for ele in preprocessed if ele.strip()]

In [4]:
preprocessed[:10]

['The',
 'verdict',
 'Edith',
 'wharton',
 'I',
 'had',
 'always',
 'thought',
 'Jack',
 'Gisburn']

In [5]:
len(preprocessed)

4642

Creating token ID's

In [6]:
all_words = sorted(set(preprocessed))
vocab_size=len(all_words)
print(vocab_size)

1166


In [7]:
vocab = {value:count for count,value in enumerate(all_words)}

In [8]:
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'AM': 12,
 'Ah': 13,
 'Among': 14,
 'And': 15,
 'Are': 16,
 'Arrt': 17,
 'As': 18,
 'At': 19,
 'Be': 20,
 'Begin': 21,
 'Burlington': 22,
 'But': 23,
 'By': 24,
 'Carlo': 25,
 'Chicago': 26,
 'Claude': 27,
 'Come': 28,
 'Croft': 29,
 'Destroyed': 30,
 'Devonshire': 31,
 'Don': 32,
 'Dubarry': 33,
 'Edith': 34,
 'Emperors': 35,
 'FELT': 36,
 'Florence': 37,
 'For': 38,
 'Gallery': 39,
 'Gideon': 40,
 'Gisburn': 41,
 'Gisburns': 42,
 'Grafton': 43,
 'Greek': 44,
 'Grindle': 45,
 'Grindles': 46,
 'HAD': 47,
 'HAS': 48,
 'HAVE': 49,
 'Had': 50,
 'Hang': 51,
 'Has': 52,
 'He': 53,
 'Her': 54,
 'Hermia': 55,
 'His': 56,
 'How': 57,
 'I': 58,
 'If': 59,
 'In': 60,
 'It': 61,
 'Jack': 62,
 'Jove': 63,
 'Just': 64,
 'KNOWN': 65,
 'Lord': 66,
 'MINE': 67,
 'Made': 68,
 'Miss': 69,
 'Money': 70,
 'Monte': 71,
 'Moon-': 72,
 'Mr': 73,
 'Mrs': 74,
 'My': 75,
 'NEVER': 76,
 'NOT': 77,
 'Ne

In [9]:
class SimpleTokenizerV1:
   def __init__(self,vocab) -> None:
      self.str_to_int = vocab
      self.int_to_str = {i:s for s,i in vocab.items()}

   def encoder(self,text):
    preprocessed = re.split(r'([,.:;"()_!\']|--|\s)',text)
    preprocessed = [ele for ele in preprocessed if ele.strip()]
    ids= [self.str_to_int[s] for s in preprocessed]
    return ids

   def decode(self,ids):
    txt = " ".join([self.int_to_str[i] for i in ids])
    txt= re.sub(r'\s([,.:;"()_!\'])',r'\1',txt)
    return txt


In [10]:
tokenizer = SimpleTokenizerV1(vocab)
txt ="""
    Among his own sex fewer regrets were heard, and in his own
    trade hardly a murmur
    """
token_ids=tokenizer.encoder(txt)
print(token_ids)

[14, 569, 770, 903, 456, 860, 1122, 556, 5, 171, 590, 569, 770, 1059, 549, 129, 720]


In [11]:
tokenizer.decode(token_ids)

'Among his own sex fewer regrets were heard, and in his own trade hardly a murmur'

In [12]:
# what id the token is not in the vocab??
tokenizer = SimpleTokenizerV1(vocab)
txt ="""
    Hello!!
    """
token_ids=tokenizer.encoder(txt)
print(token_ids)

KeyError: 'Hello'

In [13]:
# use special context tokens
# like <|unk|> and <|endoftext|> when appending text from multiple sources
all_words = sorted(set(preprocessed))
all_words.extend(['<|endoftext|>','<|unk|>'])
vocab = {value:count for count,value in enumerate(all_words)}

In [14]:
len(vocab)

1168

In [15]:
class SimpleTokenizerV2:
   def __init__(self,vocab) -> None:
      self.str_to_int = vocab
      self.int_to_str = {i:s for s,i in vocab.items()}

   def encoder(self,text):
    preprocessed = re.split(r'([,.:;"()_!?\']|--|\s)',text)
    preprocessed = [ele for ele in preprocessed if ele.strip()]
    preprocessed = [ele if ele in self.str_to_int else "<|unk|>" for ele in preprocessed]
    ids= [self.str_to_int[s] for s in preprocessed]
    return ids

   def decode(self,ids):
    txt = " ".join([self.int_to_str[i] for i in ids])
    txt= re.sub(r'\s([,.:;"()_?!\'])',r'\1',txt)
    return txt


In [16]:
text1 = "hello! do you like tea?"
text2 = "would you like to stay in my palace?"
final_text =" <|endoftext|> ".join((text1,text2))
print(final_text)

hello! do you like tea? <|endoftext|> would you like to stay in my palace?


In [17]:
tokenizer2 = SimpleTokenizerV2(vocab)
ids=tokenizer2.encoder(final_text)
print(ids)

[1167, 0, 373, 1161, 652, 1006, 10, 1166, 1155, 1161, 652, 1048, 956, 590, 723, 1167, 10]


In [18]:
tokenizer2.decode(ids)

'<|unk|>! do you like tea? <|endoftext|> would you like to stay in my <|unk|>?'

BYTE PAIR ENCODING !!!

In [19]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [20]:
import tiktoken

In [21]:
tokenizer = tiktoken.get_encoding('gpt2')

In [22]:
text = """hello will you have tea? <|endoftext|> what are you doing right now?   grabacoffee?"""
ids=tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(ids)

[31373, 481, 345, 423, 8887, 30, 220, 50256, 644, 389, 345, 1804, 826, 783, 30, 220, 220, 5552, 330, 2364, 1453, 30]


In [23]:
txt= tokenizer.decode(ids)
txt

'hello will you have tea? <|endoftext|> what are you doing right now?   grabacoffee?'

In [24]:
text = """你好，今天天气怎么样?"""
ids=tokenizer.encode(text)
print(ids)

[19526, 254, 25001, 121, 171, 120, 234, 20015, 232, 25465, 25465, 36365, 242, 45250, 236, 20046, 230, 43718, 115, 30]


In [25]:
txt=tokenizer.decode(ids)
txt

'你好，今天天气怎么样?'

In [26]:
text = """నీకు ఎలా ఉన్నావు?"""
ids=tokenizer.encode(text)
print(ids)

[156, 108, 101, 156, 109, 222, 156, 108, 243, 156, 109, 223, 220, 156, 108, 236, 156, 108, 110, 156, 108, 122, 220, 156, 108, 231, 156, 108, 101, 156, 109, 235, 156, 108, 101, 156, 108, 122, 156, 108, 113, 156, 109, 223, 30]


Dataset and Dataloader

In [27]:
from torch.utils.data import Dataset, DataLoader
import torch

In [28]:
class GPTDatasetV1(Dataset):
      def __init__(self,txt,tokenizer,maxlength,stride):
          self.input_ids=[]
          self.output_ids=[]
          ids= tokenizer.encode(txt,allowed_special={'<|endoftext|>'})
          for i in range(0,len(ids)-maxlength,stride):
              self.input_ids.append(torch.tensor(ids[i:i+maxlength]))
              self.output_ids.append(torch.tensor(ids[i+1:i+maxlength+1]))

      def __len__(self):
        return len(self.input_ids)

      def __getitem__(self,idx):
        return self.input_ids[idx],self.output_ids[idx]

In [29]:
def create_dataloader_v1(txt,batch_size=4,maxlength=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
  tokenizer=tiktoken.get_encoding("gpt2")
  dataset=GPTDatasetV1(txt,tokenizer,maxlength,stride)
  dataloader= DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )
  return dataloader

In [30]:
dataloader= create_dataloader_v1(raw_text,batch_size=1,maxlength=4,stride=2,shuffle=False)
data_iter= iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  464, 15593,   198,  7407]]), tensor([[15593,   198,  7407,   342]])]


In [31]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 198, 7407,  342,  348]]), tensor([[ 7407,   342,   348, 41328]])]


In [32]:
dataloader= create_dataloader_v1(raw_text,batch_size=8,maxlength=4,stride=2,shuffle=False)
data_iter= iter(dataloader)
input,target = next(data_iter)
print(input)
print(target)

tensor([[  464, 15593,   198,  7407],
        [  198,  7407,   342,   348],
        [  342,   348, 41328,   198],
        [41328,   198,    40,   550],
        [   40,   550,  1464,  1807],
        [ 1464,  1807,  3619,   402],
        [ 3619,   402,   271, 10899],
        [  271, 10899,  2138,   257]])
tensor([[15593,   198,  7407,   342],
        [ 7407,   342,   348, 41328],
        [  348, 41328,   198,    40],
        [  198,    40,   550,  1464],
        [  550,  1464,  1807,  3619],
        [ 1807,  3619,   402,   271],
        [  402,   271, 10899,  2138],
        [10899,  2138,   257,  7026]])


Token Embedding

In [33]:
import torch

In [34]:
vocab_size=6
num_dimensions=3
torch.manual_seed(123)
embedding_layer=torch.nn.Embedding(vocab_size,num_dimensions)

In [35]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [36]:
embedding_layer(torch.tensor([1,4]))

tensor([[ 0.9178,  1.5810,  1.3010],
        [-1.1589,  0.3255, -0.6315]], grad_fn=<EmbeddingBackward0>)

In [37]:
vocab_size=50256
num_dimensions=256
torch.manual_seed(123)
token_embedding_layer=torch.nn.Embedding(vocab_size,num_dimensions)

In [38]:
token_embeddings=token_embedding_layer(input)

In [39]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [40]:
context_length= maxlength=4
num_dimensions=256


In [41]:
pos_encoding= torch.nn.Embedding(context_length,num_dimensions)

In [42]:
positional_encodings = pos_encoding(torch.arange(maxlength))

In [43]:
positional_encodings.shape

torch.Size([4, 256])

In [44]:
final_embeddings = token_embeddings+positional_encodings

In [45]:
final_embeddings.shape

torch.Size([8, 4, 256])