In [1]:
import re
text= "hello, harsh. how are you? harsh"

#this is will split string from white space
reslt= re.split(r'(\s)',text)
reslt

['hello,', ' ', 'harsh.', ' ', 'how', ' ', 'are', ' ', 'you?', ' ', 'harsh']

In [2]:
#this will seperate comma and dot also
result= re.split(r'([,.]|\s)',text)
result

['hello',
 ',',
 '',
 ' ',
 'harsh',
 '.',
 '',
 ' ',
 'how',
 ' ',
 'are',
 ' ',
 'you?',
 ' ',
 'harsh']

REMOVING WHITESPACES OR NOT

When developing a simple tokenizer, whether we should encode whitespaces as separate characters or just remove them depends on our application and its requirements. Removing whitespaces reduces the memory and computing requirements. However, keeping whitespaces can be useful if we train models that are sensitive to the exact structure of the text (for example, Python code, which is sensitive to indentation and spacing). Here, we remove whitespaces for simplicity and brevity of the tokenized outputs. Later, we will switch to a tokenization scheme that includes whitespaces.

In [3]:
#exclude white space
result= [item for item in result if item.strip()]
result

['hello', ',', 'harsh', '.', 'how', 'are', 'you?', 'harsh']

In [4]:
#also include other characters
text = "This that is sample text. ? for learning --- ?"
result= re.split(r'([,.:"?]|--|\s)',text)
result= [item for item in result if item.strip()]
result

['This',
 'that',
 'is',
 'sample',
 'text',
 '.',
 '?',
 'for',
 'learning',
 '--',
 '-',
 '?']

In [5]:
uniq_word= sorted(set(result))
len(uniq_word)

11

In [6]:
vocab = {token:integer for integer,token in enumerate(uniq_word)}
vocab

{'-': 0,
 '--': 1,
 '.': 2,
 '?': 3,
 'This': 4,
 'for': 5,
 'is': 6,
 'learning': 7,
 'sample': 8,
 'text': 9,
 'that': 10}

In [7]:
class simple_tokenizerv1:
  def __init__(self,vocab):
    self.int_to_string={i:s for s,i in vocab.items()}
    self.string_to_int= vocab
  def encode(self,text):
    pre= re.split(r'([,.:"?]|--|\s)',text)
    pre= [item for item in pre if item.strip()]
    id=[self.string_to_int[s] for s in pre]
    return id
  def decode(self,ids):
    text= " ".join([self.int_to_string[i] for i in ids])
    # Replace spaces before the specified punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text



Additional special token:
If we building our llm from scratch in that case there is might be possibility that some sentence come which has word which is not present in our vocalbuary(mapping of uniq word with number) then additional special token comes into the picture.


In [8]:
result
uniq_word= sorted(list(set(result)))
uniq_word.extend(["<|endoftext|>","<|unknown|>"])

vocab= {token : integer for integer,token in enumerate(uniq_word)}
vocab

{'-': 0,
 '--': 1,
 '.': 2,
 '?': 3,
 'This': 4,
 'for': 5,
 'is': 6,
 'learning': 7,
 'sample': 8,
 'text': 9,
 'that': 10,
 '<|endoftext|>': 11,
 '<|unknown|>': 12}

In [9]:
class simple_tokenizerv2:
    def __init__(self,vocab):
      self.int_to_string={i:s for s,i in vocab.items()}
      self.string_to_int= vocab
    def encode(self,text):
      pre= re.split(r'([,.:"?]|--|\s)',text)
      pre= [item for item in pre if item.strip()]
      pre=[item if item in self.string_to_int else "<|unknown|>" for item in pre]
      id= [self.string_to_int[s] for s in pre]
      return id
    def decode(self,ids):
      text= " ".join([self.int_to_string[i] for i in ids])
      # Replace spaces before the specified punctuations
      text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
      return text


In [10]:
tokenizer = simple_tokenizerv2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

encoded= tokenizer.encode(text)
tokenizer.decode(encoded)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


'<|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|>? <|endoftext|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|>.'

In [11]:
import tiktoken

In [12]:
tokenizer= tiktoken.get_encoding("gpt2")

In [13]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# gpt-2 is also using <|endoftext|> to seperate two rext
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [14]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [15]:
with open("/content/drive/MyDrive/the-verdict.txt","r",encoding="utf-8") as f:
  raw_text =f.read()
enc_text =tokenizer.encode(raw_text)

print(len(enc_text))

5146


Executing the code above will return 5145, the total number of tokens in the training set, after applying the BPE tokenizer.

Next, we remove the first 50 tokens from the dataset for demonstration purposes as it results in a slightly more interesting text passage in the next steps:

In [16]:
enc_sample = enc_text[50:]

context_size = 4 #length of the input
#The context_size of 4 means that the model is trained to look at a sequence of 4 words (or tokens)
#to predict the next word in the sequence.
#The input x is the first 4 tokens [1, 2, 3, 4], and the target y is the next 4 tokens [2, 3, 4, 5]

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))


x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### stride means total size of input lenth that we have to pass into a model eg. if stride is 1 then input is 1 token at a time and output is 1 token,,  <length is total length is context length as shown above context length is 4>


-----------------------------------------------------------
Dataset is the parent from torch.utils.data.

Your class inherits its features.

PyTorch says:

"Fine, you inherited me — but you must implement __len__ and __getitem__."
--------------------------------------------------------------------------

In [17]:
from torch.utils.data import Dataset, DataLoader


class gptv1(Dataset):
    def __init__(self, text, tokenizer, length, stride):
        self.input_ids = []
        self.target_ids = []  # keep consistent naming

        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        for i in range(0, len(token_ids) - length, stride):
            input_chunk = token_ids[i:i + length]
            target_chunk = token_ids[i + 1: i + length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]



droplast because sometime you don't have same batch size some time last batch not equal to other batch so drop it </br>
num workers means numer of thread you want to use</br>
gpt use max_length=256</br>


In [18]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                          stride=128, shuffle=True, drop_last=True,
                          num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = gptv1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader


In [19]:
with open("/content/drive/MyDrive/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

Convert dataloader into a Python iterator to fetch the next entry via Python's built-in next() function


In [20]:
import torch

dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [21]:
import torch
input_ids = torch.tensor([2, 3, 5, 1])

# **concept of embedding**

torch.manual_seed(123) sets the random number generator seed so that PyTorch produces the same random values every time you run the code.

In your example, it makes the Embedding layer’s initial weights identical on each run — useful for reproducibility in experiments or debugging.



In [22]:
vocab_size = 6
output_dim = 3

torch.manual_seed(13)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(embedding_layer.weight)


Parameter containing:
tensor([[ 0.4372,  0.3701, -1.5918],
        [-0.0736, -2.5141,  0.1140],
        [ 0.9822,  0.0681, -2.1701],
        [ 0.4161,  1.0441, -0.5201],
        [ 0.8059,  1.0867,  0.2593],
        [ 1.8514, -1.7423, -0.7882]], requires_grad=True)


In [23]:
print(embedding_layer(torch.tensor([3])))

tensor([[ 0.4161,  1.0441, -0.5201]], grad_fn=<EmbeddingBackward0>)


In [24]:
#since our input is torch.tensor([2, 3, 5, 1]) hence it go through above lookup table(which has embedding layer) and print embedding for input ids
print(embedding_layer(input_ids))

tensor([[ 0.9822,  0.0681, -2.1701],
        [ 0.4161,  1.0441, -0.5201],
        [ 1.8514, -1.7423, -0.7882],
        [-0.0736, -2.5141,  0.1140]], grad_fn=<EmbeddingBackward0>)


**Positinal embedding**

In [25]:
vocab_size=50257 # same as gpt vocab size
output_dim= 256

#generate 50257*256 this size embedding
token_embedding_layer= torch.nn.Embedding(vocab_size,output_dim)#(row,column)
token_embedding_layer

Embedding(50257, 256)

In [26]:
max_length= 4 # context legth
dataloader=  create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)
data_itr= iter(dataloader)
inputs, targets = next(data_itr)

In [27]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [28]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)


torch.Size([8, 4, 256])


# **Positinal embedding layer**
<hr>
we create only one layer of position embedding and add with token embedding bacause we have only identify token postion this is first ,this is second,etc.

here row size is equal to contex length means we are creating (4*256) dimension position embedding output size and embedding layer output size must same

In [33]:
contex_length= max_length
pos_embedding_layer= torch.nn.Embedding(contex_length,output_dim)
pos_embedding_layer

Embedding(4, 256)

In [34]:
torch.arange(max_length)
#this is return 0 to maxlength-1 tensor which is useful for creat

tensor([0, 1, 2, 3])

In [30]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings)

tensor([[-0.1528, -0.4409,  0.3364,  ..., -0.1452,  1.2083, -0.2717],
        [-0.0043,  0.2342,  0.3640,  ...,  0.1749,  0.3848,  0.5438],
        [-0.2680, -0.0831, -0.5203,  ...,  1.0252, -0.4875,  1.2573],
        [-1.9579,  1.4795,  0.2991,  ..., -0.9544, -1.2395, -0.2830]],
       grad_fn=<EmbeddingBackward0>)


In [31]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
