In [19]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [20]:
#opening text files

# Read each source text fully into memory, decoding the files as UTF-8.
with open('the-verdict.txt', 'r', encoding = 'utf-8') as f:
    raw_text1 = f.read()

with open('HarryPotter.txt', 'r', encoding = 'utf-8') as f:
    raw_text2 = f.read()

# Combined corpus used for tokenization.
# NOTE(review): the two texts are joined with no separator, so the end of the
# first file runs directly into the start of the second -- confirm this is intended.
raw_text = raw_text1 + raw_text2

In [21]:
#byte-pair encoder

# Load tiktoken's 'gpt2' encoding and encode the whole corpus in a single call.
tokenizer = tiktoken.get_encoding('gpt2')
token_ids = tokenizer.encode(raw_text)
# Decode the IDs back to text; `decoded` is not referenced again in the cells shown here.
decoded = tokenizer.decode(token_ids)
# Vocabulary size as reported by the tokenizer.
vocab_size = tokenizer.n_vocab

print(vocab_size)
print("Total tokens in dataset:", len(token_ids))

50257
Total tokens in dataset: 1561606


In [27]:
#input-target pairs

# Context window length in tokens; later cells use it to size the positional
# embedding table and the position index tensor.
context_size = 256
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    Every sample is a window of ``max_length`` consecutive token IDs. Its
    target is the same window shifted one position to the right, so the
    target at index ``k`` is the token that follows the input at index ``k``.

    Args:
        token_ids: Sequence of integer token IDs (e.g. a list) to slice.
        max_length: Number of tokens in each input/target window.
        stride: Distance between the start positions of consecutive windows.
    """

    def __init__(self, token_ids, max_length, stride):
        # Start positions stop early enough that the shifted target window
        # still fits inside token_ids.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows extracted from the token sequence."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        sample_input = self.input_ids[idx]
        sample_target = self.target_ids[idx]
        return sample_input, sample_target

In [28]:
# Standalone dataset over the full token stream. The window length comes from
# context_size instead of a repeated literal so the two cannot drift apart.
# NOTE(review): create_dataloader_v1 builds its own dataset, and this instance
# is not referenced by the other cells shown here.
dataset = GPTDatasetV1(token_ids, max_length=context_size, stride=128)

In [29]:
def create_dataloader_v1(token_ids, batch_size=4, context_size=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding windows of ``token_ids``.

    Args:
        token_ids: Token IDs handed to ``GPTDatasetV1``.
        batch_size: Forwarded to ``DataLoader`` as ``batch_size``.
        context_size: Window length, passed to ``GPTDatasetV1`` as ``max_length``.
        stride: Step between window starts, passed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader`` as ``shuffle``.
        drop_last: Forwarded to ``DataLoader`` as ``drop_last``.
        num_workers: Forwarded to ``DataLoader`` as ``num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    windowed_dataset = GPTDatasetV1(token_ids, max_length=context_size, stride=stride)
    return DataLoader(windowed_dataset, **loader_options)

In [30]:
# Initializing the dataloader. The window length is passed explicitly so the
# batches always match the context_size used for the positional embedding table.
dataloader = create_dataloader_v1(token_ids, context_size=context_size)

# Initializing the token embedding: one learnable vector of size embedding_dim
# per vocabulary entry.
embedding_dim = 256
# Take the vocabulary size from the tokenizer instead of repeating the literal
# 50257, so the embedding table cannot fall out of sync with the encoder.
vocab_size = tokenizer.n_vocab
token_embedding = nn.Embedding(vocab_size, embedding_dim)

In [31]:
# Positional embedding: one learnable vector for every position in the context window.

pos_embedding_layer = nn.Embedding(num_embeddings=context_size, embedding_dim=embedding_dim)
positions = torch.arange(0, context_size)  # position indices 0 .. context_size - 1
pos_embeddings = pos_embedding_layer(positions)

# Expected shape: (context_size, embedding_dim), i.e. torch.Size([256, 256]) here.
print(pos_embeddings.shape)

torch.Size([256, 256])


In [39]:
#input embedding

# Build the model input for the first batch only (the loop exits after one
# iteration): token embeddings plus positional embeddings.
for input_ids, target_ids in dataloader:
    token_embeds = token_embedding(input_ids)  # (batch, seq_len, embedding_dim)

    # Size the position indices from the batch itself rather than the global
    # context_size, so a dataloader built with a different window length cannot
    # cause a shape mismatch in expand_as below. The indices are created
    # directly on the batch's device instead of on CPU followed by a copy.
    seq_len = input_ids.shape[-1]
    positions = torch.arange(seq_len, device=input_ids.device)
    pos_embeds = pos_embedding_layer(positions)  # (seq_len, embedding_dim)
    # Add a leading batch axis and view it at the batch's shape.
    pos_embeds = pos_embeds.unsqueeze(0).expand_as(token_embeds)

    final_input = token_embeds + pos_embeds
    break