In [1]:
import torch
import tiktoken

In [2]:
# Read the whole example story into memory as a single string.
# The encoding is passed explicitly so the decoded text (and therefore every
# token ID computed from it) does not depend on the platform's default encoding.
# NOTE(review): assumes the story file is saved as UTF-8 — confirm.
# TODO: this absolute path only exists on one machine; consider a path relative
# to the notebook so it survives Restart & Run All elsewhere.
with open("E:/Sewanee/Fall 25/CS290/DataMining/tokenization_example_story.txt", 'r', encoding='utf-8') as f:
    raw = f.read()

# Preview the first 50 characters of the text.
raw[:50]

'Ancient Egypt (Rawlinson)\nby George Rawlinson\nThe '

In [3]:
# Load tiktoken's 'gpt2' encoding; this tokenizer object is reused by the cells below.
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
# Turn the full text into a list of token IDs, then peek at the first 20 of them.
enc_text = tokenizer.encode(raw)
first_ids = enc_text[:20]
print(first_ids)

[44974, 6365, 357, 27369, 75, 7899, 8, 198, 1525, 4502, 16089, 75, 7899, 198, 464, 20876, 12, 42912, 438, 28348]


In [5]:
# Round-trip check: decode only the first two token IDs back into text.
print(tokenizer.decode(enc_text[:2]))

Ancient Egypt


In [6]:
# Total number of token IDs the tokenizer produced for the whole text.
len(enc_text)

3127

In [7]:
# Number of tokens in one input window.
context_size = 4

# The target window is the input window shifted one token to the right,
# so y[k] is the token that follows x[k] in the text.
x = enc_text[0:context_size]
y = enc_text[1:1 + context_size]

print("x: ", x)
print("y: ", y)



x:  [44974, 6365, 357, 27369]
y:  [6365, 357, 27369, 75]


In [8]:
# Show nine growing contexts next to the single token that comes right after each.
for i in range(1, 10):
    context_text = tokenizer.decode(enc_text[:i])
    next_token_text = tokenizer.decode([enc_text[i]])
    print("Input:", context_text, "Target:", next_token_text)

Input: Ancient Target:  Egypt
Input: Ancient Egypt Target:  (
Input: Ancient Egypt ( Target: Raw
Input: Ancient Egypt (Raw Target: l
Input: Ancient Egypt (Rawl Target: inson
Input: Ancient Egypt (Rawlinson Target: )
Input: Ancient Egypt (Rawlinson) Target: 

Input: Ancient Egypt (Rawlinson)
 Target: by
Input: Ancient Egypt (Rawlinson)
by Target:  George


In [9]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved over the token IDs in steps of ``stride``. For every window the
    target is the same window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-compute every input/target window.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
                a sequence of token IDs.
            max_length: Number of tokens in each window.
            stride: Step between the start positions of consecutive windows.

        Raises:
            AssertionError: If the text yields ``max_length`` tokens or fewer,
                i.e. not even one full input/target pair can be built.
        """
        # Encode everything up front; '<|endoftext|>' is allowed as a special token.
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Stop early enough that the shifted target window still fits in token_ids.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensors for window ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [10]:
def create_dataloader_v1(txt, batch_size = 4,
                         max_length = 256,
                         stride = 128,
                         shuffle = True,
                         drop_last = True,
                         num_workers = 0):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    Args:
        txt: Text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the start positions of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")

    # All windowing happens inside the dataset; the loader only batches it.
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [11]:
# Non-overlapping windows (stride == max_length), read in document order.
dataloader = create_dataloader_v1(
    raw,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Pull only the first batch of (inputs, targets) from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[44974,  6365,   357, 27369],
        [   75,  7899,     8,   198],
        [ 1525,  4502, 16089,    75],
        [ 7899,   198,   464, 20876],
        [   12, 42912,   438, 28348],
        [  316,   368,   290, 22912],
        [  198,  9693,  1797,  7801],
        [   42,  5357, 33700,   360]])

Targets:
 tensor([[ 6365,   357, 27369,    75],
        [ 7899,     8,   198,  1525],
        [ 4502, 16089,    75,  7899],
        [  198,   464, 20876,    12],
        [42912,   438, 28348,   316],
        [  368,   290, 22912,   198],
        [ 9693,  1797,  7801,    42],
        [ 5357, 33700,   360,    56]])


In [12]:
# Decode each row of the 'inputs' batch: tensor row -> plain list of IDs -> text
for row in inputs:
    row_ids = row.tolist()
    print(tokenizer.decode(row_ids))

Ancient Egypt (Raw
linson)

by George Rawl
inson
The Priest
-Kings--Pin
etem and Solomon

SHISHA
K AND HIS D


In [13]:
# We don't send the raw token IDs to the LLM for training; we associate a vector
# (a.k.a. tensor) with each ID and then train the LLM on those vectors.
# As a first example, create embedding vectors of length 3 for each token in a
# vocabulary of 6 tokens.
vocab_size = 6
output_dim = 3

# The embedding weights are randomly initialised; seed the RNG first so the
# printed values are the same on every Restart & Run All.
torch.manual_seed(123)
embedding = torch.nn.Embedding(vocab_size, output_dim)
print(embedding.weight)
# To get the weights as a tensor without 'requires_grad = True', use either:
#   1. embedding.weight.data
#   2. embedding.weight.detach()

Parameter containing:
tensor([[-0.2843,  0.7414, -0.9087],
        [ 0.0730, -0.9322, -1.2276],
        [ 2.5234, -1.8269,  2.1605],
        [ 0.2740,  0.4905,  0.0299],
        [-1.5012, -1.9234,  0.3544],
        [ 0.8923, -0.1979, -0.4603]], requires_grad=True)


In [14]:
# Take the embedding weight matrix as a tensor detached from autograd
# (no requires_grad), which is easier to index and print in the next cell.
A = embedding.weight.detach()

In [15]:
# Basic 2-D indexing on A (indices are 0-based):
print(A[0])     # the whole first row
print(A[:, 0])  # the whole first column
print(A[1, 2])  # a single entry: second row, third column

tensor([-0.2843,  0.7414, -0.9087])
tensor([-0.2843,  0.0730,  2.5234,  0.2740, -1.5012,  0.8923])
tensor(-1.2276)


In [16]:
# Tensor creation 101: build two 1-D tensors straight from Python lists
x, y = torch.tensor([1.2, 2.1]), torch.tensor([2.7, 1.5])

# One tensor per output line.
print(x, y, sep="\n")

tensor([1.2000, 2.1000])
tensor([2.7000, 1.5000])


In [None]:
# Dot product of x and y.
# NOTE(review): the original comment called this an "attention weight"; the code
# only computes a raw dot product — confirm the intended wording.
dot_xy = torch.dot(x, y)
print(dot_xy)

# The same number worked out by hand: multiply element-wise, then add up.
dot_by_hand = 1.2 * 2.7 + 2.1 * 1.5
print(dot_by_hand)

tensor(6.3900)
6.390000000000001
