# Working with Data
- downloading data
- loading data
- preprocessing data

In [1]:
import os
import re
import urllib.request
from importlib.metadata import version
from dataclasses import dataclass

try:
    import tiktoken
except ImportError:
    !pip install tiktoken
    import tiktoken

try:
    import torch
except ImportError:
    !pip install torch
    import torch

print("torch version:", torch.__version__)
print("tiktoken version:", version("tiktoken"))

torch version: 2.1.2+cpu
tiktoken version: 0.5.2


In [2]:
from utils import tokenizers, dataloaders

In [3]:
# global parameters
@dataclass
class Params:
    DATA_ROOT = r"./data/"
    BATCH_SIZE=8
    MAX_LENGTH=4
    STRIDE=1
    SHUFFLE=False
    VOCAB_SIZE=50257
    OUTPUT_DIM=256



my_param = Params()

## downloading data

In [4]:
if not os.path.exists(os.path.join(my_param.DATA_ROOT, "the-verdict.txt")):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = os.path.join(my_param.DATA_ROOT, "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

## loading data

In [5]:
with open(os.path.join(my_param.DATA_ROOT, "the-verdict.txt"), "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## data preprocessing
### tokenization

- tokenizing version1

In [6]:
# split text
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

# get unique tokens and create vocab dictionary
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab = {token:integer for integer,token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 5:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)


In [7]:
# using vocab to indexing
tokenizer = tokenizers.SimpleTokenizerV1(vocab)

text = """
        "It's the last he painted, you know," 
        Mrs. Gisburn said with pardonable pride.
"""

ids = tokenizer.encode(text)
print(f"encoding the above text >>> {ids}")
tks = tokenizer.decode(ids)
print(f"decoding the encoded idx >>> {tks}")

encoding the above text >>> [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
decoding the encoded idx >>> " It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


- tokenizing version2

In [8]:
# add special tokens 
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [9]:
tokenizer = tokenizers.SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(f"combined the above two txts >> {text}")
print(f"encoding the txt >>> {tokenizer.encode(text)}")
print(f"then decoding the above index >>> {tokenizer.decode(tokenizer.encode(text))}")

combined the above two txts >> Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
encoding the txt >>> [1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
then decoding the above index >>> <|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


- tokenizing version3: GPT2-BPE

In [10]:
tokenizer = tiktoken.get_encoding("gpt2")

text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(f"encoding text >>> {integers}")

strings = tokenizer.decode(integers)

print(f"decoding the above index >>> {strings}")

encoding text >>> [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
decoding the above index >>> Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


- tokenizing version3: GPT2-BPE from scatch

In [11]:
orig_tokenizer = tokenizers.get_encoder(model_name="gpt2_model", models_dir=".")

text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = orig_tokenizer.encode(text)

print(f"encoding text >>> {integers}")

strings = orig_tokenizer.decode(integers)

print(f"decoding the above index >>> {strings}")

encoding text >>> [15496, 11, 466, 345, 588, 8887, 30, 1279, 91, 437, 1659, 5239, 91, 29, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
decoding the above index >>> Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


### create dataset and dataloader

In [12]:
dataloader = dataloaders.create_dataloader_v1(
    raw_text, 
    batch_size=my_param.BATCH_SIZE, 
    max_length=my_param.MAX_LENGTH, 
    stride=my_param.STRIDE, 
    shuffle=my_param.SHUFFLE
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(f"first batch >>> {first_batch}")
second_batch = next(data_iter)
print(f"second_batch>>> {second_batch}")


first batch >>> [tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]]), tensor([[  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])]
second_batch>>> [tensor([[10899,  2138,   257,  7026],
        [ 2138,   257,  7026, 15632],
        [  257,  7026, 15632,   438],
        [ 7026, 15632,   438,  2016],
        [15632,   438,  2016,   257],
        [  438,  2016,   257,   922],
        [ 2016,   257,   922,  5891],
        [  257,   922,  5891,  1576]]), tensor([[ 2138,   257,  7026, 15632],
        [ 

### Embedding
- token embedding
- positional encoding

In [13]:
torch.manual_seed(123)

# loading data
dataloader = dataloaders.create_dataloader_v1(
    raw_text, 
    batch_size=my_param.BATCH_SIZE, 
    max_length=my_param.MAX_LENGTH,
    stride=my_param.MAX_LENGTH, 
    shuffle=my_param.SHUFFLE
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

# create token embedding layer
token_embedding_layer = torch.nn.Embedding(my_param.VOCAB_SIZE, my_param.OUTPUT_DIM)
token_embeddings = token_embedding_layer(inputs)

print(f"token embedding layer weight >>> {token_embedding_layer.weight}")
print(f"after token embedding layer >>> {token_embeddings}")
print(f"token embedding shape >>> {token_embeddings.shape}")

# create positional embedding layer
context_length = my_param.MAX_LENGTH
# GPT-2 uses absolute position embeddings
pos_embedding_layer = torch.nn.Embedding(context_length, my_param.OUTPUT_DIM)
pos_embeddings = pos_embedding_layer(torch.arange(my_param.MAX_LENGTH))
print(f"position embedding shape >>> {pos_embeddings.shape}")

input_embeddings = token_embeddings + pos_embeddings
print(f"position embedding and token embedding shape >>> {input_embeddings.shape}")

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


token embedding layer weight >>> Parameter containing:
tensor([[-0.3035, -0.5880,  0.3486,  ..., -0.0522, -1.0565,  1.1510],
        [-1.3354, -2.9340,  0.1141,  ...,  0.9417, -0.3591,  0.0168],
        [-0.1350, -0.5183,  0.2326,  ...,  0.5226,  0.5430,  1.8613],
        ...,
        [-1.1628,  1.1698,  1.0007,  ...,  0.4479,  0.7890, -0.2578],
        [ 1.1263,  1.2176, -1.4959,  ...,  0.3331,  0.3341, -0.2369],
        [ 0.7203, -0.1080,  1.0014,  ...,  0.3006,  1.4320,  0.1817]],
       requires_grad=True)
after token embedding layer >>> tensor([[[ 0.1070, -0.1428, -0.3014,  ..., -2.3238,  0.0778,  0.6690],
         [ 0.9873,  0.1164, -0.5726,  ...,  0.3223,  1.2060,  0.2207],
         [-1.2633, -0.3237,  0.8158,  ...,  0.1343,  0.8676, -2.5054],
         [-0.6525, -0.5808,  1.1188,  ...,  0.1531, -1.1201, -0.5092]],

        [[-1.2217,  1.8719,  1.4650,  ..., -1.4138,  0.2478,  1.5313],
         [ 1.0885, -0.4509,  0.1388,  ...,  0.7225,  0.1916,  0.1005],
         [ 1.2222, -0.06