'''
Steps preparing a dataset for a GPT like model

1) Load a free text from internet, usually they are pdf documents which can be converted to txt to ease processing
2) Tokenize the text, depending on your application you can choose to keep or leave certain separators, to lowercase or not
   your data, etc.., alternatives depending on the model exists in Hugging Face, Spacy etc.., for GPT like models tiktoken library is used
   to provide BPE (Byte-Pair Encoding) tokenization.
3) Create a dataloader in the library you choose, Keras NLP or Pytorch, include the tokenizer step
4) Create token embeddings including the position embeddings
'''

In [1]:
'''
Text Input Source: https://en.wikisource.org/wiki/The_Verdict

I have downloaded the text in pdf, the first step is to extract the text from it

'''

'\nText Input Source: https://en.wikisource.org/wiki/The_Verdict\n\nI have downloaded the text in pdf, the first step is to extract the text from it\n\n'

In [2]:
import pandas as pd
import glob, os, sys
from PyPDF2 import PdfReader
from tqdm import tqdm

# Directorio donde se encuentran los PDFs
pdf_directory = os.getcwd()
pdf_file = 'The_Verdict.pdf'
paginas_a_extraer = list(range(1,18))

with open(os.path.join(pdf_directory, pdf_file), "rb") as file:
    reader = PdfReader(file)
   
    text = ""
    for pagina in paginas_a_extraer: 
        if pagina < len(reader.pages): 
            pagina_pdf = reader.pages[pagina] 
            text += pagina_pdf.extract_text()
print(text[1:])

txt_path = os.path.join(pdf_directory, 'The_Verdict.txt')
# Escribir el texto extraído a un fichero .txt 
with open(txt_path, "w", encoding="utf-8") as txt_file:
      txt_file.write(text[1:])

I HAD always thought Jack Gisburn rather a cheap genius--
though a good fellow enough--so it was no great surprise to
me to hear that, in the height of his glory , he had dropped
his painting, married a rich widow , and established himself
in a villa on the Riviera. (Though I rather thought it would
have been Rome or Florence.)
"The height of his glory"--that was what the women called
it. I can hear Mrs. Gideon Thwing--his last Chicago sitter --
deploring his unaccountable abdication. "Of course it's
going to send the value of my picture 'way up; but I don't
think of that, Mr . Rickham--the loss to Arrt is all I think of."
The word, on Mrs. Thwing's lips, multiplied its _rs_ as
though they were reflected in an endless vista of mirrors.
And it was not only the Mrs. Thwings who mourned. Had
not the exquisite  Hermia Croft, at the last Grafton Gallery
show , stopped me before Gisburn's "Moon-dancers" to say,
with tears in her eyes: "We shall not look upon its like
again"?
Well!--even thro

In [3]:
with open("The-Verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:150])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of


In [4]:
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))
print(preprocessed[:30])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [5]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [6]:
vocab = {token:integer for integer,token in enumerate(all_words)}
print(len(vocab))

1130


In [7]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab.items()))

1132


In [8]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [9]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [10]:
simple_tokenizer = SimpleTokenizerV2(vocab)
print(simple_tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [11]:
print(simple_tokenizer.decode(simple_tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [12]:

import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tokenizer= tiktoken.get_encoding('gpt2')

tiktoken version: 0.8.0


In [13]:
text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In fact, the BPE tokenizer, which was used to train models such as GPT-2, GPT-3, and
the original model used in ChatGPT, has a total vocabulary size of 50,257, with
<|endoftext|> being assigned the largest token ID.

In [14]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [15]:
import numpy as np
np.max(enc_text)

50085

In [16]:
# Implement input-target pairs from the training dataset using Dataset and Dataloader classes
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        # Create sliding windows
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1: i + max_length + 1]
            #convert to tensors
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size = 4, max_length = 256, 
                         stride = 128, shuffle = True, drop_last = True, 
                         num_workers = 0):
    
    tokenizer= tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [17]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length,shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [18]:
#  Create Token Embeddings including positional embedding, which can be absolute or relative, GPT uses absolute
vocab_size = 50247
output_dim = 256

torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(token_embedding_layer.weight)
print(token_embedding_layer.weight.shape)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ...,  1.3337,  0.0771, -0.0522],
        [ 0.2386,  0.1411, -1.3354,  ..., -0.0315, -1.0640,  0.9417],
        [-1.3152, -0.0677, -0.1350,  ..., -0.3181, -1.3936,  0.5226],
        ...,
        [-1.1053,  0.5125,  0.4518,  ...,  1.9856, -2.5345, -1.1502],
        [-1.0242, -0.9165,  0.8011,  ...,  0.8245,  0.6877, -0.6184],
        [ 0.9898,  0.5546,  0.5102,  ...,  0.4626, -0.0853, -1.4430]],
       requires_grad=True)
torch.Size([50247, 256])


In [19]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [20]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [21]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [22]:
# An example from first embedding dimension to be used in Ch. 2
input_embeddings[:,:,0]

tensor([[ 1.4117,  0.9599,  1.8852,  2.5063],
        [ 1.3674,  1.0696,  0.3401,  1.3563],
        [ 1.5433,  0.0562,  1.0025,  1.7163],
        [ 2.8333,  4.8586,  0.3163,  1.8257],
        [ 1.1971,  0.2007,  2.7901,  5.2530],
        [ 1.8696,  2.4908,  1.9542,  2.2539],
        [ 0.1828, -0.1302, -1.8858,  0.5482],
        [-1.2937,  1.7937,  0.4240,  3.5123]], grad_fn=<SelectBackward0>)