In [1]:
import tiktoken

# Relative path to the plain-text copy of "Pride and Prejudice" used as the corpus.
file_path = './data/Jane-Austen-Pride_and_Prejudice-pg1342.txt'

# Read the whole file into memory as one UTF-8 string; `raw_text` is what the
# later cells tokenize.
with open(file_path, "r", encoding='utf-8') as f:
    raw_text = f.read()

# Preview the first 300 characters as a quick check that the file loaded.
print(raw_text[:300])

Title: Pride and Prejudice

Author: Jane Austen

Release date: June 1, 1998 [eBook #1342]
                Most recently updated: October 29, 2024

Language: English

Credits: Chuck Greif and the Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images available


In [2]:
# Load the GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Tokenize the full text and discard the first 100 token ids in one step.
# NOTE(review): the offset of 100 is presumably meant to skip part of the file
# header -- confirm the intended cut-off.
encoded_data = tokenizer.encode(raw_text)[100:]

# Number of token ids that remain after the cut.
print(len(encoded_data))



196029


In [3]:
# Number of token ids in one example window.
context_size = 13

# x is the first `context_size` ids; y is the same window shifted one position
# to the right, so y[k] is the token that follows x[k].
x, y = encoded_data[:context_size], encoded_data[1:context_size + 1]

# The extra padding after "y:" is part of the original output format.
print(f"x: {x}")
print(f"y:      {y}")

x: [3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412, 39453, 4810, 14114]
y:      [3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412, 39453, 4810, 14114, 5357]


In [4]:
# Decode both id windows back to text so the one-token shift is readable.
for token_ids in (x, y):
    print(tokenizer.decode(token_ids))

 OF THE PROJECT GUTENBERG EBOOK PRIDE
 THE PROJECT GUTENBERG EBOOK PRIDE AND


In [5]:
# Print every next-token pair inside the first window: a context made of the
# first i token ids, and the single id that comes right after it.
for i in range(1, context_size + 1):
    context, desired = encoded_data[:i], encoded_data[i]
    print(context, " --------> ", desired)
    

[3963]  -------->  3336
[3963, 3336]  -------->  21965
[3963, 3336, 21965]  -------->  23680
[3963, 3336, 21965, 23680]  -------->  402
[3963, 3336, 21965, 23680, 402]  -------->  3843
[3963, 3336, 21965, 23680, 402, 3843]  -------->  1677
[3963, 3336, 21965, 23680, 402, 3843, 1677]  -------->  13246
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246]  -------->  38
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38]  -------->  412
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412]  -------->  39453
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412, 39453]  -------->  4810
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412, 39453, 4810]  -------->  14114
[3963, 3336, 21965, 23680, 402, 3843, 1677, 13246, 38, 412, 39453, 4810, 14114]  -------->  5357


In [6]:
# Same context/target pairs as token ids, but decoded to text for readability.
for i in range(1, context_size + 1):
    context, desired = encoded_data[:i], encoded_data[i]
    # The single target id is wrapped in a list before being passed to decode().
    print(tokenizer.decode(context), " --------> ", tokenizer.decode([desired]))

 OF  -------->   THE
 OF THE  -------->   PRO
 OF THE PRO  -------->  JECT
 OF THE PROJECT  -------->   G
 OF THE PROJECT G  -------->  UT
 OF THE PROJECT GUT  -------->  EN
 OF THE PROJECT GUTEN  -------->  BER
 OF THE PROJECT GUTENBER  -------->  G
 OF THE PROJECT GUTENBERG  -------->   E
 OF THE PROJECT GUTENBERG E  -------->  BOOK
 OF THE PROJECT GUTENBERG EBOOK  -------->   PR
 OF THE PROJECT GUTENBERG EBOOK PR  -------->  IDE
 OF THE PROJECT GUTENBERG EBOOK PRIDE  -------->   AND


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once. Windows of ``max_length`` token ids are then
    taken every ``stride`` positions. Each sample is an ``(input, target)``
    pair of 1-D tensors of length ``max_length``, where the target is the
    input window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of token ids per window; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Note:
        A window starts at position ``i`` only while ``i + max_length`` is
        strictly less than the token count, so that the shifted target fits.
        A text with no more than ``max_length`` tokens yields an empty dataset.
    """

    def __init__(self, txt: str, tokenizer, max_length: int, stride: int) -> None:
        # Fail loudly on window settings that would otherwise build an empty
        # dataset or misaligned/empty chunks without any error.
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text, allowing the "<|endoftext|>" special token.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window over the token ids; windows overlap when stride < max_length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            # Same window shifted by one: target[k] is the token after input[k].
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self) -> int:
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

        

In [11]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``. The remaining
    arguments are forwarded unchanged to ``DataLoader``.

    ``drop_last=True`` discards a final batch smaller than ``batch_size``.
    The original author's note says this is meant to avoid loss spikes during
    training.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windowed_dataset, **loader_options)

In [12]:
# DataLoader smoke test: batch size 1 and a context length of 4 tokens.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Wrap the DataLoader in a Python iterator so the built-in next() can pull
# one batch at a time.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[19160,    25, 21735,   290]]), tensor([[   25, 21735,   290,  3771]])]


In [13]:
# Larger stride and batch size: with stride equal to max_length, consecutive
# input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Fetch the first batch and unpack it into its input and target tensors.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[19160,    25, 21735,   290],
        [ 3771, 10456,   501,   198],
        [  198, 13838,    25, 12091],
        [ 2517,   268,   198,   198],
        [26362,  3128,    25,  2795],
        [  352,    11,  7795,   685],
        [   68, 10482,  1303,  1485],
        [ 3682,    60,   198,   220]])

Targets:
 tensor([[   25, 21735,   290,  3771],
        [10456,   501,   198,   198],
        [13838,    25, 12091,  2517],
        [  268,   198,   198, 26362],
        [ 3128,    25,  2795,   352],
        [   11,  7795,   685,    68],
        [10482,  1303,  1485,  3682],
        [   60,   198,   220,   220]])
