# Working with text data

### 목표 : LLM에 필요한 데이터셋 준비, 즉 raw text data를 LLM이 처리할 수 있는 형태로 변환하는 것

할일
1. 텍스트 토큰화
2. 토큰화된 텍스트의 하위 부분을 토큰 ID로 변환
3. 토큰 ID를 벡터로 인코딩

토큰화란? 입력된 텍스트를 더 작은 단위로 나누는 것
Tiktokenizer를 이용하면 토큰이 어떤 형태로 나눠지는지 한번 볼 수 있음.

https://tiktokenizer.vercel.app/

### 1. 정규식을 이용한 토큰 생성

In [1]:
import os
import urllib.request

# Text used throughout: the short story "the-verdict".
# Download it from GitHub unless a local copy already exists.
file_path = "the-verdict.txt"  # single source of truth for the local filename
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
len(raw_text)

20479

In [5]:
import re

text = "I HAD always thought Jack Gisburn rather a cheap genius"
result = re.split(r'(\s)', text)

print(result)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius']


In [6]:
result = re.split(r'([,.]|\s)', text)

In [7]:
result

['I',
 ' ',
 'HAD',
 ' ',
 'always',
 ' ',
 'thought',
 ' ',
 'Jack',
 ' ',
 'Gisburn',
 ' ',
 'rather',
 ' ',
 'a',
 ' ',
 'cheap',
 ' ',
 'genius']

In [8]:
result = [item for item in result if item.strip()]
print(result)

# 공백문자 제거 (선택사항)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']


In [9]:
# Tokenize raw_text with a regular expression: split on the listed punctuation,
# "--" and whitespace (the capturing group keeps the delimiters as pieces),
# then strip each piece and drop the ones that are empty.

preprocessed = result = [
    piece.strip()
    for piece in re.split(r'([,.;:?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]

In [10]:
len(preprocessed)

4690

### 2. 토큰을 토큰 ID로 변환하기

In [11]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [12]:
# 단어들을 정렬하고 각각의 단어에 정수 숫자를 부여하여 vocabulary 생성

vocab = {token:integer for integer, token in enumerate(all_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [13]:
class SimpleTokenizerV1:
    """Minimal regex-based tokenizer backed by a fixed vocabulary.

    ``encode`` looks every token up directly in the vocabulary, so a token
    that is not a key of ``vocab`` raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse (id -> token) map."""
        self.str_to_int = vocab  # token string -> integer id
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())  # integer id -> token string

    def encode(self, text):
        """Return the list of token ids for ``text``."""
        # Split on punctuation, "--" and whitespace; the capturing group keeps
        # the delimiters themselves as pieces.
        pieces = re.split(r'([,.;:?_!"()\']|--|\s)', text)

        ids = []
        for piece in pieces:
            token = piece.strip()
            if token:  # skip empty and whitespace-only pieces
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``.

        Tokens are joined with single spaces, then the whitespace character
        directly in front of any of the listed punctuation marks is removed.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s([,.;:?_!"()\'])', r'\1', joined)

In [14]:
tokenizer = SimpleTokenizerV1(vocab)

In [15]:
text = "I HAD always thought Jack Gisburn rather a cheap genius"
ids = tokenizer.encode(text)
print(ids)

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


In [16]:
tokenizer.decode(ids)

'I HAD always thought Jack Gisburn rather a cheap genius'

### 3. 특수 토큰을 통한 어휘 확장

In [17]:
# text = "Hello, do you like tea? is this-- a test."

# tokenizer.encode(text)
# #토큰 목록에 Hello가 없기 때문에 오류 발생

In [18]:
# Unique tokens in sorted order, followed by the two special tokens:
# "<|endoftext|>" and "<|unk|>" (used for tokens missing from the vocabulary).
# sorted() already returns a list, so no intermediate list() is needed.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# Map each token to its position in all_tokens.
vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [19]:
# Adding the two special tokens grows the vocabulary size by 2.
len(vocab)

1132

In [20]:
# Show the last five (token, id) entries of the vocabulary.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [21]:
class SimpleTokenizerV2:
    """Regex-based tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``."""

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse (id -> token) map."""
        self.str_to_int = vocab  # token string -> integer id
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())  # integer id -> token string

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not keys of the vocabulary are encoded with the id of
        ``"<|unk|>"``.
        """
        # Split on punctuation, "--" and whitespace; the capturing group keeps
        # the delimiters themselves as pieces.
        pieces = re.split(r'([,.;:?_!"()\']|--|\s)', text)

        ids = []
        for piece in pieces:
            token = piece.strip()
            if not token:  # skip empty and whitespace-only pieces
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"  # placeholder for words missing from the vocabulary
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``.

        Tokens are joined with single spaces, then the whitespace character
        directly in front of any of the listed punctuation marks is removed.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s([,.;:?_!"()\'])', r'\1', joined)

In [22]:
text = "Hello, do you like tea? is this-- a test."

tokenizer = SimpleTokenizerV2(vocab)

tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 584, 999, 6, 115, 1131, 7]

In [23]:
tokenizer.decode(tokenizer.encode(text))

# vocab 에 없는 단어는 <|unk|> 토큰으로 대체

'<|unk|>, do you like tea? is this -- a <|unk|>.'

### 바이트 쌍 인코딩

In [24]:
import tiktoken

In [25]:
tiktoken.__version__

'0.12.0'

In [26]:
tokenizer = tiktoken.get_encoding("gpt2")

In [27]:
tokenizer.encode("Hello World!")

[15496, 2159, 0]

In [28]:
tokenizer.decode(tokenizer.encode("Hello World!"))

'Hello World!'

In [29]:
# text = ("Hello, do you like tea? <|endoftext|> is this-- a test.")

# tokenizer.encode(text)

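# # Raises an error: encode() rejects the special token "<|endoftext|>" in the input unless it is passed via allowed_special (see the next cell)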

In [30]:
text = ("Hello, do you like tea? <|endoftext|> is this-- a test.")

tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 318, 428, 438, 257, 1332, 13]

In [31]:
text = ("hello, hmyniameis rlatmdghks")

tokenizer.encode(text, allowed_special={"<|endoftext|>"})

[31373, 11, 289, 1820, 8461, 480, 271, 374, 15460, 9132, 456, 591]

### 4. sliding window를 이용한 데이터 샘플링

In [32]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [33]:
enc_semple = enc_text[:50]

In [34]:
context_size = 4

x = enc_semple[:context_size]
y = enc_semple[1:context_size + 1]

print(f"x: {x}")
print(f"y:     {y}")

x: [40, 367, 2885, 1464]
y:     [367, 2885, 1464, 1807]


In [35]:
for i in range(1, context_size+1):
    context = enc_semple[:i]
    desired = enc_semple[i]

    print(context, "---->", desired)

# LLM이 다음 단어를 예측 하는것을 시각적으로 표현?

[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [36]:
import torch

torch.__version__

'2.6.0+cu124'

1. 입력데이터를 가져와 청크를 생성
2. 입력과 (한 토큰씩 뒤로 밀린) 타겟을 함께 생성하여, 나중에 학습에 활용할 때 둘 다 쌍으로 사용할 수 있도록 함

In [37]:
from torch.utils.data import Dataset, DataLoader
# pytorch 데이터로더 는 배치생성, 셔플, 여러 백그라운드 프로세스를 이용하여 효율적인 작업이 가능케 함

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Each sample pairs ``max_length`` consecutive token ids with the same
    window shifted one position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into windows.

        Args:
            txt: Text handed to ``tokenizer.encode``.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token ids.
            max_length: Number of token ids per window.
            stride: Step between the start positions of consecutive windows.
        """
        # Tokenize the whole text once.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets of the windows; stopping before len - max_length
        # guarantees every target window is complete.
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=25, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to turn into samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # GPT-2 byte-pair encoding from tiktoken.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Sliding-window (input, target) samples.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

drop_last = True를 사용하는 이유

데이터로더는 데이터셋을 batch_size 개씩 잘라서 batch를 만드는데, 전체 샘플 수가 batch_size로 나누어떨어지지 않을 경우

마지막 batch가 다른 batch보다 작아져 학습이 불안정해지는 경우가 많음. 그래서 학습 안정성을 위해 drop_last = True로 마지막 batch를 사용하지 않는 것이다.

In [39]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [50]:
# Bind the loader to a lowercase name so the imported `DataLoader` class is not
# shadowed: create_dataloader_v1 calls `DataLoader(...)` by its global name, so
# rebinding that name to an instance breaks every later call.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

TypeError: create_dataloader_v1() got an unexpected keyword argument 'max_length'

In [41]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [64]:
from torch.utils.data import Dataset, DataLoader

def create_dataloader_v1(txt, batch_size=4, max_length=25, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to turn into samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # GPT-2 byte-pair encoding from tiktoken.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Sliding-window (input, target) samples.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

stride가 1일 때는 인접한 샘플끼리 토큰이 거의 전부 겹쳐 같은 토큰이 반복해서 학습되기 때문에 과적합이 발생할 수 있다.
그래서 stride를 키워(예: stride = max_length) 샘플 간에 같은 토큰이 겹쳐 학습되지 않게 해야 한다

In [60]:
# NOTE(review): this assignment rebinds the name `DataLoader` (imported from
# torch.utils.data) to a loader instance, while create_dataloader_v1 calls
# `DataLoader(...)` by that global name. The following cell iterates this same
# variable, so the name is left unchanged here; re-import the class before
# calling create_dataloader_v1 again.
DataLoader = create_dataloader_v1(raw_text,batch_size=1, max_length=4, stride=4, shuffle=False)

data_iter = iter(DataLoader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [61]:
# Create a fresh iterator over the loader bound in the previous cell and fetch
# its first batch again.
data_iter = iter(DataLoader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [62]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [66]:
from torch.utils.data import Dataset, DataLoader

def create_dataloader_v1(txt, batch_size=4, max_length=25, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to turn into samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # GPT-2 byte-pair encoding from tiktoken.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Sliding-window (input, target) samples.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [67]:
# Bind the loader to a lowercase name so the imported `DataLoader` class is not
# shadowed: create_dataloader_v1 calls `DataLoader(...)` by its global name, so
# rebinding that name to an instance breaks every later call.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### 토큰 임베딩 생성

입력 텍스트를 토큰화된 텍스트로 변환하고, 토큰화된 텍스트를 토큰 ID로 변환하였다.   
이제 토큰 ID를 가져와서 임베딩 벡터로 변환시킬 것 이다.

In [68]:
inputs_ids = torch.tensor([2, 3, 5, 1])

In [None]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.5880,  0.3486,  0.6603],
        [-0.2196, -0.3792,  0.7671],
        ...,
        [-0.5931,  1.0895, -0.6854],
        [ 0.7447,  0.5803, -0.4246],
        [-0.3130,  0.7558, -1.2656]], requires_grad=True)


In [72]:
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [73]:
embedding_layer(inputs_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [76]:
embedding_layer = torch.nn.Embedding(tokenizer.n_vocab, output_dim)

In [78]:
tokenizer.n_vocab

50257

In [77]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.1583,  1.0919,  0.4989],
        [-2.0425,  0.4938,  0.6984],
        [ 1.0124, -0.6895, -0.8435],
        ...,
        [ 0.1982,  1.9860,  0.1049],
        [ 0.5026, -0.7659, -2.8447],
        [ 1.9569,  0.2360,  0.0608]], requires_grad=True)


### Encoding word positions

In [83]:
vocab_size = 50257
output_dim = 253

torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [84]:
from torch.utils.data import Dataset, DataLoader

def create_dataloader_v1(txt, batch_size=4, max_length=25, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to turn into samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # GPT-2 byte-pair encoding from tiktoken.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Sliding-window (input, target) samples.
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [85]:
max_length = 4
# Bind the loader to a lowercase name so the imported `DataLoader` class is not
# shadowed: create_dataloader_v1 calls `DataLoader(...)` by its global name, so
# rebinding that name to an instance breaks every later call.
# stride == max_length, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [87]:
token_embedding = token_embedding_layer(inputs)
token_embedding.shape

torch.Size([8, 4, 253])

In [91]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [92]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [94]:
pos_embedding_layer.weight

Parameter containing:
tensor([[ 0.1426, -0.5349, -0.5544,  ...,  0.7296,  0.2534,  0.0285],
        [-0.0070, -0.5922, -0.1508,  ...,  0.0977, -1.1068, -0.4189],
        [-0.2690,  0.9343,  0.3676,  ..., -0.4828,  0.4321,  1.8220],
        [ 0.2415, -0.3305,  0.4740,  ...,  0.5371,  0.3306,  0.7690]],
       requires_grad=True)

In [95]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 253])


In [96]:
token_embedding.shape

torch.Size([8, 4, 253])

In [98]:
token_embedding[0] + pos_embeddings

tensor([[-0.2120,  0.3492, -0.4310,  ...,  3.6023,  1.1772, -1.0260],
        [-0.4825,  0.0579,  0.1281,  ..., -0.2010, -1.1747, -1.9584],
        [-0.4769, -1.1054, -0.4435,  ...,  0.0188, -0.3323,  2.0604],
        [ 0.3442,  0.4545,  1.6623,  ...,  0.5838,  1.6635,  2.3226]],
       grad_fn=<AddBackward0>)

In [99]:
token_embedding + pos_embeddings

tensor([[[-0.2120,  0.3492, -0.4310,  ...,  3.6023,  1.1772, -1.0260],
         [-0.4825,  0.0579,  0.1281,  ..., -0.2010, -1.1747, -1.9584],
         [-0.4769, -1.1054, -0.4435,  ...,  0.0188, -0.3323,  2.0604],
         [ 0.3442,  0.4545,  1.6623,  ...,  0.5838,  1.6635,  2.3226]],

        [[ 0.4133, -2.9976, -1.2330,  ...,  2.4584, -0.2497,  0.2093],
         [ 1.6403,  0.7245, -0.0144,  ...,  0.8824, -1.6190, -0.7955],
         [ 0.2591,  0.5485,  0.6395,  ...,  0.8905,  1.9007,  1.7901],
         [-0.2560,  1.5232, -0.3047,  ...,  0.1537, -0.7325,  0.1415]],

        [[ 0.2131,  0.8953,  0.1293,  ..., -0.0073,  0.3102, -0.4645],
         [-1.7381, -0.8701,  0.4405,  ...,  0.7112, -2.1715, -0.2707],
         [-1.0213,  0.6994,  0.2851,  ..., -1.6813,  1.5077,  0.8848],
         [ 2.0217, -0.6238, -1.5472,  ...,  0.3562,  1.4300,  1.6656]],

        ...,

        [[ 1.7065, -0.6231, -1.3246,  ...,  1.1514,  0.9423, -1.1236],
         [-2.6719, -0.9388, -1.2939,  ...,  0.9628, -2.73

In [97]:
input_embedding = token_embedding + pos_embeddings
print(input_embedding.shape)

torch.Size([8, 4, 253])
