In [None]:
import re
import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from dataclasses import dataclass
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@dataclass(frozen=True)
class Config:
    text_path = 'dataset/sherlock-holm.es_stories_plain-text_advs.txt'
    max_length = 128
    stride = 4
    batch_size = 16

cfg = Config()

In [3]:
# 전처리 및 토크나이즈
with open(cfg.text_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()

# 줄 앞뒤 공백 제거 및 연속된 빈 줄 1개로 정리
raw_text = re.sub(r'\s+\n', '\n', raw_text)
raw_text = re.sub(r'^\s+', '', raw_text, flags=re.MULTILINE)
raw_text = re.sub(r'\n{2,}', '\n\n', raw_text)

# 제목 삭제
raw_text = re.sub(r'THE ADVEN.*\n', '', raw_text, flags=re.MULTILINE)

# 목차나 저자 등 메타데이터 제거
raw_text = re.sub(r'Arthur Conan Doyle', '', raw_text, flags=re.IGNORECASE)
raw_text = re.sub(r'\bTable of contents\b.*?(?=CHAPTER I)', '', raw_text, flags=re.DOTALL|re.IGNORECASE)
raw_text = re.sub(r'chapter ([0~9]|[ivx])', '', raw_text, flags=re.IGNORECASE)
raw_text = re.sub(r'----.*', '', raw_text, flags=re.DOTALL)


# 특수문자, 과도한 빈칸 정리
raw_text = re.sub(r'[“”]', '"', raw_text)
raw_text = re.sub(r"[‘’]", "'", raw_text)
raw_text = re.sub(r' +', ' ', raw_text)

# 앞뒤 전체 공백 제거
raw_text = raw_text.strip()

# 전처리된 텍스트 결과
preprocessed_text = raw_text
print(f"preprocessed text length: {len(preprocessed_text)}, words: {len(preprocessed_text.split())}")

# 토크나이저 초기화
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# 토크나이즈
tokenized_text = tokenizer.encode(preprocessed_text)
print(len(tokenized_text))

preprocessed text length: 558453, words: 104321


Token indices sequence length is longer than the specified maximum sequence length for this model (133741 > 512). Running this sequence through the model will result in indexing errors


133741


In [4]:
class MyDataset(Dataset) :
    def __init__(self, tokenized_text, cfg) :
        self.tokenized_text = tokenized_text
        self.cfg = cfg
        
        self.input_ids = []
        self.labels = []
        
        for i in range(0, len(self.tokenized_text) - self.cfg.max_length, self.cfg.stride) :
            self.input_ids.append(self.tokenized_text[i:i+self.cfg.max_length])
            self.labels.append(self.tokenized_text[i+1:i+self.cfg.max_length+1])
        
        self.input_ids = torch.tensor(self.input_ids)
        self.labels = torch.tensor(self.labels)
        
    def __len__(self) :
        return len(self.input_ids)
    
    def __getitem__(self, idx) :
        return self.input_ids[idx], self.labels[idx]

모델 참조 : https://www.manning.com/books/build-a-large-language-model-from-scratch

In [5]:
VOCAB_SIZE = len(tokenizer.vocab)
print(VOCAB_SIZE)
#VOCAB_SIZE = len(tokenizer) # AutoTokenizer
L = 128  # Shortened context length (orig: 1024)
E = 768  # Embedding dimension
H = 12  # Number of attention heads
NUM_LAYERS = 12  # Number of layers
DROP_RATE = 0.1  # Dropout rate

30522


In [6]:
class MultiheadAttention(nn.Module) :
    def __init__(self, din, dout) :
        super().__init__()
        
        assert dout % H == 0, "dout must be devided by NUM_HEADS"
        
        self.din = din
        self.dout = dout
        self.D = dout // H
        
        self.W_Q = nn.Linear(din, dout)
        self.W_K = nn.Linear(din, dout)
        self.W_V = nn.Linear(din, dout)
        self.out_proj = nn.Linear(dout, dout)
        self.dropout = nn.Dropout(p=DROP_RATE)
        self.register_buffer('mask', torch.triu(torch.ones(L, L), diagonal=1)) # Causal Attn을 위한 Mask
        
    def forward(self, x) :
        B, L, E = x.shape
        D = self.D
        
        # Query, Key, Value Matrix 생성 (B, L, E)
        Q = self.W_Q(x)
        K = self.W_K(x)
        V = self.W_V(x)
        
        # Multihead 반영 (B, L, E) -> (B, L, H, D) -> (B, H, L, D)
        Q = Q.view(B, L, H, D).transpose(1, 2)
        K = K.view(B, L, H, D).transpose(1, 2)
        V = V.view(B, L, H ,D).transpose(1, 2)
        
        # Attention Score 구하기
        attn_scores = Q @ K.transpose(2, 3) # (B, H, L, L)
        attn_scores.masked_fill(self.mask[:L, :L] == 1, float('-inf'))
        attn_scores = attn_scores / (D ** 0.5)
        
        # Attention Weight 구하기
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Context Vector 구하기
        CV = attn_weights @ V # (B, H, L, D)
        CV = CV.reshape(B, L, E)
        CV = self.out_proj(CV)
        
        return CV
    
class LayerNorm(nn.Module) :
    def __init__(self, dim, eps=1e-5) :
        super().__init__()
        
        # 파라미터 생성
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))
        
    def forward(self, x) :
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x = (x - mean) / torch.sqrt(var + self.eps) * self.gamma + self.beta
        return x

class FeedForward(nn.Module) :
    def __init__(self) :
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(E, 4*E),
            nn.GELU(),
            nn.Linear(4*E, E),
        )
    
    def forward(self, x) :
        x = self.layers(x)
        return x
    
class TransformerBlock(nn.Module) :
    def __init__(self):
        super().__init__()
        self.attn = MultiheadAttention(E, E)
        self.norm = LayerNorm(E)
        self.ff = FeedForward()
        self.dropout = nn.Dropout(DROP_RATE)
        
    def forward(self, x):
        residual = x
        x = self.norm(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + residual
        
        residual = x
        x = self.norm(x)
        x = self.ff(x)
        x = self.dropout(x)
        x = x + residual
        
        return x
    
class SimpleLLM(nn.Module) :
    def __init__(self) :
        super().__init__()
        self.token_embedding = nn.Embedding(VOCAB_SIZE, E)
        self.positional_embedding = nn.Embedding(L, E)
        self.dropout_embedding = nn.Dropout(DROP_RATE)
        
        self.blocks = nn.Sequential(
            *[TransformerBlock() for _ in range(NUM_LAYERS)]
        )
        
        self.last_norm = LayerNorm(E)
        self.out = nn.Linear(E, VOCAB_SIZE, bias=False)
        
    def forward(self, input_ids) :
        B, L = input_ids.shape
        tok_emb = self.token_embedding(input_ids)
        pos_emb = self.positional_embedding(torch.arange(L, device=input_ids.device))
        x = tok_emb + pos_emb
        x = self.dropout_embedding(x)
        x = self.blocks(x)
        x = self.last_norm(x)
        logits = self.out(x)
        return logits

In [None]:
# 모델 학습

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

torch.manual_seed(42)
model = SimpleLLM().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)
train_dataset = MyDataset(tokenized_text, cfg)
train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True)

num_epochs = 30

epoch_losses = []
for epoch in range(num_epochs):
    model.train()
    pbar = tqdm(train_loader, desc=f"[Epoch : {epoch + 1}]")
    
    epoch_loss = 0
    for input_ids, labels in pbar:
        optimizer.zero_grad()
        input_ids, labels = input_ids.to(device), labels.to(device) # (B, L)
        
        logits = model(input_ids) # (B, L, V)
        loss = F.cross_entropy(logits.flatten(0, 1), labels.flatten())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=loss.item())
    
    avg_loss = epoch_loss / len(pbar)
    epoch_losses.append(avg_loss)
    print(f"Avg Loss : {avg_loss}")
    torch.save(model.state_dict(), f"model/simplellm_epoch{epoch+1}.pth")

cuda


[Epoch : 1]:  40%|███▉      | 831/2088 [02:46<04:11,  5.00it/s, loss=0.111] 


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\fnf00\AppData\Roaming\Python\Python313\site-packages\IPython\core\interactiveshell.py", line 3549, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
    ~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\fnf00\AppData\Local\Temp\ipykernel_20248\2669281967.py", line 31, in <module>
    pbar.set_postfix(loss=loss.item())
                          ~~~~~~~~~^^
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\fnf00\AppData\Roaming\Python\Python313\site-packages\IPython\core\interactiveshell.py", line 2173, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
        etype, value, tb, tb_offset=tb_offset
    )
  File "C:\Users\fnf00\AppData\Roaming\Python\Python313\site-packages\IPython\core\ultratb.py", line 1182, in structured_traceback
    return FormattedTB.structured_traceback(
           ~~~~~~~~~~~

In [None]:
# 학습 결과 출력
plt.plot(epoch_losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.show()

In [None]:
# 문장 생성

# 모델 불러오기
model.load_state_dict(torch.load('asdf.pth', map_location=device, weights_only=True))
model.eval()

text = "It was evident" # 시작 문장

input_ids = tokenizer.encode(text)
input_ids = torch.tensor(input_ids).unsqueeze(0) # (1, L)

generation_len = 10
for _ in range(generation_len) :
    with torch.no_grads() :
        logits = model(input_ids) # (1, L, V)
    logits = logits[:, -1, :] # 마지막 토큰(새로 생성된 토큰만 추출), (1, V) -> 정수 인덱싱 시 차원 삭제됨됨
    
    next_token = torch.argmax(logits, dim=-1, keepdim=True) # (1, 1)
    input_ids = torch.cat((input_ids, next_token), dim=-1)
    
input_ids = input_ids.squeeze(0) # (Generation_Len, )
generated_text = tokenizer.decode(input_ids.tolist().replace("\n", " "))

print(generated_text)