In [None]:
from datasets import load_dataset

dataset = load_dataset("daekeun-ml/naver-news-summarization-ko")

# Concatenate every training document into one string and collect its distinct
# characters; their sorted order defines the character-level vocabulary.
# (The original read from an undefined name `data`; the loaded object is `dataset`.)
ko_text = "".join(dataset["train"]["document"])
ko_chars = sorted(set(ko_text))
ko_vocab_size = len(ko_chars)
print("총 글자 수 :", ko_vocab_size)

# Lookup tables between characters and integer ids (ids follow the sorted order).
character_to_ids = {char: i for i, char in enumerate(ko_chars)}
ids_to_character = {i: char for i, char in enumerate(ko_chars)}


def token_encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``ko_chars``.
    """
    return [character_to_ids[c] for c in s]


def token_decode(l):
    """Return the string obtained by mapping each id in ``l`` back to its character.

    Raises:
        KeyError: if ``l`` contains an id that is not in ``ids_to_character``.
    """
    return "".join(ids_to_character[i] for i in l)


# Round-trip check: encoding and then decoding should reproduce the sentence.
print(token_encode("안녕하세요 함께 인공지능을 공부하게 되어 반가워요."))
print(token_decode(token_encode("안녕하세요 함께 인공지능을 공부하게 되어 반가워요.")))

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class semiGPT(nn.Module):
    """Minimal language model made of a single embedding lookup.

    Every token id selects one row of a (vocab_length, vocab_length) table, and
    that row is returned as the logits for the token.
    """

    def __init__(self, vocab_length):
        super().__init__()
        # An embedding turns each token id into its own numeric vector so the
        # text can be processed numerically.
        # First argument: number of distinct tokens.
        # Second argument: length of the vector stored per token. It is also
        # vocab_length here, so each token is described by one value per
        # vocabulary entry.
        table = nn.Embedding(vocab_length, vocab_length)
        self.embedding_token_table = table

    def forward(self, x):
        """Return the embedding rows (logits) for every token id in ``x``."""
        return self.embedding_token_table(x)

model = semiGPT(ko_vocab_size)  # ko_vocab_size = 2701 per the original note
# forward() of this class accepts only the input ids, so the targets are not
# passed (passing example_y as well raises a TypeError).
output = model(example_x)
print(output.shape)  # torch.Size([4, 8, 2701]) -> (batch, block, number of distinct characters)

In [None]:
# Loss 추가
import torch
import torch.nn as nn
from torch.nn import functional as F

class SemiGPT(nn.Module):
    """Embedding-table language model that also returns a cross-entropy loss."""

    def __init__(self, vocab_length):
        super().__init__()
        table = nn.Embedding(vocab_length, vocab_length)
        self.embedding_token_table = table

    def forward(self, inputs, targets):
        """Return ``(logits, loss)`` for ``inputs`` scored against ``targets``.

        The logits are handed to ``F.cross_entropy`` without any reshaping.
        """
        # Scores for each input token, read straight from the table.
        logits = self.embedding_token_table(inputs)

        # Loss between the predicted scores and the labels.
        # NOTE: the original author marks this call as raising a shape error
        # for the notebook's example batch; that behaviour is kept on purpose.
        return logits, F.cross_entropy(logits, targets)

model = SemiGPT(ko_vocab_size)
# The forward pass above feeds un-reshaped logits to cross_entropy, which is
# expected to fail for the example batch. Catch and display the error so that
# the demonstration is visible without stopping a Restart & Run All.
try:
    output, loss = model(example_x, example_y)
except (RuntimeError, ValueError) as shape_error:
    print(f"{type(shape_error).__name__}: {shape_error}")
else:
    print(output)

In [None]:
# shape 추가한 코드
import torch
import torch.nn as nn
from torch.nn import functional as F

class SemiGPT(nn.Module):
    """Embedding-table language model whose loss is computed on flattened logits."""

    def __init__(self, vocab_length):
        super().__init__()
        self.embedding_token_table = nn.Embedding(vocab_length, vocab_length)

    def forward(self, inputs, targets):
        """Return ``(logits, loss)`` for a batch of token ids.

        Args:
            inputs: integer tensor of token ids; the reshape below requires the
                looked-up logits to be 3-D, i.e. ``inputs`` of shape
                (batch, seq_length).
            targets: integer tensor with batch * seq_length elements.

        Returns:
            A tuple ``(logits, loss)`` where ``logits`` has shape
            (batch * seq_length, vocab_length) and ``loss`` is the scalar
            cross-entropy between those logits and the flattened targets.
        """
        # nn.Embedding takes a single index tensor (the original passed it twice).
        logits = self.embedding_token_table(inputs)

        # Make the shapes agree: cross_entropy wants (N, C) logits and (N,)
        # targets. Sizes are read from the tensor instead of a hard-coded 32 so
        # that any batch/sequence size works.
        batch, seq_length, vocab_length = logits.shape
        logits = logits.view(batch * seq_length, vocab_length)
        targets = targets.view(batch * seq_length)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

# Instantiate the model over the character vocabulary and print the loss it
# reports for one example batch (example_x / example_y come from an earlier cell).
model = SemiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

# Generate 메서드

In [None]:
# generate 메서드 추가
import torch
import torch.nn as nn
from torch.nn import functional as F

class SemiGPT(nn.Module):
    """Embedding-table language model with loss computation and token sampling."""

    def __init__(self, vocab_length):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.embedding_token_table = nn.Embedding(vocab_length, vocab_length)

    def forward(self, inputs, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            inputs: integer tensor of token ids, shape (batch, seq_length).
            targets: optional integer tensor with batch * seq_length elements.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` keeps its
            (batch, seq_length, vocab_length) shape and ``loss`` is ``None``.
            With targets, ``logits`` is flattened to
            (batch * seq_length, vocab_length) and ``loss`` is a scalar tensor.
        """
        logits = self.embedding_token_table(inputs)
        if targets is None:
            # Generation path: there is nothing to score against.
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets; sizes are taken
        # from the tensor rather than a hard-coded 32.
        batch, seq_length, vocab_length = logits.shape
        logits = logits.view(batch * seq_length, vocab_length)
        targets = targets.view(batch * seq_length)

        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, inputs, max_new_tokens):
        """Append ``max_new_tokens`` sampled token ids to ``inputs``.

        Args:
            inputs: integer tensor of shape (batch, seq_length) used as the prompt.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            Integer tensor of shape (batch, seq_length + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, loss = self.forward(inputs)
            # Only the last position is used to choose the next token.
            logits = logits[:, -1, :]

            probs = F.softmax(logits, dim=-1)
            # Sample the next token from the distribution (not an argmax).
            next_inputs = torch.multinomial(probs, num_samples=1)
            # Extend the running sequence with the newly sampled token.
            inputs = torch.cat((inputs, next_inputs), dim=1)
        return inputs

model = SemiGPT(ko_vocab_size)
logits, loss = model(example_x, example_y)
print(loss)

# token_decode only maps ids back to their characters; the predicted ids
# themselves are produced by generate().
# The closing parenthesis of generate() belongs right after max_new_tokens=10,
# so that [0].tolist() indexes the generated tensor rather than the integer 10.
token_decode(
    model.generate(
        torch.zeros((1, 1), dtype=torch.long),
        max_new_tokens=10,
    )[0].tolist()
)

# Optimizer 추가하기

learning_rate = 1e-2
# Use SemiGPT, whose forward(inputs, targets) returns (logits, loss). The
# lowercase semiGPT defined earlier accepts only inputs and returns logits, so
# the training loop's `logits, loss = model(example_x, example_y)` fails with it.
model = SemiGPT(ko_vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

from tqdm.auto import tqdm

batch_size = 32
for steps in tqdm(range(10_000)):
    # Reset the gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)
    # Forward pass on a freshly sampled training batch.
    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    # Backpropagate, then update the weights.
    loss.backward()
    optimizer.step()

# Loss of the final training step.
print(loss.item())

# 데이터 GPU에 전달하기

# Select the compute device: CUDA when PyTorch reports it available, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

# Same batch_function as before, extended to move the batch onto `device`.
def batch_function(mode):
    """Sample a random batch of input/target windows.

    Reads the module-level ``train_dataset`` / ``test_dataset``, ``block_size``,
    ``batch_size`` and ``device`` (the datasets are presumably 1-D tensors of
    token ids defined in another cell; verify there).

    Args:
        mode: ``"train"`` selects ``train_dataset``; any other value selects
            ``test_dataset``.

    Returns:
        ``(x, y)`` stacked tensors on ``device``; each row of ``y`` is the
        corresponding row of ``x`` shifted one position to the right in the data.
    """
    if mode == "train":
        dataset = train_dataset
    else:
        dataset = test_dataset
    # One random window start per batch element.
    starts = torch.randint(len(dataset) - block_size, (batch_size,))
    windows, shifted_windows = [], []
    for start in starts:
        windows.append(dataset[start:start + block_size])
        shifted_windows.append(dataset[start + 1:start + block_size + 1])
    # Stack into batch tensors and move them to the selected device.
    x = torch.stack(windows).to(device)
    y = torch.stack(shifted_windows).to(device)
    return x, y

# 전체 코드

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Batch geometry: windows per batch and tokens per window.
batch_size = 32
block_size = 8

# Optimisation: step size and number of training steps.
learning_rate = 1e-2
max_iteration = 50_000

# Evaluation: how often to report, and how many batches to average per report.
eval_interval = 300
eval_iteration = 200

# Compute device: CUDA when available, otherwise CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

def batch_function(mode):
    """Draw a random batch of (input, target) windows and place it on ``device``.

    Uses the module-level ``train_dataset`` / ``test_dataset`` (presumably 1-D
    token-id tensors defined in another cell; verify there), ``block_size``,
    ``batch_size`` and ``device``.

    Args:
        mode: ``"train"`` reads from ``train_dataset``; every other value reads
            from ``test_dataset``.

    Returns:
        ``(x, y)`` where each row of ``x`` is a ``block_size`` slice of the data
        and the matching row of ``y`` is the same slice shifted by one position.
    """
    source = train_dataset if mode == "train" else test_dataset
    offsets = torch.randint(len(source) - block_size, (batch_size,))
    input_rows = []
    target_rows = []
    for offset in offsets:
        input_rows.append(source[offset:offset + block_size])
        target_rows.append(source[offset + 1:offset + block_size + 1])
    # Stack the windows and move both tensors to the selected device.
    x = torch.stack(input_rows).to(device)
    y = torch.stack(target_rows).to(device)
    return x, y

@torch.no_grad()
def compute_loss_metrics():
    """Return the mean loss over ``eval_iteration`` random batches per mode.

    Runs with gradients disabled. The module-level ``model`` is switched to
    eval mode for the measurement and back to train mode before returning.

    Returns:
        Dict with keys ``"train"`` and ``"eval"``, each holding the mean loss
        as a 0-dim tensor. The ``"eval"`` mode is passed to ``batch_function``
        unchanged.
    """
    model.eval()
    metrics = {}
    for split in ("train", "eval"):
        split_losses = torch.zeros(eval_iteration)
        for i in range(eval_iteration):
            _, batch_loss = model(*batch_function(split))
            split_losses[i] = batch_loss.item()
        metrics[split] = split_losses.mean()
    model.train()
    return metrics

class semiGPT(nn.Module):
    """Embedding-table language model with optional loss and token sampling."""

    def __init__(self, vocab_length):
        super().__init__()
        table = nn.Embedding(vocab_length, vocab_length)
        self.embedding_token_table = table

    def forward(self, inputs, targets=None):
        """Return ``(logits, loss)`` for a batch of token ids.

        Without ``targets`` the logits keep the shape produced by the embedding
        lookup and ``loss`` is ``None``. With ``targets`` the logits are
        flattened to (batch * seq_length, vocab_length), the targets to
        (batch * seq_length,), and ``loss`` is their cross-entropy.
        """
        logits = self.embedding_token_table(inputs)
        if targets is None:
            return logits, None

        batch, seq_length, vocab_length = logits.shape
        rows = batch * seq_length
        logits = logits.view(rows, vocab_length)
        flat_targets = targets.view(rows)
        return logits, F.cross_entropy(logits, flat_targets)

    def generate(self, inputs, max_new_tokens):
        """Sample ``max_new_tokens`` tokens one at a time and append them to ``inputs``."""
        sequence = inputs
        for _ in range(max_new_tokens):
            step_logits, _ = self.forward(sequence)
            # Distribution over the vocabulary at the last position only.
            last_probs = F.softmax(step_logits[:, -1, :], dim=-1)
            sampled = torch.multinomial(last_probs, num_samples=1)
            sequence = torch.cat((sequence, sampled), dim=1)
        return sequence

# Build the model on the selected device and attach an AdamW optimizer to it.
model = semiGPT(ko_vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iteration):
    # Every eval_interval steps, report the averaged train / eval losses.
    if not step % eval_interval:
        losses = compute_loss_metrics()
        print(f'step : {step}, train loss : {losses["train"]:.4f}, val loss : {losses["eval"]:.4f}')

    # One optimisation step: clear gradients, forward, backward, update.
    optimizer.zero_grad(set_to_none=True)
    example_x, example_y = batch_function("train")
    logits, loss = model(example_x, example_y)
    loss.backward()
    optimizer.step()

# Start from a single token with id 0, sample 100 more tokens and decode them.
inputs = torch.zeros((1, 1), dtype=torch.long, device=device)
print(token_decode(model.generate(inputs, max_new_tokens=100)[0].tolist()))