In [1]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader



In [3]:


class Seq2SeqDataset(Dataset):
    def __init__(self, data):
        """
        data = [(input_ids, target_ids), ...]
        input_ids, target_ids = LongTensor (seq_len,)
        """
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [4]:

def collate_fn(batch, x_pad_idx=0, y_pad_idx=0, device="cpu"):
    # batch = [(x, y), (x, y), ...]
    x_seqs, y_seqs = zip(*batch)

    # ① 각 문장 길이 저장
    input_lens  = [len(seq) for seq in x_seqs]
    target_lens = [len(seq) for seq in y_seqs]
    
    # ② pad_sequence 적용 (batch_first=True)
    x_padded = pad_sequence(
        x_seqs,
        batch_first=True,
        padding_value=x_pad_idx
    ).to(device)

    y_padded = pad_sequence(
        y_seqs,
        batch_first=True,
        padding_value=y_pad_idx
    ).to(device)

    return x_padded, y_padded, input_lens, target_lens

In [5]:
def get_dataloader(train_data, batch_size, device="cpu", x_pad_idx=0, y_pad_idx=0):
    dataset = Seq2SeqDataset(train_data)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(
            batch,
            x_pad_idx=x_pad_idx,
            y_pad_idx=y_pad_idx,
            device=device
        )
    )

In [6]:
device = 'cuda' if torch.cuda.is_available() else "cpu"
train_data = ''
train_loader = get_dataloader(train_data, batch_size=32, device=device)

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [8]:
import unicodedata
import re

RE_PUNCT = re.compile(r"([,.!?])")
RE_NON_ASCII = re.compile(r"[^a-zA-Z,.!?]+")
RE_MULTI_SPACE = re.compile(r"\s+")


def unicode_to_ascii_new(s:str) -> str:
    normalized = unicodedata.normalize("NFD", s)
    return "".join(c for c in normalized if unicodedata.category(c) != "Mn")

def normalize_text_new(text: str) -> str:
    text = unicode_to_ascii_new(text.lower().strip())
    text = RE_PUNCT.sub(r" \1", text)
    text = RE_NON_ASCII.sub(r" ", text)
    text = RE_MULTI_SPACE.sub(r" ", text).strip()
    return text

In [9]:
import io

def load_parallel_corpus(path: str, limit: int = 30000):
    corpus = []
    with io.open(path, "r", encoding="utf-8")as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            corpus.append(line.strip())
    return corpus

corpus = load_parallel_corpus("NLP/fra.txt", limit=30000)
print("Loaded", len(corpus))

Loaded 30000


In [None]:
MIN_LENGTH = 3
MAX_LENGTH = 25

def is_valid_length(tokens, min_len=3, max_len=25):
    return min_len <= len(tokens) <= max_len

X_r = []
y_r = []

for line in corpus:
    line = line.strip()
    if not line:
        continue
    
    # 탭으로 안전하게 분리 (3개 이하이면 skip)
    parts = line.split('\t')
    if len(parts) < 2:
        continue
    
    src, tgt = parts[0], parts[1]

    # 정규화 + 토큰화
    src_tokens = normalize_text_new(src).split()
    tgt_tokens = normalize_text_new(tgt).split()

    # 길이 필터 적용
    if is_valid_length(src_tokens, MIN_LENGTH, MAX_LENGTH) and \
       is_valid_length(tgt_tokens, MIN_LENGTH, MAX_LENGTH):
        X_r.append(src_tokens)
        y_r.append(tgt_tokens)

print(len(X_r), len(y_r))
print(X_r[0], y_r[0])


29427 29427
['i', 'see', '.'] ['je', 'comprends', '.']


In [11]:
from collections import Counter

def build_vocab(token_lists, min_freq=1, specials=None):
    counter = Counter()
    for tokens in token_lists:
        counter.update(tokens)

    
    vocab= []
    for token, freq in counter.most_common():
        if freq >= min_freq:
            vocab.append(token)

    
    if specials is None:
        specials = ["<PAD>", "UNK", "<s>", "</s>"]
    vocab = specials + vocab

    stoi = {word: idx for idx, word in enumerate(vocab)}
    itos = {idx: word for idx, word in enumerate(vocab)}

    return vocab, stoi, itos   

In [12]:
source_vocab, source2index, index2source = build_vocab(
    X_r,
    min_freq=1,
    specials=["<PAD>", "UNK", "<s>", "</s>"]
    )

target_vocab, target2index, index2target = build_vocab(
    y_r,
    min_freq=1,
    specials=["<PAD>", "UNK", "<s>", "</s>"]
)



In [None]:
def tokens_to_indices(tokens, vocab, unk_token='<UNK>'):
    return torch.tensor(
        [vocab[token] if token in vocab else vocab[unk_token]
         for token in tokens],
         dtype=torch.long
    )


def prepare_example(src_tokens, tgt_tokens, src_vocab, tgt_vocab):
    src_ids = tokens_to_indices(src_tokens + ["</s>"], src_vocab)
    tgt_ids = tokens_to_indices(tgt_tokens + ["</s>"], tgt_vocab)
    return src_ids, tgt_ids

# modern way: 전체 데이터 전처리
dataset = [prepare_example(so, ta, source2index, target2index) for so, ta in zip(X_r, y_r)]

In [14]:
device = "cpu"
dataloader = get_dataloader(
    train_data=dataset,   # 네가 만든 dataset 리스트
    batch_size=32,
    device=device,
    x_pad_idx=source2index["<PAD>"],
    y_pad_idx=target2index["<PAD>"]
)

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Encoder(nn.Module):
    def __init__(
        self,
        input_size: int,       # vocab size
        embedding_size: int,
        hidden_size: int,
        n_layers: int = 1,
        bidirectional: bool = False,
        dropout: float = 0.0,
        device: str = "cpu",
    ):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.n_directions = 2 if bidirectional else 1
        self.device = device

        self.embedding = nn.Embedding(input_size, embedding_size)

        self.gru = nn.GRU(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )

        self._init_weights()

    def _init_weights(self):
        # Xavier 초기화
        nn.init.xavier_uniform_(self.embedding.weight)

        for name, param in self.gru.named_parameters():
            if "weight_ih" in name or "weight_hh" in name:
                nn.init.xavier_uniform_(param)
            elif "bias" in name:
                nn.init.zeros_(param)

    def init_hidden(self, batch_size: int):
        # (num_layers * num_directions, batch, hidden_size)
        return torch.zeros(
            self.n_layers * self.n_directions,
            batch_size,
            self.hidden_size,
            device=self.device,
        )

    def forward(self, inputs, input_lengths):
        """
        inputs: (batch, seq_len)  - token indices
        input_lengths: list[int]  - 각 시퀀스 실제 길이
        """
        batch_size = inputs.size(0)
        hidden = self.init_hidden(batch_size)

        embedded = self.embedding(inputs)  # (batch, seq_len, embed_dim)

        # 길이 기준으로 pack (정렬 안 되어 있어도 enforce_sorted=False로 처리)
        packed = pack_padded_sequence(
            embedded,
            input_lengths,
            batch_first=True,
            enforce_sorted=False,
        )

        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)
        # outputs: (batch, seq_len, hidden_size * n_directions)

        # hidden: (num_layers * n_directions, batch, hidden_size)
        # → 마지막 레이어만 사용
        if self.bidirectional:
            # (num_layers, n_directions, batch, hidden)
            hidden = hidden.view(self.n_layers, self.n_directions, batch_size, self.hidden_size)
            # 마지막 레이어: (n_directions, batch, hidden)
            last_layer_hidden = hidden[-1]  # (2, batch, hidden)
            # forward/backward concat: (batch, hidden*2)
            last_layer_hidden = torch.cat(
                [last_layer_hidden[0], last_layer_hidden[1]], dim=-1
            )
            # 디코더 초기 hidden으로 쓰기 쉽게 (1, batch, hidden*2) 형태로 리턴
            dec_init_hidden = last_layer_hidden.unsqueeze(0)
        else:
            # unidirectional: hidden shape (num_layers, batch, hidden)
            hidden = hidden.view(self.n_layers, batch_size, self.hidden_size)
            last_layer_hidden = hidden[-1]  # (batch, hidden)
            dec_init_hidden = last_layer_hidden.unsqueeze(0)  # (1, batch, hidden)

        # outputs는 attention 쓸 때 전체 타임스텝마다 필요
        return outputs, dec_init_hidden


In [None]:
BATCH_SIZE = 64
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512

encoder = Encoder(
    input_size=len(source2index),
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    n_layers=3,
    bidirectional=True,
    device=device,
).to(device)

for x_batch, y_batch, x_len, y_len in dataloader:
    enc_outputs, enc_hidden = encoder(x_batch, x_len)
    print(enc_outputs.shape)  # (batch, seq_len, hidden*2)
    print(enc_hidden.shape)   # (1, batch, hidden*2)
    break


torch.Size([32, 7, 1024])
torch.Size([1, 32, 1024])


In [None]:
class AttnDecoder(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size, device="cpu", dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(embedding_size, hidden_size, batch_first=True)

        # Luong-style attention: score(h_t, h_s) = h_s^T W h_t
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, input_token, prev_hidden, encoder_outputs):
        """
        input_token: (B,) 현재 step에서 넣을 토큰 인덱스
        prev_hidden: (1,B,H) 이전 디코더 hidden (혹은 encoder에서 온 hidden)
        encoder_outputs: (B,T,H) 인코더 전체 출력
        """
        embedded = self.embedding(input_token).unsqueeze(1)  # (B,1,E)
        embedded = self.dropout(embedded)

        output, hidden = self.gru(embedded, prev_hidden)     # output: (B,1,H)

        # Attention
        # 1) encoder_outputs -> attn → (B,T,H)
        energies = self.attn(encoder_outputs)                # (B,T,H)
        # 2) energies · output^T
        attn_scores = torch.bmm(energies, output.transpose(1,2)).squeeze(2)  # (B,T)
        attn_weights = F.softmax(attn_scores, dim=1).unsqueeze(1)            # (B,1,T)
        context = torch.bmm(attn_weights, encoder_outputs)                   # (B,1,H)

        # 3) [output, context] concat 후 vocab score
        concat = torch.cat([output, context], dim=-1)        # (B,1,2H)
        logits = self.out(concat.squeeze(1))                 # (B,vocab)

        return logits, hidden, attn_weights


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cpu")

EPOCHS = 5
BATCH_SIZE = 256
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 512
LR = 1e-3
DECODER_LR_RATIO = 5.0
TEACHER_FORCING_RATIO = 0.5  # 예: 절반은 정답, 절반은 모델 예측을 입력으로 사용

# 1) 모델 생성
encoder = Encoder(
    input_size = len(source2index),
    embedding_size= EMBEDDING_SIZE,
    hidden_size = HIDDEN_SIZE,
    n_layers=3,
    bidirectional=True,
    device=device
    ).to(device)

decoder = AttnDecoder(
    input_size = len(target2index),
    embedding_size=EMBEDDING_SIZE,
    hidden_size = HIDDEN_SIZE * 2,
    device= device
).to(device)

# 2) 손실 함수 & 옵티마이저
PAD_IDX = target2index["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

enc_optimizer = optim.Adam(encoder.parameters(), lr=LR)
dec_optimizer = optim.Adam(decoder.parameters(), lr=LR * DECODER_LR_RATIO)

for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()

    epoch_losses = []

    for x_batch, y_batch, x_len, y_len in dataloader:
        # x_batch: (B, src_T)
        # y_batch: (B, tgt_T)
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # 1) Encoder
        enc_outputs, enc_hidden = encoder(x_batch, x_len)
        # enc_outputs: (B, src_T, H_enc)
        # enc_hidden:  (1, B, H_dec_init)  # 이미 concat 된 상태라고 가정

        # 2) Decoder 입력/타겟 준비
        #   y_batch = [<s>, y1, y2, ..., yN, </s>]
        trg_input  = y_batch[:, :-1]  # B, T-1  (디코더 입력)
        trg_target = y_batch[:, 1:]   # B, T-1  (정답 라벨)

        batch_size, max_len = trg_input.size()
        vocab_size = len(target2index)

        # 디코더 출력 저장 텐서
        all_logits = torch.zeros(batch_size, max_len, vocab_size, device=device)

        # 초기 디코더 hidden = encoder에서 받은 hidden
        dec_hidden = enc_hidden  # (1,B,H_dec)

        # 첫 입력 토큰: <s>
        input_token = trg_input[:, 0]  # B,

        for t in range(max_len):
            # 3) 현재 step 디코더 한 번 실행
            logits, dec_hidden, attn_weights = decoder(
                input_token,      # B,
                dec_hidden,       # (1,B,H)
                enc_outputs       # (B,src_T,H)
            )
            # logits: (B,vocab_size)

            all_logits[:, t, :] = logits

            # 4) teacher forcing 적용 여부
            use_teacher = (torch.rand(1).item() < TEACHER_FORCING_RATIO)

            if t+1 < max_len:  # 마지막 step 이후에는 필요 없음
                if use_teacher:
                    # 정답을 다음 step 입력으로
                    input_token = trg_input[:, t+1]
                else:
                    # 모델이 방금 낸 예측을 다음 입력으로
                    input_token = logits.argmax(dim=-1)

        # 5) Loss 계산
        # all_logits: (B, T-1, V) → (B*(T-1), V)
        # trg_target: (B, T-1)    → (B*(T-1))
        loss = criterion(
            all_logits.view(batch_size * max_len, vocab_size),
            trg_target.reshape(-1)
        )

        # 6) 역전파 + 업데이트
        enc_optimizer.zero_grad()
        dec_optimizer.zero_grad()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 50.0)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 50.0)

        enc_optimizer.step()
        dec_optimizer.step()

        epoch_losses.append(loss.item())

    print(f"[{epoch+1}/{EPOCHS}] mean_loss: {sum(epoch_losses)/len(epoch_losses):.4f}")


KeyboardInterrupt: 