In [5]:
import torch
import torch.nn as nn

# RNN Encoder-Decoder with GRU (Gated Recurrent Unit)

In [6]:
class GRUCell(nn.Module):
    """Single-step gated recurrent unit assembled from three linear layers.

    Every layer maps the concatenation of an input vector and a hidden vector
    (``input_size + hidden_size`` features) to ``hidden_size`` features.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        fan_in = input_size + hidden_size

        # Created in the order: update gate, reset gate, candidate state.
        self.W_z = nn.Linear(fan_in, hidden_size)
        self.W_r = nn.Linear(fan_in, hidden_size)
        self.W_h = nn.Linear(fan_in, hidden_size)

    def forward(self, x, h_prev):
        """Advance the hidden state by one time step.

        Args:
            x: Input for this step; concatenated with ``h_prev`` along dim 1.
            h_prev: Hidden state from the previous step.

        Returns:
            The new hidden state ``(1 - z) * h_prev + z * h_tilde``.
        """
        joined = torch.cat((x, h_prev), dim=1)

        # Both gates are computed from the same [x, h_prev] concatenation.
        update = self.W_z(joined).sigmoid()
        reset = self.W_r(joined).sigmoid()

        # The candidate state sees the previous state scaled by the reset gate.
        gated = torch.cat((x, reset * h_prev), dim=1)
        candidate = self.W_h(gated).tanh()

        # The update gate blends the old state with the candidate.
        keep = 1 - update
        return keep * h_prev + update * candidate

In [7]:
class Encoder(nn.Module):
    """Embeds a batch of token sequences and folds each one into a hidden state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding dimension.
        hidden_dim: Hidden-state size of the recurrent cell.
        n_layers: Accepted for interface compatibility but currently unused;
            the encoder always runs a single ``GRUCell``.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers=1, dropout=0.1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = GRUCell(emb_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` into the hidden state reached after the last position.

        Every position is fed through the cell in order, so padding tokens
        are processed like any other token.

        Args:
            src: Integer token indices of shape (batch_size, src_len).

        Returns:
            Tensor of shape (batch_size, hidden_dim).
        """
        embedded = self.dropout(self.embedding(src))  # (batch_size, src_len, emb_dim)
        batch_size, src_len, _ = embedded.size()

        # Allocate the initial state from the embeddings so it is created
        # directly on their device with their dtype, instead of building a
        # float32 CPU tensor and copying it over.
        hidden = embedded.new_zeros(batch_size, self.gru.hidden_size)

        for t in range(src_len):
            hidden = self.gru(embedded[:, t, :], hidden)

        return hidden  # (batch_size, hidden_dim)

class Decoder(nn.Module):
    """One-step decoder: embed a token, update the hidden state, score the vocabulary.

    Args:
        output_dim: Number of rows in the embedding table and width of the output layer.
        emb_dim: Embedding dimension.
        hidden_dim: Hidden-state size of the recurrent cell.
        dropout: Dropout probability applied to the embedding.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = GRUCell(emb_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        """Run a single decoding step.

        Args:
            input: Token indices of shape (batch_size,).
            hidden: Current hidden state of shape (batch_size, hidden_dim).

        Returns:
            Tuple ``(output, hidden)``: the linear-layer scores of shape
            (batch_size, output_dim) and the updated hidden state of shape
            (batch_size, hidden_dim).
        """
        # Add a length-1 sequence axis, embed, then drop that axis again.
        step_tokens = input[:, None]                           # (batch_size, 1)
        step_emb = self.dropout(self.embedding(step_tokens))   # (batch_size, 1, emb_dim)
        step_emb = step_emb[:, 0]                              # (batch_size, emb_dim)

        next_hidden = self.gru(step_emb, hidden)               # (batch_size, hidden_dim)
        scores = self.fc_out(next_hidden)                      # (batch_size, output_dim)
        return scores, next_hidden

In [8]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Module mapping ``src`` to an initial decoder hidden state.
        decoder: Module called as ``decoder(tokens, hidden)`` returning
            ``(scores, hidden)``; must expose ``fc_out.out_features``.
        device: Device on which the output buffer is allocated.
        teacher_forcing_ratio: Probability, per decoding step, of feeding the
            ground-truth token instead of the model's own prediction while the
            module is in training mode.
    """

    def __init__(self, encoder, decoder, device, teacher_forcing_ratio=0.5):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src, trg, teacher_forcing_ratio=None):
        """Decode ``trg.size(1) - 1`` steps, starting from ``trg[:, 0]``.

        Args:
            src: Source token indices, shape (batch_size, src_len).
            trg: Target token indices, shape (batch_size, trg_len). Column 0
                is used as the first decoder input.
            teacher_forcing_ratio: Optional per-call override. When ``None``,
                the constructor value is used in training mode and ``0.0`` in
                evaluation mode, so that after ``model.eval()`` the decoder is
                driven only by its own predictions and the result does not
                depend on random draws.

        Returns:
            Tensor of shape (batch_size, trg_len, output_dim). Position 0 is
            never written and stays all zeros.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = self.teacher_forcing_ratio if self.training else 0.0

        batch_size = src.size(0)
        trg_len = trg.size(1)
        output_dim = self.decoder.fc_out.out_features

        hidden = self.encoder(src)  # (batch_size, hidden_dim)

        # Allocate directly on the target device, with the model's dtype,
        # rather than creating a CPU float32 tensor and copying it.
        outputs = torch.zeros(
            batch_size, trg_len, output_dim, device=self.device, dtype=hidden.dtype
        )

        # The first decoder input is the first target column.
        step_input = trg[:, 0]  # (batch_size)

        for t in range(1, trg_len):
            output, hidden = self.decoder(step_input, hidden)  # (batch_size, output_dim)
            outputs[:, t, :] = output

            # Skip the random draw entirely when teacher forcing is disabled.
            teacher_force = (
                teacher_forcing_ratio > 0
                and torch.rand(1).item() < teacher_forcing_ratio
            )
            step_input = trg[:, t] if teacher_force else output.argmax(1)

        return outputs

In [11]:
# Source batch (batch_size=2, src_len=5); the shorter second row is padded with 0.
src = torch.tensor(
    [
        [1, 2, 3, 4, 5],
        [1, 2, 0, 0, 0],
    ]
).to('cuda' if torch.cuda.is_available() else 'cpu')

# Target batch (batch_size=2, trg_len=6); 1 stands for <sos>, 2 for <eos>,
# and the second row is padded with 0. It is placed on the same device as src.
trg = torch.tensor(
    [
        [1, 6, 7, 8, 9, 2],
        [1, 6, 7, 2, 0, 0],
    ]
).to(src.device)

In [12]:
# Hyperparameters
INPUT_DIM = 10    # source vocabulary size (e.g. 10 tokens)
OUTPUT_DIM = 10   # target vocabulary size (e.g. 10 tokens)
EMB_DIM = 8       # embedding dimension
HIDDEN_DIM = 16   # hidden-state size
DROPOUT = 0.1     # dropout probability
SEED = 42         # random seed for reproducible weight initialisation

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed before any module is constructed so that the randomly initialised
# weights are the same on every Restart & Run All.
torch.manual_seed(SEED)

# Build the encoder, the decoder, and the wrapping Seq2Seq model.
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, dropout=DROPOUT).to(device)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, dropout=DROPOUT).to(device)

model = Seq2Seq(encoder, decoder, device).to(device)

In [13]:
# Switch to evaluation mode so dropout is disabled.
model.eval()

# Run the forward pass without gradient tracking.
with torch.no_grad():
    output = model(src, trg)  # output: (batch_size, trg_len, output_dim)

# The decoder ends in a plain linear layer with no softmax, so these values
# are unnormalised scores (logits), not probabilities. Position 0 along the
# trg_len axis is never filled by the model and stays all zeros.
print("모델 출력 (로짓):")
print(output)

# Index of the highest-scoring token at each position.
predicted_tokens = output.argmax(2)  # (batch_size, trg_len)
print("\n예측된 토큰:")
print(predicted_tokens)

모델 출력 (확률 분포):
tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000],
         [-0.0891,  0.2669, -0.2985,  0.1132, -0.2887,  0.0848, -0.0970,
          -0.3222,  0.3882,  0.0369],
         [-0.1549,  0.3043, -0.2245,  0.0191, -0.2597, -0.0256, -0.1229,
          -0.2898,  0.2709,  0.0398],
         [ 0.0161,  0.2831, -0.1423,  0.0825, -0.1603, -0.0221, -0.0956,
          -0.2152,  0.1873,  0.0827],
         [ 0.0819,  0.1447, -0.0613, -0.0765, -0.0176,  0.2205,  0.1917,
          -0.1474,  0.0438,  0.0674],
         [ 0.0399,  0.2139, -0.0889, -0.0297, -0.0090,  0.1188,  0.1485,
          -0.1828,  0.1010,  0.1320]],

        [[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000],
         [ 0.0602,  0.2433, -0.2138,  0.1204, -0.2673,  0.2614, -0.0456,
          -0.1169,  0.2233,  0.1826],
         [-0.0713,  0.2868, -0.1873,  0.0269, -0.2374,  0.0684, -0.0952,
          -0.1577,  0.