## Attention

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

### Attention 가중치 계산

In [10]:
def attention(query, key, value):
    # 1. Attention Score 계산 (Query와 Key의 유사도 측정)
    # - transpose : 마지막 두 차원 교환(전치)
    scores = torch.matmul(query, key.transpose(-2, -1))   
    print("Scores shape:", scores.shape)
    
    # 2. Score 정규화 (Softmax) => 가중치 계산
    attn_weights = F.softmax(scores, dim=-1)   
    print("Attention Weights shape:", attn_weights.shape)
    
    # 3. Context Vector 계산 (가중합)
    context_vector = torch.matmul(attn_weights, value)   
    print("Context Vector shape:", context_vector.shape)
    
    return context_vector

In [11]:
# 토큰화 및 임베딩 결과

# 임의로 토큰화와 임베딩을 흉내낸 예시
vocab ={
    "나는":0,
    "학원에":1,
    "간다":2,
    "<pad>":3
}

vocab_size = len(vocab)
EMBEDDING_DIM = 4 

In [12]:
# 입력 문장

inputs = ["나는", "학원에", "간다"]
input_ids = torch.tensor([[vocab[word] for word in inputs]])   

input_ids

tensor([[0, 1, 2]])

In [13]:
# 1. 임베딩 레이어 생성
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=EMBEDDING_DIM)
inputs_embedded = embedding_layer(input_ids) 

# 2. 선형 변환 -> Query, Key, Value 생성
HIDDEN_DIM = 4
W_query = nn.Linear(EMBEDDING_DIM, HIDDEN_DIM)
W_key = nn.Linear(EMBEDDING_DIM, HIDDEN_DIM)
W_value = nn.Linear(EMBEDDING_DIM, HIDDEN_DIM)

input_query = W_query(inputs_embedded)
input_key = W_key(inputs_embedded)
input_value = W_value(inputs_embedded)

input_query.shape, input_key.shape, input_value.shape

(torch.Size([1, 3, 4]), torch.Size([1, 3, 4]), torch.Size([1, 3, 4]))

In [14]:
context_vector = attention(input_query, input_key, input_value)
context_vector

Scores shape: torch.Size([1, 3, 3])
Attention Weights shape: torch.Size([1, 3, 3])
Context Vector shape: torch.Size([1, 3, 4])


tensor([[[ 0.6187, -0.8830,  0.1245, -0.6070],
         [ 0.2635, -0.1471, -0.1191,  0.0305],
         [ 0.4930, -0.6769,  0.0371, -0.3998]]], grad_fn=<UnsafeViewBackward0>)

### Seq2Seq 모델에 어텐션 추가

In [15]:
class Attention(nn.Module):
    def __init__(self,  hidden_size): 
        super(Attention, self).__init__() 
        self.attn = nn.Linear(hidden_size * 2, hidden_size) # Query와 Key를 결합하여 점수 계산
        self.v = nn.Parameter(torch.rand(hidden_size)) # 학습 가능한 벡터 파라미터
    
    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.shape[1]
        hidden_expanded = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden_expanded, encoder_outputs), dim=2)))
        attention_scores = torch.sum(self.v * energy, dim=2)
        attn_weights = F.softmax(attention_scores, dim=1)

        context_vector = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context_vector, attn_weights

In [18]:
class Seq2SeqWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Seq2SeqWithAttention, self).__init__()
        self.encoder = nn.GRU(input_dim, hidden_dim, batch_first=True) 
        self.decoder = nn.GRU(input_dim, hidden_dim, batch_first=True) 
        self.attention = Attention(hidden_dim) 
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.decoder_input_transform = nn.Linear(input_dim, hidden_dim)

    def forward(self, encoder_input, decoder_input):
        encoder_outputs, hidden = self.encoder(encoder_input)
        context_vector, _ = self.attention(hidden[-1], encoder_outputs)
        decoder_input = self.decoder_input_transform(decoder_input)
        output, _ = self.decoder(decoder_input, hidden)
        combined = torch.cat((output, context_vector.unsqueeze(1)), dim=2)
        return self.fc(combined)

In [19]:
batch_size = 1
seq_len = 5
input_dim = 10
hidden_dim = 20
output_dim = 15

encoder_input = torch.randn(batch_size, seq_len, input_dim)
decoder_input = torch.randn(batch_size, 1, input_dim)

model = Seq2SeqWithAttention(input_dim, hidden_dim, output_dim)
model(encoder_input, decoder_input)


RuntimeError: input.size(-1) must be equal to input_size. Expected 10, got 20