# 기계번역

### PyTorch
### Encoder & decoder (GRU-based model)

GRU (Gated Recurrent Unit)

In [3]:
# 필요한 라이브러리 설치
!pip install torch




In [11]:


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random
import re
from collections import Counter

# Sample Korean-English translation data (toy corpus; extend with more data).
# The two lists are paired by position: korean_sentences[i] is translated by
# english_sentences[i].
korean_sentences = [
    "안녕하세요",
    "오늘 날씨가 좋습니다",
    "나는 학생입니다",
    "이 책이 재미있어요",
    "내일 만나요",
    "고맙습니다",
    "죄송합니다",
    "어디에 가세요",
    "물 한 잔 주세요",
    "시간이 몇 시예요",
    "한국 음식을 좋아해요",
    "영화를 보러 갑시다",
    "공부를 열심히 해요",
    "친구와 만났어요",
    "집에 가고 싶어요"
]

# English side of the corpus, in the same order as korean_sentences.
english_sentences = [
    "hello",
    "the weather is nice today",
    "i am a student",
    "this book is interesting",
    "see you tomorrow",
    "thank you",
    "i am sorry",
    "where are you going",
    "give me a glass of water",
    "what time is it",
    "i like korean food",
    "let us go watch a movie",
    "study hard",
    "i met a friend",
    "i want to go home"
]

# Special token ids (they match the first three entries of Language.word2index)
SOS_token = 0  # start-of-sequence: the first decoder input
EOS_token = 1  # end-of-sequence: appended to every tensorised sentence
PAD_token = 2  # padding id; id 2 is also the fallback for out-of-vocabulary words

# Vocabulary container for one language
class Language:
    """Vocabulary for one language: word <-> index maps plus word counts.

    Ids 0, 1 and 2 are reserved for the special tokens "SOS", "EOS" and
    "PAD"; ordinary words are numbered from 3 in first-seen order.

    Attributes:
        name: Label identifying the language.
        word2index: Maps a token string to its integer id.
        word2count: Maps a token string to how many times it was added.
        index2word: Maps an integer id back to its token string.
        n_words: Vocabulary size, including the three special tokens.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1, "PAD": 2}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD"}
        self.n_words = 3

    def addSentence(self, sentence):
        """Add every whitespace-separated word of ``sentence`` to the vocabulary."""
        # split() with no argument discards the empty strings that split(' ')
        # yields for repeated, leading or trailing spaces, so "" never becomes
        # a vocabulary entry.
        for word in sentence.split():
            self.addWord(word)

    def addWord(self, word):
        """Register ``word`` (assigning the next free id if new) and count it."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            # The reserved tokens are present in word2index but start without
            # a count, so a plain `+= 1` would raise KeyError for them.
            self.word2count[word] = self.word2count.get(word, 0) + 1

# Data preprocessing
def preprocess_sentence(sentence):
    """Normalise a raw sentence for tokenisation.

    Lower-cases the text, removes the characters ``.``, ``!`` and ``?``, and
    collapses every run of whitespace into a single space with no leading or
    trailing whitespace.

    Args:
        sentence: Raw input string.

    Returns:
        The cleaned string; words are separated by exactly one space.
    """
    without_punctuation = re.sub(r"[.!?]", "", sentence.lower())
    # Whitespace is normalised *after* punctuation removal: deleting a
    # punctuation mark that was surrounded by spaces ("hello !") would
    # otherwise leave stray spaces that later tokenise as empty words.
    return " ".join(without_punctuation.split())

# Create one vocabulary per language, then fill both from the cleaned corpus
input_lang = Language('korean')
output_lang = Language('english')

# Cleaned [korean, english] sentence pairs, in corpus order
pairs = [
    [preprocess_sentence(ko), preprocess_sentence(en)]
    for ko, en in zip(korean_sentences, english_sentences)
]
for source_text, target_text in pairs:
    input_lang.addSentence(source_text)
    output_lang.addSentence(target_text)

print(f"한국어 어휘 크기: {input_lang.n_words}")
print(f"영어 어휘 크기: {output_lang.n_words}")

# Basic encoder model
class BasicEncoder(nn.Module):
    """Embed source token ids and run them through a single GRU layer.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of both the embeddings and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, input_seq):
        """Encode ``input_seq`` (integer token ids, batch dimension first).

        Returns:
            The ``(output, hidden)`` tuple produced by the GRU.
        """
        return self.gru(self.embedding(input_seq))

# Basic decoder model
class BasicDecoder(nn.Module):
    """Embed target token ids, advance a GRU and project onto the vocabulary.

    Args:
        hidden_size: Width of both the embeddings and the GRU hidden state.
        output_size: Target vocabulary size (embedding rows and output logits).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_token, hidden):
        """Run the decoder on ``input_token`` starting from ``hidden``.

        Args:
            input_token: Integer token id tensor. A 0-dim or 1-dim tensor is
                reshaped to a single row, i.e. ``(1, number_of_elements)``.
            hidden: GRU hidden state to start from.

        Returns:
            ``(logits, hidden)``: vocabulary scores for each input position and
            the updated GRU hidden state.
        """
        # Scalars and vectors both become one row; view(1, -1) turns a scalar
        # into shape (1, 1) and a length-n vector into shape (1, n).
        if input_token.dim() <= 1:
            input_token = input_token.view(1, -1)

        step_states, hidden = self.gru(self.embedding(input_token), hidden)
        return self.out(step_states), hidden

# Sentence -> tensor conversion helpers
def indexesFromSentence(lang, sentence):
    """Map each whitespace-separated word of ``sentence`` to its vocabulary id.

    Args:
        lang: Object exposing a ``word2index`` dict (word -> integer id).
        sentence: Text to encode.

    Returns:
        A list of integer ids, one per word. Words missing from
        ``lang.word2index`` are encoded as 2 (the PAD id). An empty or
        whitespace-only sentence yields an empty list.
    """
    fallback_id = 2  # PAD doubles as the out-of-vocabulary id
    # split() with no argument ignores repeated, leading and trailing
    # whitespace; split(' ') would emit empty strings there, each of which
    # would be encoded as a spurious PAD id inside the sequence.
    return [lang.word2index.get(word, fallback_id) for word in sentence.split()]

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a 1-D ``torch.long`` tensor of ids ending in EOS."""
    token_ids = [*indexesFromSentence(lang, sentence), EOS_token]
    return torch.tensor(token_ids, dtype=torch.long)

# Model initialisation: encoder and decoder share the same hidden width
hidden_size = 256
encoder = BasicEncoder(input_size=input_lang.n_words, hidden_size=hidden_size)
decoder = BasicDecoder(hidden_size=hidden_size, output_size=output_lang.n_words)

# Training function
def train_pair(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one teacher-forced optimisation step on a single sentence pair.

    Args:
        input_tensor: Source token ids; a leading batch dimension is added
            here before the encoder is called (expected to be 1-D).
        target_tensor: Target token ids, indexed along dimension 0; every
            position is used as a training label.
        encoder: Callable as ``encoder(input)`` returning ``(output, hidden)``.
        decoder: Callable as ``decoder(token, hidden)`` returning
            ``(logits, hidden)``.
        encoder_optimizer: Optimiser holding the encoder parameters.
        decoder_optimizer: Optimiser holding the decoder parameters.
        criterion: Loss callable taking ``(logits, target)``.

    Returns:
        float: The summed per-step loss divided by the target length.

    Raises:
        ValueError: If ``target_tensor`` is empty (there would be nothing to
            back-propagate and the average would divide by zero).
    """
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Fail early with a clear message, before gradients are zeroed.
        raise ValueError("target_tensor must contain at least one token")

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Encoder pass: only its final hidden state is needed; it becomes the
    # decoder's initial hidden state.
    _, decoder_hidden = encoder(input_tensor.unsqueeze(0))  # [1, seq_len]

    # Decoding starts from SOS, created on the same device as the targets.
    decoder_input = torch.tensor([SOS_token], device=target_tensor.device)

    loss = 0

    # Teacher forcing: the ground-truth token, not the model's own
    # prediction, is fed as the next decoder input.
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        step_logits = decoder_output.view(-1, decoder_output.size(-1))
        loss += criterion(step_logits, target_tensor[di:di + 1])
        decoder_input = target_tensor[di]

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

# Training configuration: cross-entropy loss and one plain SGD optimiser per
# model, both using the same learning rate
learning_rate = 0.01
criterion = nn.CrossEntropyLoss()
encoder_optimizer, decoder_optimizer = (
    optim.SGD(model.parameters(), lr=learning_rate) for model in (encoder, decoder)
)

# Training run
print("번역 모델 훈련 시작...")

# Tensorise the corpus once instead of on every epoch: the sentence pairs do
# not change between epochs and train_pair does not modify its input tensors
# in place, so the same tensors can be reused for all 1000 passes.
training_tensors = [
    (tensorFromSentence(input_lang, source_text), tensorFromSentence(output_lang, target_text))
    for source_text, target_text in pairs
]

for epoch in range(1000):
    # Sum of the per-pair average losses over this epoch
    total_loss = 0
    for input_tensor, target_tensor in training_tensors:
        loss = train_pair(input_tensor, target_tensor, encoder, decoder,
                          encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss

    # Report progress every 200 epochs
    if epoch % 200 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

# Translation function
def translate(sentence, max_length=20):
    """Translate ``sentence`` with the module-level encoder/decoder, greedily.

    At each step the highest-scoring output token is chosen and fed back as
    the next decoder input. Decoding stops at the first EOS token or after
    ``max_length`` steps, whichever comes first.

    Args:
        sentence: Source sentence, tokenised with ``input_lang``.
        max_length: Maximum number of decoding steps.

    Returns:
        The decoded words joined by single spaces (EOS is not included).
    """
    with torch.no_grad():
        source_ids = tensorFromSentence(input_lang, sentence).unsqueeze(0)  # [1, seq_len]
        _, decoder_hidden = encoder(source_ids)

        decoder_input = torch.tensor([SOS_token])
        decoded_words = []

        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, best = decoder_output.topk(1)
            best_id = best.item()

            if best_id == EOS_token:
                break

            decoded_words.append(output_lang.index2word[best_id])
            # Feed the chosen token back in as the next input.
            decoder_input = best.squeeze().detach()

    return ' '.join(decoded_words)

# Smoke test: translate a few sentences that also appear in the training corpus
print("\n=== 번역 결과 ===")
test_sentences = [
    "안녕하세요",
    "고맙습니다",
    "나는 학생입니다",
    "한국 음식을 좋아해요",
]

divider = "-" * 50
for korean in test_sentences:
    english = translate(preprocess_sentence(korean))
    # One print call emitting the source, the translation and a divider line
    print(f"한국어: {korean}", f"번역: {english}", divider, sep="\n")

한국어 어휘 크기: 39
영어 어휘 크기: 47
번역 모델 훈련 시작...
Epoch 0, Loss: 57.2339
Epoch 200, Loss: 0.1105
Epoch 400, Loss: 0.0465
Epoch 600, Loss: 0.0288
Epoch 800, Loss: 0.0206

=== 번역 결과 ===
한국어: 안녕하세요
번역: hello
--------------------------------------------------
한국어: 고맙습니다
번역: thank you
--------------------------------------------------
한국어: 나는 학생입니다
번역: i am a student
--------------------------------------------------
한국어: 한국 음식을 좋아해요
번역: i like korean food
--------------------------------------------------
