In [1]:
from src.trainer import s2sTrainer
from src.data import DeEndataset, KoEndataset
from torch.utils.data import DataLoader
import torch
import sys
from src.seq2seq import Encoder, Decoder, seq2seq

train_data_paths = [
    "/home/jack/torchstudy/02Feb/0_datas/korean-english-park.train.ko",
    "/home/jack/torchstudy/02Feb/0_datas/korean-english-park.train.en"
]


TrainDataset = KoEndataset(train_data_paths)

encoder_config = {
    "emb_dim" : 1000,
    "hid_dim" : 1000,
    "n_layers" : 4,
    "input_dim" : len(TrainDataset.src_vocab),
    "pad_idx" : TrainDataset.src_vocab.pad_idx
}

decoder_config = {
    "emb_dim" : 1000,
    "hid_dim" : 1000,
    "n_layers" : 4,
    "pad_idx" : TrainDataset.dst_vocab.pad_idx,
    "output_dim" : len(TrainDataset.dst_vocab)
}

encoder = Encoder(**encoder_config)
decoder = Decoder(**decoder_config)
seq2seq = seq2seq(encoder, decoder)
TrainDataset.src_vocab.set_most_common_dict(6000)
TrainDataset.dst_vocab.set_most_common_dict(6000)
TrainDataset.src_vocab.build_index_dict()
TrainDataset.dst_vocab.build_index_dict()

In [2]:
seq2seq

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(6004, 1000)
    (rnn): LSTM(1000, 1000, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(6004, 1000, padding_idx=0)
    (rnn): LSTM(1000, 1000, num_layers=4, dropout=0.5)
    (fc_out): Linear(in_features=1000, out_features=6004, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [3]:
def translate_sentence(model, token, max_length=50):
   
    text_to_indices = token
    sentence_tensor = torch.LongTensor(text_to_indices)
    model.eval()
    # Build encoder hidden, cell state
    with torch.no_grad():
        embedded = model.encoder.dropout(model.encoder.embedding(sentence_tensor))
        _, (hidden, cell) = model.encoder.rnn(embedded)

        outputs = [2]

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]])

            with torch.no_grad():
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(1).item()

            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if output.argmax(1).item() == 3:
                break

    return outputs

In [4]:
seq2seq.load_state_dict(torch.load("seq2seq-model-ko-en.pt"))

<All keys matched successfully>

In [5]:
"/home/jack/torchstudy/02Feb/0_datas/korean-english-park.dev.ko"
"/home/jack/torchstudy/02Feb/0_datas/korean-english-park.dev.en"

'/home/jack/torchstudy/02Feb/0_datas/korean-english-park.dev.en'

In [72]:
src = "사우디 한국 미국"

In [73]:
test_sample = torch.Tensor([TrainDataset.src_vocab.stoi(src.lower(), option="seq2seq", reverse=True)]).long().view(-1,1)

In [74]:
" ".join([TrainDataset.dst_vocab.index_dict[i] for i in translate_sentence(seq2seq, test_sample)])

'<SOS> the government has said it will <UNK> the <UNK> of the korean government . <EOS>'

In [1]:
from torchtext.data.metrics import bleu_score

In [25]:
from nltk.translate.bleu_score import sentence_bleu
reference = [['1', '2', '3', '4']]
candidate = ['1', '2', '3']
bleu = sentence_bleu(reference, candidate, weights=(0.3,0.3,0.3))

print(f'reference: {reference}')
print(f'candidate: {candidate}')
print('BLEU:', bleu)


reference: [['1', '2', '3', '4']]
candidate: ['1', '2', '3']
BLEU: 0.7165313105737893


In [32]:
from torchtext.data.metrics import bleu_score
candidate_corpus = [['1', '3', '4', '5'], ['Another', 'Sentence']]
references_corpus = [[['2', '3', '4', '5'], ['Completely', 'Different']], [['No', 'Match']]]
bleu = sentence_bleu(reference, candidate, weights=(0.3,0.3,0.3))
print(bleu)

0.7165313105737893
