In [1]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
new_tokenizer = Tokenizer.from_file("tokenizer.json")

In [4]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)



In [5]:
wrapped_tokenizer("‚ñ∫ ‚óä‚óí‚ñ¥ ‚ñΩ‚ó†‚óì‚ó†‚ó≥‚ó†‚ñº‚ó† ‚ñ±‚óÇ‚ñ±‚óó‚ñª‚óß‚ñª ‚óï‚ó´‚óÄ‚óó‚ñµ")

{'input_ids': [2, 98, 8944, 28498, 6738, 27852, 19370, 1046, 3287, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import sys
import numpy as np
import torch
from tqdm.notebook import tqdm
from torch.optim import Adam
import os
import subprocess

from matplotlib import pyplot as plt
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
import shutil
import random
import itertools
from datasets import load_metric

bleu_metric = load_metric("bleu")

  bleu_metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [7]:
import json
train = []
with open('train') as f:
    for line in f:
        train.append(json.loads(line.strip()))
val = []
with open('val') as f:
    for line in f:
        val.append(json.loads(line.strip()))
with open('test_no_reference') as f:
  test = f.readlines()

In [8]:
dst=[]
src=[]
for i in range(len(train)):
    dst.append(train[i]['dst'])
    src.append(train[i]['src'])
import spacy
en_nlp = spacy.load("en_core_web_sm")
from datasets import load_dataset
train_zet = load_dataset('json', data_files='train', split="train")
valid_zet = load_dataset('json', data_files='val' , split="train")
test_zet = load_dataset('json', data_files='test_no_reference' , split="train")
train = train_zet
valid = valid_zet
pad_token = "<pad>"
from transformers import AlbertTokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')




In [9]:
def tokenize_example(example, en_nlp, wrapped_tokenizer, max_length, lower=True, sos_token="<sos>", eos_token="<eos>"):
    en_tokens = [token for token in en_nlp.tokenizer(example["dst"])][:max_length]
    de_tokens = [token for token in wrapped_tokenizer.tokenize(example["src"])][:max_length]
    if lower:
        en_tokens = [str(token).lower() for token in en_tokens]
        #de_tokens = [token.lower() for token in de_tokens]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    cls_token="[CLS]"
    sep_token="[SEP]"
    de_tokens = [cls_token] + de_tokens + [sep_token]
    
    return {"dst": en_tokens, "src": de_tokens}

In [10]:
max_length = 25
lower = True
sos_token = "<sos>"
eos_token = "<eos>"
fn_kwargs = {
    "en_nlp": en_nlp,
    "wrapped_tokenizer": wrapped_tokenizer,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
train_data = train_zet.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_zet.map(tokenize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
train_data['dst'][1]

['<sos>',
 'he',
 'would',
 'need',
 'to',
 'repeat',
 'his',
 'vows',
 'in',
 'the',
 'land',
 'of',
 'the',
 'living',
 'and',
 'drink',
 'from',
 'the',
 'wine',
 'of',
 'ages',
 '.',
 '<eos>']

In [12]:
def numericalize_example(example, tokenizer, wrapped_tokenizer):
    for i in range(len(example)):

        en_ids = (tokenizer(example["dst"]))
        de_ids = wrapped_tokenizer(example["src"])
        
    return {"en_ids": en_ids, "de_ids": de_ids}

In [13]:
fn_kwargs= {
    "tokenizer": tokenizer,
    "wrapped_tokenizer": wrapped_tokenizer}
train_num = train_zet.map(numericalize_example,  fn_kwargs=fn_kwargs)
valid_num = valid_zet.map(numericalize_example,  fn_kwargs=fn_kwargs)

In [53]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_num = train_num.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_num = valid_num.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)
def get_collate_fn(pad_index=1):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"]['input_ids'] for example in batch]
        batch_de_ids = [example["de_ids"]['input_ids'] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader
batch_size = 4
pad_index=0
train_data_loader = get_data_loader(train_num, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_num, batch_size, pad_index)
class Encoder(nn.Module):
    def __init__(
        self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout
    ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # outputs are always from the last layer
        # hidden [-2, :, : ] is the last of the forwards RNN
        # hidden [-1, :, : ] is the last of the backwards RNN
        # initial decoder hidden is final hidden state of the forwards and backwards
        # encoder RNNs fed through a linear layer
        hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        return outputs, hidden


In [54]:
len(valid_data_loader)

125

In [55]:
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim
        )
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_length = encoder_outputs.shape[0]
        # repeat decoder hidden state src_length times
        hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # hidden = [batch size, src length, decoder hidden dim]
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src length, decoder hidden dim]
        attention = self.v_fc(energy).squeeze(2)
        # attention = [batch size, src length]
        return torch.softmax(attention, dim=1)

In [37]:
class Decoder(nn.Module):
    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch size, src length]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src length]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, encoder hidden dim * 2]
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, encoder hidden dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [1, batch size, (encoder hidden dim * 2) + embedding dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output = [seq length, batch size, decoder hid dim * n directions]
        # hidden = [n layers * n directions, batch size, decoder hid dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, decoder hidden dim]
        # hidden = [1, batch size, decoder hidden dim]
        # this also means that output == hidden
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]
        return prediction, hidden.squeeze(0), a.squeeze(1)

In [38]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, decoder hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [None]:
input_dim = len(wrapped_tokenizer)
output_dim = len(tokenizer)
encoder_embedding_dim = 4
decoder_embedding_dim = 4
encoder_hidden_dim = 8
decoder_hidden_dim = 8
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention,
)

model = Seq2Seq(encoder, decoder, device).to(device)

NameError: name 'wrapped_tokenizer' is not defined

: 

In [247]:
def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5000, 4)
    (rnn): GRU(4, 8, bidirectional=True)
    (fc): Linear(in_features=16, out_features=8, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_fc): Linear(in_features=24, out_features=8, bias=True)
      (v_fc): Linear(in_features=8, out_features=1, bias=False)
    )
    (embedding): Embedding(1500, 4)
    (rnn): GRU(20, 8)
    (fc_out): Linear(in_features=28, out_features=1500, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [248]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 71,236 trainable parameters


In [249]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters())
criterion = nn.CrossEntropyLoss()
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        
        src = batch["de_ids"].to(device)
        
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, 0)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    print(epoch_loss / len(data_loader))    
    return epoch_loss / len(data_loader)
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        
        for i, batch in enumerate(data_loader):
            
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    print(epoch_loss / len(data_loader))
    return epoch_loss / len(data_loader)

In [250]:
import tqdm
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut3-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [229]:
model.load_state_dict(torch.load("tut3-model.pt"))

  model.load_state_dict(torch.load("tut3-model.pt"))


<All keys matched successfully>

In [2]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        
        ids = wrapped_tokenizer(sentence)['input_ids']
     
        
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden = model.encoder(tensor)
        inputs = [1]
        attentions = torch.zeros(max_output_length, 1, len(ids))
        inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
        
        for i in range(max_output_length):

            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, attention = model.decoder(
                inputs_tensor, hidden, encoder_outputs
            )
     
            attentions[i] = attention
            predicted_token = output.argmax(-1).item()
           
            inputs.append(predicted_token)
        
            if predicted_token == [1]:
                break
        en_tokens = tokenizer.decode(inputs)
    return en_tokens, sentence, attentions[: len(en_tokens) - 1]

In [231]:
sentence = valid_zet['src'][10]
sentence

'‚ó≤‚ñ®‚ó´ ‚ñΩ‚ñ™‚ñ±‚óà‚ó† ‚óÄ‚óó‚óì ‚óà‚ñ©‚ñ¢‚ó™‚ñ¶‚ñ±‚ñ¥‚ñ¶‚ó™‚ñ¶ ‚ó™‚ñ´‚ñ®‚ó´‚ñ¶‚ñ±‚óó‚óê‚ñ¥ 12‚ñµ ‚ñ®‚ñ¥‚ñ¢ ‚ñ®‚ó†‚ñ´‚ó¨‚ñ±‚ó†‚óì‚ó†‚ñ® ‚óì‚ó™‚ñ®‚óÇ‚óì ‚ñ®‚ñ™‚óì‚ó†‚ñ¶ ‚óÑ‚óó‚ñº‚ñ®‚ñ¥‚ñ±‚óû‚óß‚ñ¶‚ñ† ‚ñº‚ó®‚óé‚ó†‚óì‚ñ´‚ñ¥‚óû‚óó ‚óï‚ó™‚óì‚óâ‚ñ¥‚ñ®‚ñ±‚ó™‚óí‚ñ´‚ó´‚óì‚óó‚ñ±‚ó™‚ñ¶ ‚óç‚óÇ‚ó®‚óì‚óÄ‚ó†‚ñ±‚ñ± ‚óö‚ó™ ‚óà‚ó£‚óì‚ñ´‚ñ±‚ó≠ (‚óç‚óß‚ó®‚óì‚óû‚óÇ‚óé‚ñ¥) ‚óÇ‚ñΩ‚ñæ‚ñ¶‚ñ±‚ó†‚óì‚ñ™‚ñ¶‚óà‚ó† ‚ó¢‚ó†‚ñª‚ñ´‚ó†‚ñ¶ ‚ó°‚óó‚óé ‚óå‚ó®‚óì‚ó≥‚ñ® ‚ñ´‚ó†‚óì‚ó†‚óç‚ñ™‚ñ¶‚óà‚ó†‚ñ¶ ‚óÇ‚ñΩ‚ó®‚ñ¶‚óà‚ó†‚ñ¶ ‚ó†‚ñ±‚ó¨‚ñ¶‚óà‚ñ™‚ñµ'

In [232]:
translation, sentence_tokens, attention = translate_sentence(
    sentence,
    model,
    en_nlp,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
)

In [233]:
translation

'<unk> the, the, the, the, the, the, the, the, the<pad><pad><pad><pad><pad><pad><pad><pad>'

In [234]:
sentence_tokens

'‚ó≤‚ñ®‚ó´ ‚ñΩ‚ñ™‚ñ±‚óà‚ó† ‚óÄ‚óó‚óì ‚óà‚ñ©‚ñ¢‚ó™‚ñ¶‚ñ±‚ñ¥‚ñ¶‚ó™‚ñ¶ ‚ó™‚ñ´‚ñ®‚ó´‚ñ¶‚ñ±‚óó‚óê‚ñ¥ 12‚ñµ ‚ñ®‚ñ¥‚ñ¢ ‚ñ®‚ó†‚ñ´‚ó¨‚ñ±‚ó†‚óì‚ó†‚ñ® ‚óì‚ó™‚ñ®‚óÇ‚óì ‚ñ®‚ñ™‚óì‚ó†‚ñ¶ ‚óÑ‚óó‚ñº‚ñ®‚ñ¥‚ñ±‚óû‚óß‚ñ¶‚ñ† ‚ñº‚ó®‚óé‚ó†‚óì‚ñ´‚ñ¥‚óû‚óó ‚óï‚ó™‚óì‚óâ‚ñ¥‚ñ®‚ñ±‚ó™‚óí‚ñ´‚ó´‚óì‚óó‚ñ±‚ó™‚ñ¶ ‚óç‚óÇ‚ó®‚óì‚óÄ‚ó†‚ñ±‚ñ± ‚óö‚ó™ ‚óà‚ó£‚óì‚ñ´‚ñ±‚ó≠ (‚óç‚óß‚ó®‚óì‚óû‚óÇ‚óé‚ñ¥) ‚óÇ‚ñΩ‚ñæ‚ñ¶‚ñ±‚ó†‚óì‚ñ™‚ñ¶‚óà‚ó† ‚ó¢‚ó†‚ñª‚ñ´‚ó†‚ñ¶ ‚ó°‚óó‚óé ‚óå‚ó®‚óì‚ó≥‚ñ® ‚ñ´‚ó†‚óì‚ó†‚óç‚ñ™‚ñ¶‚óà‚ó†‚ñ¶ ‚óÇ‚ñΩ‚ó®‚ñ¶‚óà‚ó†‚ñ¶ ‚ó†‚ñ±‚ó¨‚ñ¶‚óà‚ñ™‚ñµ'

predict

In [235]:
trans=[]
sente=[]
for i in tqdm.tqdm(test):
    translation_i, sentence_tokens_i, attention_i=translate_sentence(
    i['src'],
    model,
    en_nlp,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
)

    
    trans.append(translation_i)
    sente.append(sentence_tokens_i)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [01:25<00:00, 11.71it/s]


In [236]:
trans

['<unk> the, the the the the the the the the the, the, the<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<unk>,,,,,,,,,,,,,,,,,,,,,,,,,',
 '<unk>,,,,,,,,,,,,,,,,,,,,,,,,,',
 '<unk> the the the the the the the the the the the the<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<unk> the the the the the the the the the the the, the<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<unk>the, the, the, the, the, the, the, the, the, the, the,,,,',
 '<unk>,,,,,,,,,,,,,,,,,,,,,,,,,',
 '<unk>,, the, the, the,,,,,,,,,,,,,,,,,,',
 '<unk> the the the the the the the the the the the the the the<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<unk>, the, the, the, the, the, the, the,,,,,,,,,<pad><pad>',
 '<unk> the the the the the the the the the the the the the<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<unk> the the the the the the the the the the the the the, the, the, the<pad><pad><pad><pad><pad><pad>',
 '<unk>,,,,,,,,,,

In [237]:
sente

['‚ó≤‚ñ¶‚ó†‚ñ¶‚ó¨‚ñ¶‚ñ† ‚óâ‚óó‚ñ¢‚óï‚óó ‚óç‚óó‚ñ±‚óé ‚ñΩ‚ó†‚ñΩ‚ñ™‚ñ¶‚ó† ‚óï‚ñ¥‚óâ‚óó‚ñ¶‚ñº‚ñ¥ ‚óÄ‚óó‚óì‚óâ‚óß‚ñ® ‚óé‚ñ¥‚óû‚ó†‚ñ∏ ‚ó†‚ñ±‚óà‚ñ™‚ñ® ‚óö‚ó™ ‚óÄ‚ó® ‚óé‚ó™‚óû‚ó†‚ñ∏‚ñ±‚ó†‚óì‚ó¨‚ñ¶ ‚óÄ‚ó†‚ñ¢‚ñ™‚ñ±‚ó†‚óì‚ñ™ ‚ñª‚ó™‚ñ® ‚óà‚óÇ‚óû‚ñ´‚óâ‚ó† ‚óà‚ñ¥‚óê‚ó´‚ñ±‚óà‚óó‚ñµ',
 '‚ñØ‚ñ¥‚ñ• ‚óü‚óß‚óì‚ñ®‚ñ±‚ó® ‚óÄ‚ó´‚óì ‚óà‚ó†‚óà‚ó¨‚ñ† ‚óâ‚óÇ‚ñº‚ó®‚óê‚ó®‚ñ¶ ‚ó†‚ñ¶‚ñ¶‚ó™‚óû‚óó‚ñ¶‚óó‚ñ¶ ‚ñΩ‚ó†‚ñ¢‚óà‚ó¨‚óê‚ñ™ ‚óö‚ó™ ‚ó≥‚ó†‚ñ¶‚ñ±‚ñ™‚óí‚ñ±‚ñ™‚ñ®‚ñ±‚ó† ‚ñ®‚ñ¥‚ñ¶‚óà‚óó‚óû‚óó‚ñ¶‚ñ¥ ‚óï‚ó£‚ñ¶‚óà‚ñ¥‚óì‚óà‚óó‚óê‚ó´‚ñ† "‚óÄ‚ó´‚óì ‚óû‚ó´‚ó≥‚ó†‚ñ∑‚óó ‚óà‚ó†‚ñ∑‚ó†" ‚ó≥‚ó†‚ñ¢‚ó†‚ñ¶ ‚óÄ‚óó‚óì ‚óé‚ñ¥‚óû‚ó†‚ñ∏‚óà‚ó†‚ñ¶ ‚óû‚óß‚ñ¶‚óì‚ó† ‚ñ®‚ó™‚ñ¶‚óà‚ó´‚óû‚óó‚ñ¶‚ó™ ‚ó†‚ó≥‚óì‚ñ™‚óé‚ñº‚ó¨‚ñ±‚ó¨‚ñ® ‚ó≥‚ó†‚ñª‚ñ™‚ñ±‚óà‚ñ™‚óê‚ñ™ ‚óö‚ó™ ‚óó‚óí‚ó´‚ñ¶‚óà‚ó™‚ñ¶ ‚ñ®‚óÇ‚óö‚ó®‚ñ±‚óà‚ñæ‚óê‚ñæ ‚ó´‚óâ‚ó´‚ñ¶ ‚óâ‚óó‚óç‚ñ´‚ñ¥ ‚óà‚ó†‚óö‚ó† ‚ó†‚óâ‚ó¨‚ñΩ‚óÇ‚óì‚ñµ',
 '‚ó°‚ó†‚ñª‚óß‚ñ¶ ‚óÇ‚ñ´‚óß‚óé‚óÇ‚óÄ‚óó‚ñ± ‚óç‚óó‚óì‚óé‚ó†‚óû‚ó¨ ‚ó†‚ñ¶‚ñ±‚ó†‚óí‚óé‚ó†‚óû‚ñ™‚ñ¢ ‚óù‚óì‚ñ¥‚ñπ‚óó‚ñ´ ‚óà‚ó®‚óì‚ñæ‚óé‚ñæ‚ñ¶‚óà‚ó† ‚óû‚ñ™‚ñ¶‚ó¨‚óì‚óà‚ó† ‚óÄ‚ó™‚ñ®‚ñ

In [183]:
import jsonlines
with jsonlines.open('output.jsonl', mode='w') as writer:
    for ds, sr in zip(trans, sente):
        writer.write({"dst":ds, "src":sr})