In [2]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
new_tokenizer = Tokenizer.from_file("tokenizer_Unigram_zetas.json")

In [3]:
new_tokenizer_en = Tokenizer.from_file("tokenizer_Unigram_english.json")

In [4]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)



In [5]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer_eng = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer_en,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [6]:
wrapped_tokenizer_eng('[SEP]')

{'input_ids': [35001, 1, 0], 'token_type_ids': [0, 0, 2], 'attention_mask': [1, 1, 1]}

In [7]:
wrapped_tokenizer("► ◊◒▴ ▽◠◓◠◳◠▼◠ ▱◂▱◗▻◧▻ ◕◫◀◗▵")

{'input_ids': [70, 8, 8221, 1181, 58, 139, 12220, 84, 3872, 100, 7521, 1, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import sys
import numpy as np
import torch
from tqdm.notebook import tqdm
from torch.optim import Adam
import os
import subprocess

from matplotlib import pyplot as plt
from datetime import datetime

import shutil
import random
import itertools

In [9]:
import json
train = []
with open('train') as f:
    for line in f:
        train.append(json.loads(line.strip()))
val = []
with open('val') as f:
    for line in f:
        val.append(json.loads(line.strip()))
with open('test_no_reference') as f:
    test = f.readlines()

In [10]:
len(train)

300000

In [11]:
dst=[]
src=[]
for i in range(len(train)):
    dst.append(train[i]['dst'])
    src.append(train[i]['src'])
import spacy
en_nlp = spacy.load("en_core_web_sm")
from datasets import load_dataset
train_zet = load_dataset('json', data_files='train', split="train")
valid_zet = load_dataset('json', data_files='val' , split="train")
test_zet = load_dataset('json', data_files='test_no_reference' , split="train")
train = train_zet
valid = valid_zet
pad_token = "<pad>"




In [12]:
def tokenize_example(example, wrapped_tokenizer_eng, wrapped_tokenizer, max_length, lower=True, sos_token="<sos>", eos_token="<eos>"):
    en_tokens = [token for token in wrapped_tokenizer_eng.tokenize(example["dst"])][:max_length]
    de_tokens = [token for token in wrapped_tokenizer.tokenize(example["src"])][:max_length]

    cls_token="[CLS]"
    sep_token="[SEP]"
    en_tokens = [cls_token] + en_tokens + [sep_token]

    de_tokens = [cls_token] + de_tokens + [sep_token]
    
    return {"dst": en_tokens, "src": de_tokens}

In [10]:
max_length = 25
lower = True
sos_token = "<sos>"
eos_token = "<eos>"
fn_kwargs = {
    "wrapped_tokenizer_eng": wrapped_tokenizer_eng,
    "wrapped_tokenizer": wrapped_tokenizer,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
train_data = train_zet.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_zet.map(tokenize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 300500/300500 [00:59<00:00, 5045.62 examples/s]
Map: 100%|██████████| 1121/1121 [00:00<00:00, 2929.22 examples/s]


In [77]:
train_data['dst'][1]

['[CLS]',
 '▁He',
 '▁would',
 '▁need',
 '▁to',
 '▁repeat',
 '▁his',
 '▁vows',
 '▁in',
 '▁the',
 '▁land',
 '▁of',
 '▁the',
 '▁living',
 '▁and',
 '▁drink',
 '▁from',
 '▁the',
 '▁wine',
 '▁of',
 '▁ages',
 '.',
 '[SEP]']

In [11]:
def numericalize_example(example, wrapped_tokenizer_eng, wrapped_tokenizer):
    for i in range(len(example)):

        en_ids = (wrapped_tokenizer_eng(example["dst"]))
        de_ids = wrapped_tokenizer(example["src"])
        
    return {"en_ids": en_ids, "de_ids": de_ids}

In [12]:
fn_kwargs= {
    "wrapped_tokenizer_eng": wrapped_tokenizer_eng,
    "wrapped_tokenizer": wrapped_tokenizer}
train_num = train_zet.map(numericalize_example,  fn_kwargs=fn_kwargs)
valid_num = valid_zet.map(numericalize_example,  fn_kwargs=fn_kwargs)

Map: 100%|██████████| 300500/300500 [02:11<00:00, 2288.44 examples/s]
Map: 100%|██████████| 1121/1121 [00:00<00:00, 1592.94 examples/s]


In [13]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_num = train_num.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_num = valid_num.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)
def get_collate_fn(pad_index=1):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"]['input_ids'] for example in batch]
        batch_de_ids = [example["de_ids"]['input_ids'] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader



In [97]:
train_num

Dataset({
    features: ['dst', 'src', 'en_ids', 'de_ids'],
    num_rows: 300500
})

In [None]:

batch_size = 64
pad_index=35002
train_data_loader = get_data_loader(train_num, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_num, batch_size, pad_index)
class Encoder(nn.Module):
    def __init__(
        self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout
    ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, hidden = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
        # outputs are always from the last layer
        # hidden [-2, :, : ] is the last of the forwards RNN
        # hidden [-1, :, : ] is the last of the backwards RNN
        # initial decoder hidden is final hidden state of the forwards and backwards
        # encoder RNNs fed through a linear layer
        hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        return outputs, hidden


In [15]:
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim
        )
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        batch_size = encoder_outputs.shape[1]
        src_length = encoder_outputs.shape[0]
        # repeat decoder hidden state src_length times
        hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # hidden = [batch size, src length, decoder hidden dim]
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
        # energy = [batch size, src length, decoder hidden dim]
        attention = self.v_fc(energy).squeeze(2)
        # attention = [batch size, src length]
        return torch.softmax(attention, dim=1)

In [16]:
class Decoder(nn.Module):
    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [batch size]
        # hidden = [batch size, decoder hidden dim]
        # encoder_outputs = [src length, batch size, encoder hidden dim * 2]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        a = self.attention(hidden, encoder_outputs)
        # a = [batch size, src length]
        a = a.unsqueeze(1)
        # a = [batch size, 1, src length]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs = [batch size, src length, encoder hidden dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        # weighted = [batch size, 1, encoder hidden dim * 2]
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, encoder hidden dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim=2)
        # rnn_input = [1, batch size, (encoder hidden dim * 2) + embedding dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output = [seq length, batch size, decoder hid dim * n directions]
        # hidden = [n layers * n directions, batch size, decoder hid dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, decoder hidden dim]
        # hidden = [1, batch size, decoder hidden dim]
        # this also means that output == hidden
        assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction = [batch size, output dim]
        return prediction, hidden.squeeze(0), a.squeeze(1)

In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use teacher forcing 75% of the time
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # encoder_outputs is all hidden states of the input sequence, back and forwards
        # hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)
        # outputs = [src length, batch size, encoder hidden dim * 2]
        # hidden = [batch size, decoder hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden state and all encoder hidden states
            # receive output tensor (predictions) and new hidden state
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, decoder hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [18]:
valid_data_loader

<torch.utils.data.dataloader.DataLoader at 0x788fb30fd520>

In [86]:
len(wrapped_tokenizer_eng)

35005

In [19]:
input_dim = len(wrapped_tokenizer)
output_dim = len(wrapped_tokenizer_eng)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_hidden_dim = 512
decoder_hidden_dim = 512
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [103]:
def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(35005, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_fc): Linear(in_features=1536, out_features=512, bias=True)
      (v_fc): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(35005, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=35005, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [104]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 87,119,805 trainable parameters


In [21]:
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=0.0002)
SCHEDULER_LAMBDA_PARAM = 0.95
lambda_scheduler = lambda x: SCHEDULER_LAMBDA_PARAM ** x
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_scheduler)
criterion = nn.CrossEntropyLoss()
def train_fn(
    model, data_loader,scheduler, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        
        src = batch["de_ids"].to(device)
        
        trg = batch["en_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, 0)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        #scheduler.step()
    print(epoch_loss / len(data_loader))    
    return epoch_loss / len(data_loader)
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        
        for i, batch in enumerate(data_loader):
            
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    print(epoch_loss / len(data_loader))
    return epoch_loss / len(data_loader)

In [None]:
import tqdm
n_epochs = 50
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader, 
        scheduler,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    valid_loss = evaluate_fn(
        model,
        valid_data_loader,
        criterion,
        device,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut3-model_1.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/50 [00:00<?, ?it/s]

0.6838146885143025
0.8546394921011395


  2%|▏         | 1/50 [16:24<13:23:58, 984.46s/it]

	Train Loss:   0.684 | Train PPL:   1.981
	Valid Loss:   0.855 | Valid PPL:   2.351
0.6798759211279972
0.8433541390630934


  4%|▍         | 2/50 [32:49<13:07:43, 984.66s/it]

	Train Loss:   0.680 | Train PPL:   1.974
	Valid Loss:   0.843 | Valid PPL:   2.324


In [None]:
model.load_state_dict(torch.load("tut3-model_1.pt"))

In [23]:
def translate_sentence(
    sentence,
    model,
    wrapped_tokenizer_eng,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=50,
):
    model.eval()
    with torch.no_grad():
        
        ids = wrapped_tokenizer(sentence)['input_ids']
     
        
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden = model.encoder(tensor)
        prev=35003
        prev_prev=1
        inputs = [35003]
        attentions = torch.zeros(max_output_length, 1, len(ids))
        inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
        
        for i in range(max_output_length):

            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, attention = model.decoder(
                inputs_tensor, hidden, encoder_outputs
            )
     
            attentions[i] = attention
            predicted_token = output.argmax(-1).item()
            if predicted_token==prev:
                continue
            if predicted_token==1:
                continue
            
            if predicted_token == 0:
                break    
            inputs.append(predicted_token)
            prev_prev = prev
            prev = predicted_token
            
            
            
        en_tokens = wrapped_tokenizer_eng.decode(inputs)
    return en_tokens, sentence, attentions[: len(en_tokens) - 1]

In [22]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
    beam_width=3,
    max_output_length=50,
):
    model.eval()
    with torch.no_grad():
        
        ids = wrapped_tokenizer(sentence)['input_ids']
     
        sos_token ='[CLS]'
        eos_token = '<|endoftext|>'
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden = model.encoder(tensor)
        beams = [(wrapped_tokenizer([sos_token])['input_ids'][0], 0.0, hidden)] 
        inputs = [35003]
        attentions = torch.zeros(max_output_length, 1, len(ids))
        inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
        prev=35003
       
        for i in range(max_output_length):
            new_beams=[]
         
            for b_idx, (sequence, score, hidden_state) in enumerate(beams):
                
                if sequence[-1] == eos_token:
                    new_beams.append((sequence, score, hidden_state))
                    attentions[b_idx] = attentions[i - 1, b_idx, :]
                    continue
                #print(beams)
                last_token_tensor = torch.LongTensor([sequence[-1]]).to(device)
                output, hidden_state, attention = model.decoder(
                    last_token_tensor, hidden_state, encoder_outputs
                )
                attentions[b_idx] = attention
                
                topk_probs, topk_indices = output.topk(beam_width)
                for prob, token in zip(topk_probs.squeeze().tolist(), topk_indices.squeeze().tolist()):
                    if len(sequence) > 0 and token == sequence[-1]:
                        continue
                    new_sequence = sequence + [token]
                    new_score = score + torch.log(torch.tensor(prob))
                    new_beams.append((new_sequence, new_score.item(), hidden_state))
              
            
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
            #print(beams)
            # Stop early if all beams have ended
            if all(seq[-1] == eos_token for seq, _, _ in beams):
                break
        
            best_sequence, best_score, _ = beams[0]
            
        en_tokens = wrapped_tokenizer_eng.decode(best_sequence)
        #    inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
        #    output, hidden, attention = model.decoder(
        #        inputs_tensor, hidden, encoder_outputs
        #    )
     
        #    attentions[i] = attention
        #    predicted_token = output.argmax(-1).item()
        #    if predicted_token==prev:
        #        continue
        #    if predicted_token==1:
        #        continue
            
        #    if predicted_token == 0:
        #        break    
            
            
        #    prev = predicted_token
        #    inputs.append(predicted_token)
            
            
        #en_tokens = wrapped_tokenizer_eng.decode(inputs)
    return en_tokens, sentence, attentions[: len(best_sequence) - 1]

In [23]:
from datasets import load_dataset
test_zet = load_dataset('json', data_files='test_no_reference' , split="train")

In [25]:
test_zet['src']

['◲▦◠▦◬▦■ ◉◗▢◕◗ ◍◗▱◎ ▽◠▽▪▦◠ ◕▴◉◗▦▼▴ ◀◗◓◉◧▨ ◎▴◞◠▸ ◠▱◈▪▨ ◚◪ ◀◨ ◎◪◞◠▸▱◠◓◬▦ ◀◠▢▪▱◠◓▪ ▻◪▨ ◈◂◞▫◉◠ ◈▴◐◫▱◈◗▵',
 '▯▴▥ ◟◧◓▨▱◨ ◀◫◓ ◈◠◈◬■ ◉◂▼◨◐◨▦ ◠▦▦◪◞◗▦◗▦ ▽◠▢◈◬◐▪ ◚◪ ◳◠▦▱▪◒▱▪▨▱◠ ▨▴▦◈◗◞◗▦▴ ◕◣▦◈▴◓◈◗◐◫■ "◀◫◓ ◞◫◳◠▷◗ ◈◠▷◠" ◳◠▢◠▦ ◀◗◓ ◎▴◞◠▸◈◠▦ ◞◧▦◓◠ ▨◪▦◈◫◞◗▦◪ ◠◳◓▪◎▼◬▱◬▨ ◳◠▻▪▱◈▪◐▪ ◚◪ ◗◒◫▦◈◪▦ ▨◂◚◨▱◈▾◐▾ ◫◉◫▦ ◉◗◍▫▴ ◈◠◚◠ ◠◉◬▽◂◓▵',
 '◡◠▻◧▦ ◂▫◧◎◂◀◗▱ ◍◗◓◎◠◞◬ ◠▦▱◠◒◎◠◞▪▢ ◝◓▴▹◗▫ ◈◨◓▾◎▾▦◈◠ ◞▪▦◬◓◈◠ ◀◪▨▱▴◎◪ ◞◭◓◪◞◫▦◫▦ ◨▢◠◎◠◞▪▦▪▦ ◫▦◞◠▦▱◠◓▪▦ ◗◒◗▦◫ ▨◠▽◀◪▫◎◪◞◫▦◪ ▦▴◈◪▦ ◂▱◠◀◫▱▴▼▴◐◫▦◫ ◞◇◳▱◪◈◗▵',
 "◝▾◀◀◠ ▰◠▫◞◂▦ ◚▴ ▰▴◀◀ ▮◫◎▻◞◂▦■ ◞◠◀◠▷ ◂◳▦◠▦◠▦ ◍◂◨◓◀◠▱▱ ◎◠◉▱◠◓▪▦▪▦ ▨◠▷◓◠◎◠▦▪ ▮▴◓◕◫◧ ◆◠◓▼◗◠'▽◬ ◚◪ ○▱▴▹ ▯◂◓▴▦ ◫▱▴ ▴◒▱▴◒▫◗◐◫ ◎◠◉▫◠ ▨◬◞◠ ◞▩◓◪◈◪ ◎◠◐▱▾▻ ▴▫▫◗▵",
 '"○◐▱◠◈◬◐▪▦▪ ◕◣◓◎◪▱◪◓◗▦◪ ◠◞▱◠ ◗▢◫▦ ◚▴◓◎▴■" ◈◪◈◫ ◀◠▦◠▵',
 '▭◠◀▴◓▴ ◕◇◓▴ ▫◠▦▪▨▱◠◓ ◧▱◠▽ ▽◪◓◫▦◈▴▦ ▨◠◉◠▦ ◀◗◓◈▴▦ ◍◠▢▱◠ ◒◭▻▷◪▱◫ ◧▱◈▾◐▾▦◨ ◞◇▽▱◪◈◗■ ◠▦▼◠▨ ▻◂▱◗◞ ◂▱◠◳◠ ▨◠◓◬◒◠▦ ▨◗◒◫ ◞◠▽◬◞▪▦▪ ◀▴▱◗◓▫◎▴◈◗▵',
 '◆▴◓◗▱◫◎■ ◄◠◈◓◫◈ ▫◠◓◠◍▪▦◈◠▦ ◳◠◞◠ ◈◬◒▪ ◗▱◠▦ ▴◈◫▱▴▦ ◠▦▼◠▨ ◠▽◓◬▱◬▨◉▪ ◢◠▫◠▱◠▦▱◠◓▼◠ ▨◨▫▱◠▦◠▦ 1 ◰▨◫◎ ◓▴◍▴◓◠▦◈▾◎▾▦◈◠▦ ◀◫◓ ◳◬▱ ◞◧▦◓◠ ◈◠ ◀◠◐◬◎◞◬▢▱▪▨ ◳◠▦▱◬◞◬ ◀◇▱◕◪◈◪ ◳▩▨◞▴▨ ▨◠▱◈▪▵',
 '◝◂▱◗◚◳◠▱◬ ◈◫▻▱◧◎◠

In [None]:
translation, sentence_tokens, attention = translate_sentence(
    sentence,
    model,
    en_nlp,
    wrapped_tokenizer,
    lower,
    sos_token,
    eos_token,
    device,
)

In [None]:
translation

In [24]:
import tqdm
trans=[]
sente=[]
for i in tqdm.tqdm(test_zet['src']):
    translation_i, sentence_tokens_i, attention_i=translate_sentence(
    i,
    model,
    wrapped_tokenizer_eng,
    wrapped_tokenizer,
    False,
    0,
    0,
    device,
)

    
    trans.append(translation_i)
    sente.append(sentence_tokens_i)

100%|██████████| 1000/1000 [05:19<00:00,  3.13it/s]


In [25]:
trans

['[CLS]<sep><cls> admit that came many of on that came on the of the, and some and some of some of some shots some shots some shots<sep>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]',
 '[CLS]<sep><cls> York was a was a wrote a children mother and mother he a the a the and the and the a the a the a the a and a and a got a got a got<sep> got<sep>.<sep><cls><sep>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]',
 "[CLS]<sep><cls> told them the'd crazy to the to firm to the to the to the to the to the to the to the<sep>[PAD]<sep><cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]",
 '[CLS]<sep><cls> lazy to show and David, and, starting to play the by the morning, the while the while the while and while with while with while with<sep><cls><sep>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]<cls>[PAD]',
 '[CLS]<sep><cls> let me n

In [115]:
sente

['◲▦◠▦◬▦■ ◉◗▢◕◗ ◍◗▱◎ ▽◠▽▪▦◠ ◕▴◉◗▦▼▴ ◀◗◓◉◧▨ ◎▴◞◠▸ ◠▱◈▪▨ ◚◪ ◀◨ ◎◪◞◠▸▱◠◓◬▦ ◀◠▢▪▱◠◓▪ ▻◪▨ ◈◂◞▫◉◠ ◈▴◐◫▱◈◗▵',
 '▯▴▥ ◟◧◓▨▱◨ ◀◫◓ ◈◠◈◬■ ◉◂▼◨◐◨▦ ◠▦▦◪◞◗▦◗▦ ▽◠▢◈◬◐▪ ◚◪ ◳◠▦▱▪◒▱▪▨▱◠ ▨▴▦◈◗◞◗▦▴ ◕◣▦◈▴◓◈◗◐◫■ "◀◫◓ ◞◫◳◠▷◗ ◈◠▷◠" ◳◠▢◠▦ ◀◗◓ ◎▴◞◠▸◈◠▦ ◞◧▦◓◠ ▨◪▦◈◫◞◗▦◪ ◠◳◓▪◎▼◬▱◬▨ ◳◠▻▪▱◈▪◐▪ ◚◪ ◗◒◫▦◈◪▦ ▨◂◚◨▱◈▾◐▾ ◫◉◫▦ ◉◗◍▫▴ ◈◠◚◠ ◠◉◬▽◂◓▵',
 '◡◠▻◧▦ ◂▫◧◎◂◀◗▱ ◍◗◓◎◠◞◬ ◠▦▱◠◒◎◠◞▪▢ ◝◓▴▹◗▫ ◈◨◓▾◎▾▦◈◠ ◞▪▦◬◓◈◠ ◀◪▨▱▴◎◪ ◞◭◓◪◞◫▦◫▦ ◨▢◠◎◠◞▪▦▪▦ ◫▦◞◠▦▱◠◓▪▦ ◗◒◗▦◫ ▨◠▽◀◪▫◎◪◞◫▦◪ ▦▴◈◪▦ ◂▱◠◀◫▱▴▼▴◐◫▦◫ ◞◇◳▱◪◈◗▵',
 "◝▾◀◀◠ ▰◠▫◞◂▦ ◚▴ ▰▴◀◀ ▮◫◎▻◞◂▦■ ◞◠◀◠▷ ◂◳▦◠▦◠▦ ◍◂◨◓◀◠▱▱ ◎◠◉▱◠◓▪▦▪▦ ▨◠▷◓◠◎◠▦▪ ▮▴◓◕◫◧ ◆◠◓▼◗◠'▽◬ ◚◪ ○▱▴▹ ▯◂◓▴▦ ◫▱▴ ▴◒▱▴◒▫◗◐◫ ◎◠◉▫◠ ▨◬◞◠ ◞▩◓◪◈◪ ◎◠◐▱▾▻ ▴▫▫◗▵",
 '"○◐▱◠◈◬◐▪▦▪ ◕◣◓◎◪▱◪◓◗▦◪ ◠◞▱◠ ◗▢◫▦ ◚▴◓◎▴■" ◈◪◈◫ ◀◠▦◠▵',
 '▭◠◀▴◓▴ ◕◇◓▴ ▫◠▦▪▨▱◠◓ ◧▱◠▽ ▽◪◓◫▦◈▴▦ ▨◠◉◠▦ ◀◗◓◈▴▦ ◍◠▢▱◠ ◒◭▻▷◪▱◫ ◧▱◈▾◐▾▦◨ ◞◇▽▱◪◈◗■ ◠▦▼◠▨ ▻◂▱◗◞ ◂▱◠◳◠ ▨◠◓◬◒◠▦ ▨◗◒◫ ◞◠▽◬◞▪▦▪ ◀▴▱◗◓▫◎▴◈◗▵',
 '◆▴◓◗▱◫◎■ ◄◠◈◓◫◈ ▫◠◓◠◍▪▦◈◠▦ ◳◠◞◠ ◈◬◒▪ ◗▱◠▦ ▴◈◫▱▴▦ ◠▦▼◠▨ ◠▽◓◬▱◬▨◉▪ ◢◠▫◠▱◠▦▱◠◓▼◠ ▨◨▫▱◠▦◠▦ 1 ◰▨◫◎ ◓▴◍▴◓◠▦◈▾◎▾▦◈◠▦ ◀◫◓ ◳◬▱ ◞◧▦◓◠ ◈◠ ◀◠◐◬◎◞◬▢▱▪▨ ◳◠▦▱◬◞◬ ◀◇▱◕◪◈◪ ◳▩▨◞▴▨ ▨◠▱◈▪▵',
 '◝◂▱◗◚◳◠▱◬ ◈◫▻▱◧◎◠

In [None]:
import jsonlines
with jsonlines.open('output.json', mode='w') as writer:
    for ds, sr in zip(trans, sente):
        writer.write({"dst":ds.replace('<|endoftext|>','').replace('[CLS]','').replace('[PAD]','').replace('<sep>','').replace('<cls>',''), "src":sr})