In [1]:
import json
from transformers import BertTokenizer
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer
import torch.nn as nn
import torch.utils as utils
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import tqdm
import torch.optim as optim

In [2]:
f=open("test.json","r",encoding="utf-8")

In [3]:
data=json.load(f)

In [4]:
data[0]

{'id': '190315_E001_13',
 'tag': 'meeting',
 'title': 'Meeting: Market update meeting',
 'original_language': 'en',
 'conversation': [{'no': 1,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': 'How is it going, Wayne?',
   'ja_sentence': 'ウェイン、調子はどうです？'},
  {'no': 2,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',
   'en_sentence': "I'm not too bad.",
   'ja_sentence': 'まあまあです。'},
  {'no': 3,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': 'Thank you very much for coming out today.',
   'ja_sentence': '今日はご足労ありがとう。'},
  {'no': 4,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': "How's business lately?",
   'ja_sentence': '景気はどうです？'},
  {'no': 5,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',
   'en_sentence': "It's been good.",
   'ja_sentence': 'おかげさまで、順調です。'},
  {'no': 6,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',


In [5]:
english=[]
japanese=[]

In [6]:
for dat in data:
    for j in dat['conversation']:
        english.append(j['en_sentence'])
        japanese.append(j['ja_sentence'])

In [7]:
len(english)

2120

In [8]:
len(japanese)

2120

In [9]:
mx_eng=0
mx_jp=0
for i in range(len(english)):
    mx_eng=max(mx_eng,len(english[i]))
for i in range(len(japanese)):
    mx_jp=max(mx_jp,len(japanese[i]))    

Max of english and Max of Japanese tokens 

In [10]:
print(mx_eng,mx_jp)

207 95


In [11]:
bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [12]:
special_tokens = {
    "bos_token": "<sos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>"
}

In [13]:
tokenizer.add_special_tokens(special_tokens)

3

In [14]:
jptokids=[]
jptorch=[]
for i in range(len(japanese)):
    ax = tokenizer.tokenize(japanese[i])
    ax = [tokenizer.bos_token] + ax + [tokenizer.eos_token]  
    jptokids.append(tokenizer.convert_tokens_to_ids(ax))
    jptorch.append(torch.tensor(jptokids[-1]))

This is using tokenizer.encode to automatically add eos and sos 

for i in range(len(japanese)):
    jptokids.append(tokenizer.encode(japanese[i], add_special_tokens=True))  
    print(jptokids[-1])

In [15]:
tokenizer.pad_token_id

32002

In [16]:
maxlen_jp=0
vocabsize_jp=0
for i in range(len(jptokids)):
    maxlen_jp=max(maxlen_jp,len(jptokids[i]))
    vocabsize_jp = max(vocabsize_jp,max(jptokids[i]))
    

In [17]:
maxlen_jp,vocabsize_jp

(59, 32001)

In [18]:
engtokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
engtokenizer.add_special_tokens(special_tokens)

3

In [20]:
engtokids=[]
engtorch=[]
for i in range(len(english)):
    ax=engtokenizer.tokenize(english[i])
    ax=[engtokenizer.bos_token]+ax+[engtokenizer.eos_token]
    engtokids.append(engtokenizer.convert_tokens_to_ids(ax))
    engtorch.append(torch.tensor(engtokids[-1]))

In [21]:
maxlen_eng=0
vocabsize_eng=0
for i in range(len(engtokids)):
    maxlen_eng=max(maxlen_eng,len(engtokids[i]))
    vocabsize_eng = max(vocabsize_eng,max(engtokids[i]))
    

In [22]:
maxlen_eng,vocabsize_eng,maxlen_jp,vocabsize_jp

(45, 30523, 59, 32001)

In [23]:
tokenizer.decode(jptokids[0], skip_special_tokens=False)

'<sos> ウェイン 、 調子 は どう です ? <eos>'

In [24]:
engtokenizer.decode(engtokids[0], skip_special_tokens=True)

'how is it going, wayne?'

In [25]:
engpadded= pad_sequence(engtorch, batch_first=True, padding_value=engtokenizer.pad_token_id)

In [26]:
jppadded= pad_sequence(jptorch,batch_first=True ,padding_value=tokenizer.pad_token_id)

In [27]:
class TranslationDataset(Dataset):
    def __init__(self, en_tensors, jp_tensors):
        self.en_tensors = en_tensors
        self.jp_tensors = jp_tensors

    def __len__(self):
        return len(self.en_tensors)

    def __getitem__(self, idx):
        # Return English and Japanese tensors for each sentence pair
        return self.en_tensors[idx], self.jp_tensors[idx]

In [28]:
def get_dataloader(en_tensors, jp_tensors, batch_size=32):
    dataset = TranslationDataset(en_tensors, jp_tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [29]:
train_loader = get_dataloader(engpadded, jppadded, batch_size=32)

In [30]:
for i in next(iter(train_loader)):
    print(i)

tensor([[30522,  2044,  2035,  ..., 30524, 30524, 30524],
        [30522,  2011,  4566,  ..., 30524, 30524, 30524],
        [30522,  2079,  2017,  ..., 30524, 30524, 30524],
        ...,
        [30522,  2057,  1005,  ..., 30524, 30524, 30524],
        [30522,  2064,  2017,  ..., 30524, 30524, 30524],
        [30522,  2469,  1010,  ..., 30524, 30524, 30524]])
tensor([[32000,  5408, 30054,  ..., 32002, 32002, 32002],
        [32000,  2015, 29633,  ..., 32002, 32002, 32002],
        [32000,   654,     5,  ..., 32002, 32002, 32002],
        ...,
        [32000,  2941,  1974,  ..., 32002, 32002, 32002],
        [32000, 18659,     7,  ..., 32002, 32002, 32002],
        [32000,  8871,     6,  ..., 32002, 32002, 32002]])


In [31]:
for i in next(iter(train_loader)):
    print(i[0].shape)

torch.Size([45])
torch.Size([59])


In [32]:
class eencoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim) # For combining bidirectional outputs
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden

In [45]:
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 3, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return nn.functional.softmax(attention, dim=1)

In [46]:
class deecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden.squeeze(0)

In [47]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder


    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs

In [48]:
engenc=eencoder(input_dim=31000,emb_dim=30,hid_dim=800,dropout=0.2)

In [49]:
att=Attention(800)

In [50]:
jpdec=deecoder(output_dim=32100,emb_dim=30,hid_dim=800,dropout=0.2,attention=att)

In [51]:
model = Seq2Seq(engenc, jpdec)

In [52]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [53]:
def train(model, src, trg, optimizer, criterion, clip):
    model.train()
    optimizer.zero_grad()
    output = model(src, trg)
    output_dim = output.shape[-1]
    output = output[:, 1:].reshape(-1, output_dim)
    trg = trg[:, 1:].reshape(-1)
    loss = criterion(output, trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return loss.item()


CLIP = 1

In [54]:
N_EPOCHS=10

In [55]:
for epoch in range(N_EPOCHS):
    for i,j in enumerate(train_loader):
        loss = train(model, j[0], j[1], optimizer, criterion, CLIP)
        print(f'Epoch: {epoch+1:02} | Train Loss: {loss:.3f}')
        

RuntimeError: input.size(-1) must be equal to input_size. Expected 830, got 1630

for i,j in enumerate(train_loader):
    encoderhidden=engenc(j[0])
    jptext=j[1]
    jptextfirsttok=jptext[:,0]
    decodernext,decoderhidden=jpdec(jptextfirsttok,encoderhidden)
    print(decodernext.shape)
    

In [None]:
torch.save(model.state_dict(),"LANG TRANSLATION model save")

In [None]:
def translate_bert(model, sentence, en_tokenizer, ja_tokenizer, max_len=50, device='cpu'):
    model.eval()

    # Tokenize and numericalize the input sentence using BERT tokenizer
    inputs = en_tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = inputs['input_ids']

    with torch.no_grad():
        hidden = model.encoder(input_ids)  # Encoder uses BERT embeddings

    trg_index = [ja_tokenizer.cls_token_id]  # Start with [CLS] token for BERT
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_index[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden)

        pred_token = output.argmax(1).item()
        trg_index.append(pred_token)

        if pred_token == ja_tokenizer.sep_token_id:  # Stop at [SEP] token
            break

    # Convert the numerical indices back to Japanese tokens
    ja_tokens = ja_tokenizer.convert_ids_to_tokens(trg_index)

    # Join the tokens to form the translated sentence (excluding [CLS] and [SEP])
    return ja_tokenizer.convert_tokens_to_string(ja_tokens[1:-1])

In [None]:
source_sentence = "Thank you"

In [None]:
translated_sentence = translate_bert(model, source_sentence, engtokenizer, tokenizer)

In [None]:
print(translated_sentence)

In [None]:
sent="Yeah, there is a lot of changes happening lately."

In [None]:
print(translate_bert(model, sent, engtokenizer, tokenizer))