In [1]:
import json
from transformers import BertTokenizer
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer
import torch.nn as nn
import torch.utils as utils
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import tqdm
import torch.optim as optim

In [2]:
f=open("test.json","r",encoding="utf-8")

In [3]:
data=json.load(f)

In [4]:
data[0]

{'id': '190315_E001_13',
 'tag': 'meeting',
 'title': 'Meeting: Market update meeting',
 'original_language': 'en',
 'conversation': [{'no': 1,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': 'How is it going, Wayne?',
   'ja_sentence': 'ウェイン、調子はどうです？'},
  {'no': 2,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',
   'en_sentence': "I'm not too bad.",
   'ja_sentence': 'まあまあです。'},
  {'no': 3,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': 'Thank you very much for coming out today.',
   'ja_sentence': '今日はご足労ありがとう。'},
  {'no': 4,
   'en_speaker': 'Mr. John Smith',
   'ja_speaker': 'ジョン スミスさん',
   'en_sentence': "How's business lately?",
   'ja_sentence': '景気はどうです？'},
  {'no': 5,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',
   'en_sentence': "It's been good.",
   'ja_sentence': 'おかげさまで、順調です。'},
  {'no': 6,
   'en_speaker': 'Mr. Wayne Willis',
   'ja_speaker': 'ウェイン ウィリスさん',


In [5]:
english=[]
japanese=[]

In [6]:
for dat in data:
    for j in dat['conversation']:
        english.append(j['en_sentence'])
        japanese.append(j['ja_sentence'])

In [7]:
len(english)

2120

In [8]:
len(japanese)

2120

In [9]:
mx_eng=0
mx_jp=0
for i in range(len(english)):
    mx_eng=max(mx_eng,len(english[i]))
for i in range(len(japanese)):
    mx_jp=max(mx_jp,len(japanese[i]))    

Max of english and Max of Japanese tokens 

In [10]:
print(mx_eng,mx_jp)

207 95


In [11]:
bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [12]:
special_tokens = {
    "bos_token": "<sos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>"
}

In [13]:
tokenizer.add_special_tokens(special_tokens)

3

In [14]:
jptokids=[]
jptorch=[]
for i in range(len(japanese)):
    ax = tokenizer.tokenize(japanese[i])
    ax = [tokenizer.bos_token] + ax + [tokenizer.eos_token]  
    jptokids.append(tokenizer.convert_tokens_to_ids(ax))
    jptorch.append(torch.tensor(jptokids[-1]))

This is using tokenizer.encode to automatically add eos and sos 

for i in range(len(japanese)):
    jptokids.append(tokenizer.encode(japanese[i], add_special_tokens=True))  
    print(jptokids[-1])

In [15]:
tokenizer.pad_token_id

32002

In [16]:
maxlen_jp=0
vocabsize_jp=0
for i in range(len(jptokids)):
    maxlen_jp=max(maxlen_jp,len(jptokids[i]))
    vocabsize_jp = max(vocabsize_jp,max(jptokids[i]))
    

In [17]:
maxlen_jp,vocabsize_jp

(59, 32001)

In [18]:
engtokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
engtokenizer.add_special_tokens(special_tokens)

3

In [20]:
engtokids=[]
engtorch=[]
for i in range(len(english)):
    ax=engtokenizer.tokenize(english[i])
    ax=[engtokenizer.bos_token]+ax+[engtokenizer.eos_token]
    engtokids.append(engtokenizer.convert_tokens_to_ids(ax))
    engtorch.append(torch.tensor(engtokids[-1]))

In [21]:
maxlen_eng=0
vocabsize_eng=0
for i in range(len(engtokids)):
    maxlen_eng=max(maxlen_eng,len(engtokids[i]))
    vocabsize_eng = max(vocabsize_eng,max(engtokids[i]))
    

In [22]:
maxlen_eng,vocabsize_eng,maxlen_jp,vocabsize_jp

(45, 30523, 59, 32001)

In [23]:
tokenizer.decode(jptokids[0], skip_special_tokens=False)

'<sos> ウェイン 、 調子 は どう です ? <eos>'

In [24]:
engtokenizer.decode(engtokids[0], skip_special_tokens=True)

'how is it going, wayne?'

In [25]:
engpadded= pad_sequence(engtorch, batch_first=True, padding_value=engtokenizer.pad_token_id)

In [26]:
jppadded= pad_sequence(jptorch,batch_first=True ,padding_value=tokenizer.pad_token_id)

In [27]:
class TranslationDataset(Dataset):
    def __init__(self, en_tensors, jp_tensors):
        self.en_tensors = en_tensors
        self.jp_tensors = jp_tensors

    def __len__(self):
        return len(self.en_tensors)

    def __getitem__(self, idx):
        # Return English and Japanese tensors for each sentence pair
        return self.en_tensors[idx], self.jp_tensors[idx]

In [28]:
def get_dataloader(en_tensors, jp_tensors, batch_size=32):
    dataset = TranslationDataset(en_tensors, jp_tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [29]:
train_loader = get_dataloader(engpadded, jppadded, batch_size=32)

In [30]:
for i in next(iter(train_loader)):
    print(i)

tensor([[30522, 23156,  1012,  ..., 30524, 30524, 30524],
        [30522,  1045,  2064,  ..., 30524, 30524, 30524],
        [30522,  2031,  1037,  ..., 30524, 30524, 30524],
        ...,
        [30522,  2057,  2024,  ..., 30524, 30524, 30524],
        [30522,  1045,  2123,  ..., 30524, 30524, 30524],
        [30522,  4995,  1005,  ..., 30524, 30524, 30524]])
tensor([[32000,    73,  2087,  ..., 32002, 32002, 32002],
        [32000,    44, 14027,  ..., 32002, 32002, 32002],
        [32000, 13069,    52,  ..., 32002, 32002, 32002],
        ...,
        [32000,  5330,    13,  ..., 32002, 32002, 32002],
        [32000,  1276, 29323,  ..., 32002, 32002, 32002],
        [32000,    91,  2612,  ..., 32002, 32002, 32002]])


In [31]:
for i in next(iter(train_loader)):
    print(i[0].shape)

torch.Size([45])
torch.Size([59])


In [32]:
class eencoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)  
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        ot, hidden = self.rnn(embedded)
        #return ot
        return hidden.transpose(0,1) # Transpose to (batch_size, num_layers * num_directions, hidden_size)

In [33]:
class deecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)  # batch_first=True
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        input = input.unsqueeze(1) # unsqueeze to (batch_size, 1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden.transpose(0,1)) # Transpose hidden to (num_layers * num_directions, batch_size, hidden_size)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden.transpose(0,1) # Transpose hidden back to (batch_size, num_layers * num_directions, hidden_size)

In [34]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source_sequence, target_sequence):
        batch_size = target_sequence.shape[0]
        target_sequence_length = target_sequence.shape[1]
        target_vocabulary_size = self.decoder.output_dim

        decoder_outputs = torch.zeros(batch_size, target_sequence_length, target_vocabulary_size).to(source_sequence.device)

        encoder_final_hidden = self.encoder(source_sequence)

        decoder_input = target_sequence[:, 0]

        for time_step in range(1, target_sequence_length):
            decoder_prediction, decoder_hidden = self.decoder(decoder_input, encoder_final_hidden)

            decoder_outputs[:, time_step] = decoder_prediction

            # Always use teacher forcing: use the actual target token
            decoder_input = target_sequence[:, time_step]

            encoder_final_hidden = decoder_hidden

        return decoder_outputs

In [35]:
engenc=eencoder(input_dim=31000,emb_dim=30,hid_dim=800,dropout=0.2)

In [36]:
jpdec=deecoder(output_dim=32100,emb_dim=30,hid_dim=800,dropout=0.2)

In [37]:
model = Seq2Seq(engenc, jpdec)

In [38]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [39]:
def train(model, src, trg, optimizer, criterion, clip):
    model.train()
    optimizer.zero_grad()
    output = model(src, trg)
    output_dim = output.shape[-1]
    output = output[:, 1:].reshape(-1, output_dim)
    trg = trg[:, 1:].reshape(-1)
    loss = criterion(output, trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    return loss.item()

N_EPOCHS = 100
CLIP = 1

In [40]:
for epoch in range(N_EPOCHS):
    for i,j in enumerate(train_loader):
        loss = train(model, j[0], j[1], optimizer, criterion, CLIP)
        print(f'Epoch: {epoch+1:02} | Train Loss: {loss:.3f}')
        

Epoch: 01 | Train Loss: 10.382
Epoch: 01 | Train Loss: 10.324
Epoch: 01 | Train Loss: 10.265
Epoch: 01 | Train Loss: 10.112
Epoch: 01 | Train Loss: 9.532
Epoch: 01 | Train Loss: 7.458
Epoch: 01 | Train Loss: 6.702
Epoch: 01 | Train Loss: 6.374
Epoch: 01 | Train Loss: 6.282
Epoch: 01 | Train Loss: 6.475
Epoch: 01 | Train Loss: 6.451
Epoch: 01 | Train Loss: 6.678
Epoch: 01 | Train Loss: 6.632
Epoch: 01 | Train Loss: 6.969
Epoch: 01 | Train Loss: 6.994
Epoch: 01 | Train Loss: 6.272
Epoch: 01 | Train Loss: 6.307
Epoch: 01 | Train Loss: 6.357
Epoch: 01 | Train Loss: 6.271
Epoch: 01 | Train Loss: 6.206
Epoch: 01 | Train Loss: 6.512
Epoch: 01 | Train Loss: 6.097
Epoch: 01 | Train Loss: 6.088
Epoch: 01 | Train Loss: 6.242
Epoch: 01 | Train Loss: 5.990
Epoch: 01 | Train Loss: 6.097
Epoch: 01 | Train Loss: 5.928
Epoch: 01 | Train Loss: 6.355
Epoch: 01 | Train Loss: 6.045
Epoch: 01 | Train Loss: 6.080
Epoch: 01 | Train Loss: 6.064
Epoch: 01 | Train Loss: 6.151
Epoch: 01 | Train Loss: 6.197
Epoch:

KeyboardInterrupt: 

for i,j in enumerate(train_loader):
    encoderhidden=engenc(j[0])
    jptext=j[1]
    jptextfirsttok=jptext[:,0]
    decodernext,decoderhidden=jpdec(jptextfirsttok,encoderhidden)
    print(decodernext.shape)
    

In [47]:
torch.save(model.state_dict(),"LANG TRANSLATION model save")

In [None]:
def translate(model, sentence, en_vocab, ja_vocab, max_len=50):
    model.eval()
    tokens = [en_vocab['<SOS>']] + numericalize(sentence, en_vocab, tokenize_en) + [en_vocab['<EOS>']]
    src = torch.LongTensor(tokens).unsqueeze(0).to('cpu') # unsqueeze to (1, seq_len)
    with torch.no_grad():
        hidden = model.encoder(src)
    trg_index = [ja_vocab['<SOS>']]
    for _ in range(max_len):
        trg_tensor = torch.LongTensor([trg_index[-1]]).to('cpu')
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden)
        pred_token = output.argmax(1).item()
        trg_index.append(pred_token)
        if pred_token == ja_vocab['<EOS>']:
            break
    ja_tokens = [k for k, v in ja_vocab.items() if v in trg_index]
    return ''.join(ja_tokens[1:-1])

In [89]:
def translate(model,sent,en_tokenizer,jp_tokenizer,max_len=45):
    ax = en_tokenizer.tokenize(sent)
    ax = [en_tokenizer.bos_token] + ax + [en_tokenizer.eos_token] 
    print(len(ax))
    for i in range(45-len(ax)):
        ax=ax+[en_tokenizer.pad_token]
    print(len(ax))
    inptorch=(torch.tensor(en_tokenizer.convert_tokens_to_ids(ax)))
    inptorch=inptorch.unsqueeze(0)
    print(inptorch)
    hidden=model.encoder(inptorch)
    target=[jp_tokenizer.bos_token_id]
    for _ in range(max_len):
        targettensor=torch.LongTensor(target[-1])
        print(targettensor.shape,hidden.shape)
        output,hidden=model.decoder(targettensor,hidden)
        pred_token=output.argmax(1).item()
        target.append(pred_token)
        if pred_token==jp_tokenizer.eos_token:
            break
        print(pred_token)

In [90]:
translate(model,"what is your name?",engtokenizer,tokenizer)

7
45
tensor([[30522,  2054,  2003,  2115,  2171,  1029, 30523, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524]])
torch.Size([32000]) torch.Size([1, 1, 800])


IndexError: index out of range in self

In [91]:
engtokenizer.decode([30522,  2054,  2003,  2115,  2171,  1029, 30523, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524, 30524,
         30524, 30524, 30524, 30524, 30524])

'<sos> what is your name? <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'