In [26]:
from pathlib import Path
import re,string

In [2]:
path = Path('../data')

In [20]:
# Read the raw corpus for a quick look at its format. The context manager
# closes the file handle once the lines are in memory (the one-liner
# `open().readlines()` left it open until garbage collection).
with (path/'examples.utf').open() as corpus_file:
    corpus = corpus_file.readlines()
corpus[:5]

['A: ムーリエルは２０歳になりました。\tMuiriel is 20 now.#ID=1282_4707\n',
 'B: は 二十歳(はたち){２０歳} になる[01]{になりました}\n',
 'A: すぐに戻ります。\tI will be back soon.#ID=1284_4709\n',
 'B: 直ぐに{すぐに} 戻る{戻ります}\n',
 'A: すぐに諦めて昼寝をするかも知れない。\tI may give up soon and just nap instead.#ID=1300_4727\n']

In [21]:
def make_corpus(corpus_path):
    """Extract parallel English / Japanese sentences from an examples file.

    Only lines that start with ``'A: '`` are used. They have the form
    ``'A: <japanese>\\t<english>#ID=<id>\\n'``; every other line (for example
    the ``'B: '`` annotation lines) is ignored.

    Args:
        corpus_path: ``pathlib.Path`` pointing at the examples file.

    Returns:
        ``(en, ja)``: two lists of the same length where ``en[i]`` is the
        English sentence paired with the Japanese sentence ``ja[i]``.
    """
    prefix = 'A: '
    # Everything from '#ID' to the end of the line is the trailing id tag.
    id_tag = re.compile(r'#ID.+$')
    en, ja = [], []
    # `with` guarantees the handle is closed; the encoding is stated
    # explicitly so parsing does not depend on the platform default.
    with corpus_path.open(encoding='utf-8') as corpus_file:
        for line in corpus_file:
            # Match the marker only at the start of the line, and strip only
            # that occurrence, so sentences containing "A: " stay intact.
            if not line.startswith(prefix):
                continue
            body = line[len(prefix):].rstrip('\n')
            # sub() is a no-op when the tag is missing, so a final line
            # without a trailing newline (or without a tag) no longer crashes.
            body = id_tag.sub('', body, count=1)
            parts = body.split('\t')
            if len(parts) < 2:
                # No tab means there is no sentence pair; skip the line so the
                # two output lists stay aligned.
                continue
            ja.append(parts[0])
            en.append(parts[1])
    return en, ja

In [22]:
# Build the parallel sentence lists and preview the first two pairs.
en, ja = make_corpus(path/'examples.utf')
en[:2],ja[:2]

(['Muiriel is 20 now.', 'I will be back soon.'],
 ['ムーリエルは２０歳になりました。', 'すぐに戻ります。'])

In [23]:
import MeCab

In [43]:
# Module-level MeCab tagger used by ja_tokenizer below.
# NOTE(review): '-Owakati' presumably selects whitespace-separated output,
# which is what ja_tokenizer's split() relies on — confirm against MeCab docs.
tagger = MeCab.Tagger('-Owakati')

def ja_tokenizer(text):
    """Tokenize Japanese text with the module-level MeCab ``tagger``.

    Args:
        text: the sentence to segment.

    Returns:
        List of token strings; an empty list when the tagger output contains
        no non-whitespace characters.
    """
    # str.split() with no arguments drops all whitespace (including any
    # trailing newline) and returns [] for blank input, so the former
    # empty-list and trailing-'\n' special cases could never trigger.
    return tagger.parse(text).split()

In [45]:
ja_tokenizer(ja[0])

['ムーリエル', 'は', '２', '０', '歳', 'に', 'なり', 'まし', 'た', '。']

In [24]:
import spacy
from spacy.symbols import ORTH

In [None]:
# spaCy English pipeline; only its `.tokenizer` is used (see en_tokenizer).
en_tok = spacy.load('en')

In [47]:
def en_tokenizer(text):
    """Lower-case ``text`` and split it with the spaCy tokenizer in ``en_tok``.

    Returns the token strings in order of appearance.
    """
    lowered = text.lower()
    pieces = []
    for token in en_tok.tokenizer(lowered):
        pieces.append(token.text)
    return pieces

In [50]:
en_tokenizer(en[0])

list

In [52]:
# Tokenize every sentence on both sides; the two lists stay index-aligned
# because `en` and `ja` are.
en_toks = [en_tokenizer(text) for text in en]
ja_toks = [ja_tokenizer(text) for text in ja]
en_toks[:2], ja_toks[:2]

([['muiriel', 'is', '20', 'now', '.'],
  ['i', 'will', 'be', 'back', 'soon', '.']],
 [['ムーリエル', 'は', '２', '０', '歳', 'に', 'なり', 'まし', 'た', '。'],
  ['すぐ', 'に', '戻り', 'ます', '。']])

In [72]:
len(en_toks), len(ja_toks)

(149786, 149786)

In [60]:
from collections import Counter,defaultdict

In [61]:
def numericalize_tok(tokens, max_vocab=50000, min_freq=0, unk_tok="_unk_", pad_tok="_pad_", bos_tok="_bos_", eos_tok="_eos_"):
    """Build a vocabulary from tokens, most frequent first.

    Args:
        tokens: a flat list of tokens or a list of token lists (one per
            sentence). May be empty.
        max_vocab: only the ``max_vocab`` most common tokens are considered.
        min_freq: a token is kept only when its count is strictly greater
            than this value.
        unk_tok, pad_tok, bos_tok, eos_tok: the special tokens. They always
            occupy ids 3, 1, 0 and 2 respectively.

    Returns:
        ``(int2tok, tok2int)``: the id -> token list, and a ``defaultdict``
        mapping token -> id that yields the unknown id (3) for unseen tokens.

    Raises:
        ValueError: if ``tokens`` is a single string.
    """
    if isinstance(tokens, str):
        raise ValueError("Expected to receive a list of tokens. Received a string instead")
    # Only peek at the first element when there is one; an empty corpus
    # used to raise IndexError here.
    if len(tokens) > 0 and isinstance(tokens[0], list):
        tokens = [tok for sent in tokens for tok in sent]
    freq = Counter(tokens)
    specials = [bos_tok, pad_tok, eos_tok, unk_tok]
    unk_id = 3
    # Skip corpus tokens that collide with a special token: a duplicate entry
    # would make tok2int point the special token at the later index and
    # break the fixed bos/pad/eos/unk ids.
    frequent = [o for o, c in freq.most_common(max_vocab)
                if c > min_freq and o not in specials]
    int2tok = specials + frequent
    tok2int = defaultdict(lambda: unk_id, {v: k for k, v in enumerate(int2tok)})
    return int2tok, tok2int

In [66]:
# Separate vocabularies for the Japanese and English token lists.
int2j,j2int = numericalize_tok(ja_toks)
int2en,en2int = numericalize_tok(en_toks)

In [67]:
len(int2j), len(int2en)

(31813, 21393)

In [64]:
import pickle

In [68]:
# Persist the id -> token lists. Each file is opened in a `with` block so it
# is flushed and closed as soon as the dump finishes.
with (path/'int2j.pkl').open('wb') as f:
    pickle.dump(int2j, f)
with (path/'int2en.pkl').open('wb') as f:
    pickle.dump(int2en, f)

In [69]:
# Reload the vocabularies written above and rebuild the token -> id maps.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# produced itself.
with (path/'int2j.pkl').open('rb') as f:
    int2j = pickle.load(f)
with (path/'int2en.pkl').open('rb') as f:
    int2en = pickle.load(f)
# 3 is the id numericalize_tok reserves for the unknown token.
j2int = defaultdict(lambda:3, {v:k for k,v in enumerate(int2j)})
en2int = defaultdict(lambda:3, {v:k for k,v in enumerate(int2en)})

In [70]:
len(int2j), len(int2en)

(31813, 21393)

In [129]:
# In file order this cell comes before the notebook's `import numpy as np`
# cell, so import it here to survive Restart & Run All.
import numpy as np

# Wrap every sentence in the bos (0) and eos (2) ids that numericalize_tok
# reserves. The rows have different lengths, so dtype=object is required to
# get a 1-D array of lists (recent NumPy refuses to build ragged arrays
# implicitly).
j_ids = np.array([[0]+[j2int[o] for o in sent]+[2] for sent in ja_toks], dtype=object)
en_ids = np.array([[0]+[en2int[o] for o in sent]+[2] for sent in en_toks], dtype=object)
len(j_ids),len(en_ids), j_ids[10],en_ids[10]

(149786,
 149786,
 [0,
  48,
  6,
  4891,
  5,
  109,
  11,
  143,
  10,
  83,
  8,
  57,
  86,
  1798,
  7,
  2146,
  232,
  255,
  47,
  36,
  4,
  2],
 [0, 114, 2251, 107, 38, 97, 85, 2649, 77, 28, 356, 4, 2])

In [74]:
import numpy as np

In [131]:
# Seeded random split: a row goes to the training set when its uniform draw
# exceeds 0.1, so roughly 10% of the rows end up in the validation set.
np.random.seed(42)
trn_keep = np.random.rand(len(en_ids))>0.1
# The same boolean mask is applied to both languages to keep pairs aligned.
en_trn,j_trn = en_ids[trn_keep],j_ids[trn_keep]
en_val,j_val = en_ids[~trn_keep],j_ids[~trn_keep]
len(en_trn),len(en_val)

(134775, 15011)

In [171]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd.variable import Variable
from torch.utils.data import Dataset,DataLoader

In [89]:
from numpy import array as A

In [132]:
class Seq2SeqDataset(Dataset):
    """Map-style dataset over two index-aligned sequence collections.

    ``x`` holds the source sequences and ``y`` the targets; item ``idx`` is
    the pair ``(x[idx], y[idx])`` with each element converted to a NumPy array.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        # The dataset length is defined by the source side.
        return len(self.x)

    def __getitem__(self, idx):
        source = self.x[idx]
        target = self.y[idx]
        return A(source), A(target)

In [189]:
# English ids are the model input (x), Japanese ids the target (y).
trn_ds = Seq2SeqDataset(en_trn,j_trn)
val_ds = Seq2SeqDataset(en_val,j_val)

bs = 120

# Only the training loader shuffles; the validation loader uses a batch
# size of int(bs*1.6).
trn_dl = DataLoader(trn_ds,batch_size=bs,shuffle=True)
val_dl = DataLoader(val_ds,batch_size=int(bs*1.6))

In [190]:
x, y = next(iter(trn_dl))
x.size(), y.size()

(torch.Size([120, 25]), torch.Size([120, 25]))

In [94]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [96]:
# NOTE(review): despite the `_90` suffix these are the 99th (English) and
# 97th (Japanese) percentiles of sequence length. The padding cell does not
# use them; it hard-codes maxlen=25.
enlen_90 = int(np.percentile([len(o) for o in en_ids], 99))
jlen_90 = int(np.percentile([len(o) for o in j_ids], 97))

In [97]:
enlen_90,jlen_90

(23, 22)

In [130]:
# Force every sequence to length 25. value=1 is the id numericalize_tok
# reserves for the pad token; padding/truncating='post' presumably act on the
# end of each sequence — confirm against the keras docs.
# NOTE(review): this overwrites j_ids / en_ids in place, so the cell is not
# idempotent with respect to the ragged arrays built earlier.
j_ids = pad_sequences(j_ids, maxlen=25, dtype='int32', padding='post', truncating='post', value=1)
en_ids = pad_sequences(en_ids, maxlen=25, dtype='int32', padding='post', truncating='post', value=1)

In [106]:
## load fasttext vectors
import io
def load_vectors(fname):
    """Load word vectors from a text ``.vec`` file.

    The first line holds two integers, ``n`` and ``d``. Every following line
    holds a token followed by its values, separated by single spaces.

    Args:
        fname: path of the vector file.

    Returns:
        ``(data, n, d)`` where ``data`` maps each token to a float NumPy
        array and ``n``, ``d`` are the two header integers.
    """
    data = {}
    # `with` closes the (potentially very large) file even if parsing fails;
    # previously the handle was never closed.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        header = fin.readline().split()
        n, d = int(header[0]), int(header[1])
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(tokens[1:], dtype=float)
    return data, n, d

In [None]:
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
#!unzip wiki-news-300d-1M.vec.zip
#!gunzip cc.ja.300.vec.gz
#mv wiki-news-300d-1M.vec ../data/
#mv cc.ja.300.vec ../data/

In [107]:
# Load the English and Japanese vector files; the first header integer
# returned by load_vectors is discarded, only the dimension is kept.
en_vecs,_,dim_en_vec = load_vectors('../data/wiki-news-300d-1M.vec')
j_vecs,_,dim_j_vec = load_vectors('../data/cc.ja.300.vec')

In [108]:
def create_emb(vecs, itos, em_sz):
    """Create an embedding layer initialised from pre-trained vectors.

    Args:
        vecs: mapping token -> 1-D NumPy vector.
        itos: id -> token list; row ``i`` of the embedding belongs to
            ``itos[i]``.
        em_sz: embedding dimension.

    Returns:
        ``nn.Embedding(len(itos), em_sz, padding_idx=1)``. Rows whose token
        has a usable vector are overwritten with it; all other rows keep the
        layer's own initialisation. The number of such tokens is printed.
    """
    emb = nn.Embedding(len(itos), em_sz, padding_idx=1)
    wgts = emb.weight.data
    miss = []
    for i, w in enumerate(itos):
        # Handle the two expected failure modes explicitly instead of a bare
        # `except:` (which also swallowed KeyboardInterrupt and real bugs).
        try:
            vec = vecs[w]
        except KeyError:
            miss.append(w)
            continue
        if len(vec) != em_sz:
            # A vector of the wrong width cannot be copied into the row.
            miss.append(w)
            continue
        wgts[i] = torch.from_numpy(vec)
    print('Number of unknowns in data: {}'.format(len(miss)))
    return emb
    

In [172]:
def V(tensor):
    """Wrap ``tensor`` in a ``Variable``, moving it to the GPU when CUDA is available."""
    use_gpu = torch.cuda.is_available()
    placed = tensor.cuda() if use_gpu else tensor
    return Variable(placed)

In [175]:
class Seq2Seq(nn.Module):
    """Bidirectional-GRU encoder with a greedy GRU decoder.

    Args:
        en_vecs, int2en: pre-trained vectors and id -> token list for the
            encoder (source) embedding.
        j_vecs, int2j: the same for the decoder (target) embedding; the output
            layer has ``len(int2j)`` classes.
        em_sz: encoder embedding dimension.
        nh: hidden size of each encoder direction; the decoder uses ``2*nh``.
        out_sl: maximum number of decoding steps.
        dropf: multiplier applied to every dropout probability.
        nl: number of GRU layers in both encoder and decoder.
        em_sz_dec: decoder embedding dimension; defaults to ``em_sz``.
    """
    def __init__(self,en_vecs,int2en,j_vecs,int2j,em_sz,nh=128,out_sl=25,dropf=1,nl=2,em_sz_dec=None):
        super().__init__()
        # The layer sizes now come from the arguments rather than the notebook
        # globals dim_en_vec / dim_j_vec; the decoder GRU in particular is
        # sized with the decoder embedding dimension, not the encoder one.
        if em_sz_dec is None: em_sz_dec = em_sz
        #encoder
        self.nl,self.nh,self.em_sz,self.out_sl = nl,nh,em_sz,out_sl
        self.emb_enc = create_emb(en_vecs,int2en,em_sz)
        self.emb_drop = nn.Dropout(0.15*dropf)
        self.encoder = nn.GRU(em_sz,nh,num_layers=nl,dropout=0.25*dropf, bidirectional=True,batch_first=True)
        #decoder
        self.emb_dec = create_emb(j_vecs,int2j,em_sz_dec)
        self.decoder = nn.GRU(em_sz_dec,nh*2,num_layers=nl,dropout=0.25*dropf,batch_first=True)
        self.out_drop = nn.Dropout(0.35*dropf)
        self.out = nn.Linear(nh*2,len(int2j))

    def forward(self,inp,y=None):
        """Encode ``inp`` (batch, seq) and greedily decode up to ``out_sl`` steps.

        ``y`` is accepted for interface compatibility but not used (there is
        no teacher forcing). Returns logits stacked as (steps, batch, classes);
        ``steps`` is smaller than ``out_sl`` when every sequence in the batch
        predicts the pad id (1) at the same step.
        """
        bs, sl = inp.size()
        emb_in = self.emb_drop(self.emb_enc(inp))
        h_n = self.initHidden(bs)
        enc_out, h_n = self.encoder(emb_in,h_n)
        # (nl*2, bs, nh) -> (nl, 2, bs, nh) -> (nl, bs, 2*nh): concatenate the
        # two directions of each layer. Uses self.nl instead of a literal 2 so
        # other layer counts work.
        h_n = h_n.view(self.nl,2,bs,-1).permute(0,2,1,3).contiguous().view(self.nl,bs,-1)

        # Start every sequence from id 0 (bos), on the same device as the
        # input rather than unconditionally on CUDA.
        dec_inp = torch.zeros(bs, dtype=torch.long, device=inp.device)
        res = []
        for i in range(self.out_sl):
            dec_emb = self.emb_dec(dec_inp)
            _,h_n = self.decoder(dec_emb.unsqueeze(1),h_n)
            # Dropout regularises the features fed to the output layer;
            # applying it after the layer randomly zeroed logits.
            outp = self.out(self.out_drop(h_n[-1]))
            res.append(outp)
            # Greedy decoding: the argmax becomes the next input token.
            dec_inp = outp.data.max(1)[1]
            if (dec_inp==1).all(): break
        return torch.stack(res)

    def initHidden(self,bs):
        """Zero initial encoder state, shape (nl*2, bs, nh), on the model's device."""
        weight = next(self.parameters())
        return weight.new_zeros(self.nl*2,bs,self.nh)

In [191]:
seq2seq = Seq2Seq(en_vecs,int2en,j_vecs,int2j,dim_en_vec)
# Move to the GPU only when one is available (the same rule V() applies to
# tensors), so the notebook also runs on CPU-only machines.
if torch.cuda.is_available(): seq2seq.cuda()
seq2seq

Number of unknowns in data: 974
Number of unknowns in data: 492


Seq2Seq(
  (emb_enc): Embedding(21393, 300, padding_idx=1)
  (emb_drop): Dropout(p=0.15)
  (encoder): GRU(300, 128, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
  (emb_dec): Embedding(31813, 300, padding_idx=1)
  (decoder): GRU(300, 256, num_layers=2, batch_first=True, dropout=0.25)
  (out_drop): Dropout(p=0.35)
  (out): Linear(in_features=256, out_features=31813, bias=True)
)

In [178]:
# Smoke-test the forward pass on one batch. Seq2Seq.forward stacks one logits
# tensor per decoding step, so the first dimension is the step count.
out = seq2seq(V(x.long()))
out.size()

torch.Size([25, 64, 31813])

In [179]:
def seq2seq_loss(input, target):
    """Cross-entropy between decoder logits and batch-first target ids.

    Args:
        input: logits of shape (steps, batch, classes).
        target: class ids of shape (batch, seq_len).

    Returns:
        Mean cross-entropy over all ``seq_len * batch`` positions. When the
        decoder produced fewer steps than ``seq_len`` the missing steps are
        filled with all-zero logits; extra steps are dropped.
    """
    bs,sl = target.size()
    sl_in,bs_in,nc = input.size()
    # Pad the step dimension (dim 0) at the end up to the target length.
    if sl>sl_in: input = F.pad(input, (0,0,0,0,0,sl-sl_in))
    input = input[:sl]
    # The logits are time-major while the target is batch-major. Transpose the
    # target before flattening so row k of the logits is scored against the
    # label of the same (step, sample) position; flattening both as-is paired
    # them with the wrong labels.
    target = target.transpose(0,1).contiguous()
    return F.cross_entropy(input.contiguous().view(-1,nc), target.view(-1))

In [180]:
# Evaluate the loss on the forward-pass output computed above.
seq2seq_loss(out,V(y.long()))

tensor(10.3802, device='cuda:0', grad_fn=<NllLossBackward>)

In [182]:
def step(x, y, epoch, m, crit, opt, clip=None):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        x, y: input and target batch; both are passed to the model, and ``y``
            is also passed to the criterion.
        epoch: accepted but not used.
        m: the model, called as ``m(x, y)``.
        crit: callable ``crit(output, y)`` returning a scalar loss tensor.
        opt: the optimiser.
        clip: when truthy, the maximum gradient norm used for clipping.

    Returns:
        The batch loss as a Python float.
    """
    # Clear stale gradients before the new backward pass.
    opt.zero_grad()
    predictions = m(x, y)
    batch_loss = crit(predictions, y)
    batch_loss.backward()
    if clip:
        nn.utils.clip_grad_norm_(m.parameters(), clip)
    opt.step()
    return batch_loss.data.item()

In [185]:
from tqdm import tqdm

In [196]:
def train(trn_dl,val_dl,model,crit,opt,epochs=10,clip=None):
    """Train ``model`` and report the mean train / validation loss per epoch.

    Args:
        trn_dl, val_dl: data loaders yielding ``(x, y)`` batches.
        model: module called as ``model(x, y)`` while training (through
            ``step``) and as ``model(x)`` while validating.
        crit: loss callable ``crit(output, y)``.
        opt: the optimiser.
        epochs: number of passes over ``trn_dl``.
        clip: optional maximum gradient norm, forwarded to ``step``.
    """
    for epoch in range(epochs):
        loss_trn = loss_val = 0.
        model.train()
        with tqdm(total=len(trn_dl)) as pbar:
            for x, y in trn_dl:
                # `clip` used to be accepted here but never passed on.
                loss_trn += step(V(x.long()),V(y.long()),epoch,model,crit,opt,clip=clip)
                pbar.update()
        model.eval()
        # No gradients are needed for validation; .item() keeps the running
        # sum a plain float instead of a tensor holding on to the graph.
        with torch.no_grad():
            for x, y in val_dl:
                out = model(V(x.long()))
                # The original referenced an undefined name `output` here.
                loss_val += crit(out, V(y.long())).item()
        # Average over the number of batches (not the batch size); max()
        # guards against an empty loader.
        n_trn, n_val = max(len(trn_dl),1), max(len(val_dl),1)
        print(f'Epoch: {epoch} trn loss: {loss_trn/n_trn} val loss: {loss_val/n_val}')

In [160]:
from torch import optim

In [193]:
# Adam over all model parameters with lr=3e-3 and betas=(0.7, 0.8).
opt = optim.Adam(seq2seq.parameters(),lr=3e-3,betas=(0.7,0.8))

In [None]:
# Train for 6 epochs; `clip` is left at its default of None.
train(trn_dl,val_dl,seq2seq,seq2seq_loss,opt,epochs=6)

 48%|████▊     | 545/1124 [01:29<01:35,  6.09it/s]

In [None]:
def produce_out(val_dl, model,int2en,int2j,interval=(20,30)):
    """Print source, reference and predicted sentences for a slice of one batch.

    Args:
        val_dl: iterable of ``(x, y)`` batches; only the first one is used.
        model: the model to decode with. It is called as ``model(x)`` and is
            expected to return logits of shape (steps, batch, classes), as
            ``Seq2Seq.forward`` does.
        int2en, int2j: id -> token lists for the source and target sides.
        interval: ``(start, stop)`` sample indices within the batch; ``stop``
            is clamped to the batch size.

    For every sample three lines are printed (source, reference, prediction)
    followed by a blank line. Tokens with id 1 (pad) are omitted.
    """
    x,y = next(iter(val_dl))
    # Use the `model` argument (not the notebook global), feed it long ids on
    # the device the model lives on, and turn dropout off while decoding.
    inp = x.long()
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inp = inp.to(first_param.device)
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            probs = model(inp)
    finally:
        model.train(was_training)
    # Argmax over classes gives (steps, batch); transpose so each row is one
    # sample's prediction, then bring it to the CPU as plain Python ints.
    preds = probs.max(2)[1].t().cpu().tolist()
    stop = min(interval[1], len(preds))
    for i in range(interval[0], stop):
        print(' '.join([int2en[o] for o in x[i].tolist() if o != 1]))
        print(' '.join([int2j[o] for o in y[i].tolist() if o != 1]))
        print(' '.join([int2j[o] for o in preds[i] if o != 1]))
        print()