In [1]:
from pathlib import Path
import re,string
import numpy as np

In [2]:
path = Path('../data')

In [3]:
corpus = (path/'examples.utf').open().readlines()
corpus[:5]

['A: ムーリエルは２０歳になりました。\tMuiriel is 20 now.#ID=1282_4707\n',
 'B: は 二十歳(はたち){２０歳} になる[01]{になりました}\n',
 'A: すぐに戻ります。\tI will be back soon.#ID=1284_4709\n',
 'B: 直ぐに{すぐに} 戻る{戻ります}\n',
 'A: すぐに諦めて昼寝をするかも知れない。\tI may give up soon and just nap instead.#ID=1300_4727\n']

In [4]:
def make_corpus(corpus_path):
    corpus = corpus_path.open().readlines()
    en,ja = [],[]
    pat = r'#ID.+\n'
    for c in corpus:
        if 'A: ' in c:
            clean_c = c.replace('A: ','')
            res = re.search(pat,clean_c)
            clean_c = clean_c.replace(res.group(0),'').split('\t')
            ja.append(clean_c[0])
            en.append(clean_c[1])
    return en,ja

In [5]:
en, ja = make_corpus(path/'examples.utf')
en[:2],ja[:2]

(['Muiriel is 20 now.', 'I will be back soon.'],
 ['ムーリエルは２０歳になりました。', 'すぐに戻ります。'])

In [6]:
import MeCab

In [7]:
tagger = MeCab.Tagger('-Owakati')

def ja_tokenizer(text):
    result = tagger.parse(text)
    words = result.split()
    if len(words) ==0: return []
    if words[-1] == '\n':return words[:-1]
    return words

In [8]:
ja_tokenizer(ja[0])

['ムーリエル', 'は', '２', '０', '歳', 'に', 'なり', 'まし', 'た', '。']

In [9]:
import spacy
from spacy.symbols import ORTH

In [10]:
en_tok = spacy.load('en')

In [11]:
def en_tokenizer(text):
    text = text.lower()
    return [t.text for t in en_tok.tokenizer(text)]

In [12]:
en_tokenizer(en[0])

['muiriel', 'is', '20', 'now', '.']

In [13]:
en_toks = [en_tokenizer(text) for text in en]
ja_toks = [ja_tokenizer(text) for text in ja]
en_toks[:2], ja_toks[:2]

([['muiriel', 'is', '20', 'now', '.'],
  ['i', 'will', 'be', 'back', 'soon', '.']],
 [['ムーリエル', 'は', '２', '０', '歳', 'に', 'なり', 'まし', 'た', '。'],
  ['すぐ', 'に', '戻り', 'ます', '。']])

In [14]:
len(en_toks), len(ja_toks)

(149786, 149786)

In [15]:
from collections import Counter,defaultdict

In [16]:
def numericalize_tok(tokens, max_vocab=50000, min_freq=0, unk_tok="_unk_", pad_tok="_pad_", bos_tok="_bos_", eos_tok="_eos_"):
    if isinstance(tokens, str):
        raise ValueError("Expected to receive a list of tokens. Received a string instead")
    if isinstance(tokens[0], list):
        tokens = [p for o in tokens for p in o]
    freq = Counter(tokens)
    int2tok = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
    unk_id = 3
    int2tok.insert(0, bos_tok)
    int2tok.insert(1, pad_tok)
    int2tok.insert(2, eos_tok)
    int2tok.insert(unk_id, unk_tok)
    tok2int = defaultdict(lambda:unk_id, {v:k for k,v in enumerate(int2tok)})
    return int2tok, tok2int

In [19]:
int2j,j2int = numericalize_tok(ja_toks)
int2en,en2int = numericalize_tok(en_toks)

In [20]:
len(int2j), len(int2en)

(31813, 21393)

In [17]:
import pickle

In [22]:
pickle.dump(int2j,(path/'int2j.pkl').open('wb'))
pickle.dump(int2en,(path/'int2en.pkl').open('wb'))

In [18]:
int2j = pickle.load((path/'int2j.pkl').open('rb'))
int2en = pickle.load((path/'int2en.pkl').open('rb'))
j2int = defaultdict(lambda:3, {v:k for k,v in enumerate(int2j)})
en2int = defaultdict(lambda:3, {v:k for k,v in enumerate(int2en)})

In [19]:
len(int2j), len(int2en)

(31813, 21393)

In [20]:
j_ids = np.array([[0]+[j2int[o] for o in sent]+[2] for sent in ja_toks])
en_ids = np.array([[0]+[en2int[o] for o in sent]+[2] for sent in en_toks])
len(j_ids),len(en_ids), j_ids[10],en_ids[10]

(149786,
 149786,
 [0,
  48,
  6,
  4891,
  5,
  109,
  11,
  143,
  10,
  83,
  8,
  57,
  86,
  1798,
  7,
  2146,
  232,
  255,
  47,
  36,
  4,
  2],
 [0, 114, 2251, 107, 38, 97, 85, 2649, 77, 28, 356, 4, 2])

In [31]:
np.random.seed(42)
trn_keep = np.random.rand(len(en_ids))>0.1
en_trn,j_trn = en_ids[trn_keep],j_ids[trn_keep]
en_val,j_val = en_ids[~trn_keep],j_ids[~trn_keep]
len(en_trn),len(en_val)

(134775, 15011)

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd.variable import Variable
from torch.utils.data import Dataset,DataLoader

In [23]:
from numpy import array as A

In [24]:
class Seq2SeqDataset(Dataset):
    def __init__(self, x, y): self.x,self.y = x,y
    def __getitem__(self, idx): return A(self.x[idx]), A(self.y[idx])
    def __len__(self): return len(self.x)

In [32]:
trn_ds = Seq2SeqDataset(en_trn,j_trn)
val_ds = Seq2SeqDataset(en_val,j_val)

bs = 120

trn_dl = DataLoader(trn_ds,batch_size=bs,shuffle=True)
val_dl = DataLoader(val_ds,batch_size=bs)

In [33]:
x, y = next(iter(val_dl))
x.size(), y.size()

(torch.Size([120, 57]), torch.Size([120, 70]))

In [26]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [27]:
max(len(o) for o in en_ids), max(len(o) for o in j_ids)

(56, 70)

In [28]:
j_ids = pad_sequences(j_ids, maxlen=70, dtype='int32', padding='post', truncating='post', value=1)
en_ids = pad_sequences(en_ids, maxlen=57, dtype='int32', padding='post', truncating='post', value=1)

In [29]:
## load fasttext vectors
import io
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    header = fin.readline().split()
    n, d = int(header[0]), int(header[1])
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(tokens[1:], dtype=float)
    return data, n, d

In [33]:
# get word vectors
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
#!unzip wiki-news-300d-1M.vec.zip
#!gunzip cc.ja.300.vec.gz
#!mv wiki-news-300d-1M.vec ../data/
#!mv cc.ja.300.vec ../data/

In [30]:
en_vecs,_,dim_en_vec = load_vectors('../data/wiki-news-300d-1M.vec')
j_vecs,_,dim_j_vec = load_vectors('../data/cc.ja.300.vec')

In [34]:
def create_emb(vecs, itos, em_sz):
    emb = nn.Embedding(len(itos), em_sz, padding_idx=1)
    wgts = emb.weight.data
    miss = []
    for i,w in enumerate(itos):
        try: wgts[i] = torch.from_numpy(vecs[w])
        except: miss.append(w)
    print('Number of unknowns in data: {}'.format(len(miss)))
    return emb
    

In [35]:
def V(tensor,req_grad=True):
    if torch.cuda.is_available():return Variable(tensor.cuda())
    else: return Variable(tensor)

In [36]:
class Seq2Seq(nn.Module):
    def __init__(self,en_vecs,int2en,j_vecs,int2j,em_sz,nh=128,out_sl=25,dropf=1,nl=2):
        super().__init__()
        #encoder
        self.nl,self.nh,self.em_sz,self.out_sl = nl,nh,em_sz,out_sl
        self.emb_enc = create_emb(en_vecs,int2en,dim_en_vec)
        self.emb_drop = nn.Dropout(0.15*dropf)
        self.encoder = nn.GRU(dim_en_vec,nh,num_layers=nl,dropout=0.25*dropf, bidirectional=True)
        #decoder
        self.emb_dec = create_emb(j_vecs,int2j,dim_j_vec)
        self.decoder = nn.GRU(dim_en_vec,nh*2,num_layers=nl,dropout=0.25*dropf)
        self.out_drop = nn.Dropout(0.35*dropf)
        self.out = nn.Linear(nh*2,len(int2j))
    
    def forward(self,inp,y=None):
        sl, bs = inp.size()
        emb_in = self.emb_drop(self.emb_enc(inp))
        h_n = self.initHidden(bs)
        enc_out, h_n = self.encoder(emb_in,h_n)
        h_n = h_n.view(2,2,bs,-1).permute(0,2,1,3).contiguous().view(self.nl,bs,-1)
        
        dec_inp = V(torch.zeros(bs).long())
        res = []
        for i in range(self.out_sl):
            dec_emb = self.emb_dec(dec_inp)
            outp,h_n = self.decoder(dec_emb.unsqueeze(0),h_n)
            outp = self.out(self.out_drop(outp[0]))
            res.append(outp)
            dec_inp = outp.data.max(1)[1]
            if (dec_inp==1).all(): break
        return torch.stack(res)
        
    def initHidden(self,bs):
        return V(torch.zeros([self.nl*2,bs,self.nh]))

In [37]:
seq2seq = Seq2Seq(en_vecs,int2en,j_vecs,int2j,dim_en_vec)
seq2seq.cuda()

Number of unknowns in data: 974
Number of unknowns in data: 492


Seq2Seq(
  (emb_enc): Embedding(21393, 300, padding_idx=1)
  (emb_drop): Dropout(p=0.15)
  (encoder): GRU(300, 128, num_layers=2, dropout=0.25, bidirectional=True)
  (emb_dec): Embedding(31813, 300, padding_idx=1)
  (decoder): GRU(300, 256, num_layers=2, dropout=0.25)
  (out_drop): Dropout(p=0.35)
  (out): Linear(in_features=256, out_features=31813, bias=True)
)

In [38]:
out = seq2seq(V(x.transpose(1,0).long()))
out.size()

torch.Size([25, 120, 31813])

In [39]:
def seq2seq_loss(input, target):
    sl,bs = target.size()
    sl_in,bs_in,nc = input.size()
    if sl>sl_in: input = F.pad(input, (0,0,0,0,0,sl-sl_in))
    input = input[:sl]
    return F.cross_entropy(input.view(-1,nc), target.view(-1))

In [40]:
seq2seq_loss(out,V(y.transpose(1,0).long()))

tensor(10.3639, device='cuda:0', grad_fn=<NllLossBackward>)

In [41]:
def step(x, y, epoch, m, crit, opt, clip=None):
    output = m(x, y)
    opt.zero_grad()
    loss = crit(output, y)
    loss.backward()
    if clip:
        nn.utils.clip_grad_norm_(m.parameters(), clip)
    opt.step()
    return loss.data.item()

In [42]:
from tqdm import tqdm

In [43]:
def train(trn_dl,val_dl,model,crit,opt,epochs=10,clip=None):
    for epoch in range(epochs):
        loss_val = loss_trn = 0
        with tqdm(total=len(trn_dl)) as pbar:
            model.train()
            for i, ds in enumerate(trn_dl):
                x, y = ds
                x, y = x.transpose(1,0), y.transpose(1,0)
                loss = step(V(x.long()),V(y.long()),epoch,model,crit,opt)
                loss_trn += loss
                pbar.update()
        model.eval()
        for i, ds in enumerate(val_dl):
            with torch.no_grad():
                x, y = ds
                x, y = x.transpose(1,0), y.transpose(1,0)
                out = model(V(x.long()))
                loss_val+= crit(out, V(y.long()))
                #loss_val +=loss
        print(f'Epoch: {epoch} trn loss: {loss_trn/len(trn_dl)} val loss: {loss_val/len(val_dl)}')

In [44]:
from torch import optim

In [92]:
opt = optim.Adam(seq2seq.parameters(),lr=3e-3,betas=(0.7,0.8))

In [67]:
train(trn_dl,val_dl,seq2seq,seq2seq_loss,opt,epochs=20)

100%|██████████| 2106/2106 [04:27<00:00,  7.90it/s]
  0%|          | 1/2106 [00:00<04:41,  7.49it/s]

Epoch: 0 trn loss: 5.353990527639362 val loss: 4.235739231109619


100%|██████████| 2106/2106 [04:29<00:00,  7.89it/s]
  0%|          | 1/2106 [00:00<04:42,  7.45it/s]

Epoch: 1 trn loss: 5.3160017594086595 val loss: 4.451933860778809


100%|██████████| 2106/2106 [04:34<00:00,  7.67it/s]
  0%|          | 1/2106 [00:00<04:48,  7.31it/s]

Epoch: 2 trn loss: 5.301374776637339 val loss: 3.4797050952911377


100%|██████████| 2106/2106 [04:33<00:00,  7.75it/s]
  0%|          | 1/2106 [00:00<04:41,  7.48it/s]

Epoch: 3 trn loss: 5.27146399123037 val loss: 3.608781576156616


100%|██████████| 2106/2106 [04:31<00:00,  7.58it/s]
  0%|          | 1/2106 [00:00<04:43,  7.41it/s]

Epoch: 4 trn loss: 5.246326666385473 val loss: 3.31687068939209


100%|██████████| 2106/2106 [04:31<00:00,  7.72it/s]


Epoch: 5 trn loss: 5.223315402879561 val loss: 3.676764488220215


In [45]:
def produce_out(val_dl, model,int2en,int2j,interval=(20,30)):
    model.eval()
    x,y = next(iter(val_dl))
    probs = seq2seq(V(x.long()))
    preds = A(probs.max(2)[1])
    for i in range(interval[0],interval[1]):
        print(' '.join([int2en[o] for o in x[i,:] if o not in [0,1,2]]))
        print(''.join([int2j[o] for o in y[i,:] if o not in [0,1,2]]))
        print(''.join([int2j[o] for o in preds[:,i] if o not in [0,1,2]]))
        print()

In [113]:
produce_out(val_dl,seq2seq,int2en,int2j)

the contents of the four registers are preserved by the called subroutine .
4つのレジスタは、呼び出された関数側が保存する。
このははのににに。。。

that was when i was in the first year at high school , so 17 years have passed since then .
それが高1の時だから17年が経ちました。
彼女ののて。。

the siren sounded an emergency .
サイレンが急変を知らせました。
どうぞは。

the daughter was irritated with her mother , who always broke her promises .
娘はいつも、約束を守らない母親に苛立っていた。
彼ははが。。。

however the disciples awoke to that danger .
しかし、使徒たちはその危険に気付いた。
そのののののの、、、、、、、、。。。

no consideration is paid to people who are sensitive to chemicals .
化学物質に敏感な人々への配慮がない。
誰、、、、、、、、、、、、、、、、、、、。

that 's just standard practise , it 's not like they 're cutting corners .
それは定石通りというだけで、手を抜いたわけではないのです。
そのののをを。。。

until manet painted this picture , his female nudes were limited to goddesses .
マネがこの絵を描くまで、女性の裸像は女神に限られていました。
そのははのは。。。

thinking about those sorts of things , i watched " duck soup " again .
そんなことを考えながら『我輩はカモである』を再見しました。
そのははははは。。。

for a display where the data items increase and decr



In [46]:
import math,random

def rand_t(*sz): return torch.randn(sz)/math.sqrt(sz[0])
def rand_p(*sz): return nn.Parameter(rand_t(*sz))

In [47]:
class Seq2SeqAttention(nn.Module):
    def __init__(self,en_vecs,int2en,j_vecs,int2j,em_sz,nh=128,out_sl=25,dropf=1,nl=2):
        super().__init__()
        #encoder
        self.nl,self.nh,self.em_sz,self.out_sl = nl,nh,em_sz,out_sl
        self.emb_enc = create_emb(en_vecs,int2en,dim_en_vec)
        self.emb_drop = nn.Dropout(0.15*dropf)
        self.encoder = nn.GRU(dim_en_vec,nh,num_layers=nl,dropout=0.25*dropf, bidirectional=True)
        #decoder
        self.emb_dec = create_emb(j_vecs,int2j,dim_j_vec)
        self.decoder = nn.GRU(dim_en_vec,nh*2,num_layers=nl,dropout=0.25*dropf)
        self.out_drop = nn.Dropout(0.35*dropf)
        self.out = nn.Linear(nh*2,len(int2j))
        #attention layer
        self.W1 = rand_p(nh*2, nh*2) #parameter
        self.l2 = nn.Linear(nh*2, nh*2)
        self.l3 = nn.Linear(dim_en_vec+nh*2, dim_en_vec)
        self.V = rand_p(nh*2) #parameter
    
    def forward(self,inp,y=None):
        sl, bs = inp.size()
        emb_in = self.emb_drop(self.emb_enc(inp))
        h_n = self.initHidden(bs)
        enc_out, h_n = self.encoder(emb_in,h_n)
        h_n = h_n.view(2,2,bs,-1).permute(0,2,1,3).contiguous().view(self.nl,bs,-1)
        
        dec_inp = V(torch.zeros(bs).long())
        res = []
        #multiply by parameter
        w1e = enc_out @ self.W1
        for i in range(self.out_sl):
            #linear layer
            w2h = self.l2(h_n[-1])
            #non-linear activation
            u = torch.tanh(w1e + w2h)
            #multiply by parameter
            a = F.softmax(u @ self.V, 0)
            ##multiply by parameter
            Xa = (a.unsqueeze(2) * enc_out).sum(0)
            dec_emb = self.emb_dec(dec_inp)
            #linear layer
            wgt_enc = self.l3(torch.cat([dec_emb, Xa], 1))
            outp,h_n = self.decoder(wgt_enc.unsqueeze(0),h_n)
            outp = self.out(self.out_drop(outp[0]))
            res.append(outp)
            dec_inp = outp.data.max(1)[1]
            if (random.random() > 0.5) and y is not None: dec_inp=y[i] 
            if (dec_inp==1).all(): break
        return torch.stack(res)
        
    def initHidden(self,bs):
        return V(torch.zeros([self.nl*2,bs,self.nh]))

In [48]:
seq2seq = Seq2SeqAttention(en_vecs,int2en,j_vecs,int2j,dim_en_vec)
seq2seq.cuda()

Number of unknowns in data: 974
Number of unknowns in data: 492


Seq2SeqAttention(
  (emb_enc): Embedding(21393, 300, padding_idx=1)
  (emb_drop): Dropout(p=0.15)
  (encoder): GRU(300, 128, num_layers=2, dropout=0.25, bidirectional=True)
  (emb_dec): Embedding(31813, 300, padding_idx=1)
  (decoder): GRU(300, 256, num_layers=2, dropout=0.25)
  (out_drop): Dropout(p=0.35)
  (out): Linear(in_features=256, out_features=31813, bias=True)
  (l2): Linear(in_features=256, out_features=256, bias=True)
  (l3): Linear(in_features=556, out_features=300, bias=True)
)

In [49]:
opt = optim.Adam(seq2seq.parameters(),lr=3e-3,betas=(0.7,0.8))

In [50]:
train(trn_dl,val_dl,seq2seq,seq2seq_loss,opt,epochs=20)

100%|██████████| 1124/1124 [04:13<00:00,  5.16it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 0 trn loss: 7.638076941313693 val loss: 7.885180950164795


100%|██████████| 1124/1124 [04:22<00:00,  5.01it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 1 trn loss: 7.432089324947778 val loss: 7.711851119995117


100%|██████████| 1124/1124 [04:22<00:00,  4.91it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 2 trn loss: 7.407120456899188 val loss: 7.721337795257568


100%|██████████| 1124/1124 [04:21<00:00,  5.16it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 3 trn loss: 7.410874541968213 val loss: 7.796937465667725


100%|██████████| 1124/1124 [04:24<00:00,  5.12it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 4 trn loss: 7.417057778063194 val loss: 7.853030681610107


100%|██████████| 1124/1124 [04:24<00:00,  4.79it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 5 trn loss: 7.416791848440612 val loss: 7.804213523864746


100%|██████████| 1124/1124 [04:20<00:00,  4.89it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 6 trn loss: 7.417439313970003 val loss: 7.706363201141357


100%|██████████| 1124/1124 [04:21<00:00,  5.07it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 7 trn loss: 7.417343817147496 val loss: 7.669027328491211


100%|██████████| 1124/1124 [04:21<00:00,  5.13it/s]
  0%|          | 0/1124 [00:00<?, ?it/s]

Epoch: 8 trn loss: 7.42836168567481 val loss: 7.68925142288208


  8%|▊         | 92/1124 [00:21<04:01,  4.28it/s]


KeyboardInterrupt: 

In [51]:
produce_out(val_dl,seq2seq,int2en,int2j)

the contents of the four registers are preserved by the called subroutine .
4つのレジスタは、呼び出された関数側が保存する。
、、、、、、、、、、、、、ののか、、

that was when i was in the first year at high school , so 17 years have passed since then .
それが高1の時だから17年が経ちました。
は

the siren sounded an emergency .
サイレンが急変を知らせました。
ははがが。

the daughter was irritated with her mother , who always broke her promises .
娘はいつも、約束を守らない母親に苛立っていた。
１０に、をををているのををているのです。

however the disciples awoke to that danger .
しかし、使徒たちはその危険に気付いた。
ははののでは、ののををているのだ。

no consideration is paid to people who are sensitive to chemicals .
化学物質に敏感な人々への配慮がない。
人ががているのををするのは、のはををている。

that 's just standard practise , it 's not like they 're cutting corners .
それは定石通りというだけで、手を抜いたわけではないのです。
、、、にに、、、、、、、、、、、、、、、、、、、

until manet painted this picture , his female nudes were limited to goddesses .
マネがこの絵を描くまで、女性の裸像は女神に限られていました。
、、にに、、、、、、、、、、、、、、、、、、、、

thinking about those sorts of things , i watched " duck soup " again .
そんなことを考えながら『我輩はカモである』を再見しました。
ははをををているのををている。

for