In [28]:
import math
import time
from collections import Counter
import random
import jieba
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence

In [29]:
# Seed Python's `random` module (used for the teacher-forcing coin flip in
# Seq2Seq.forward) and PyTorch's RNG so that repeated runs are comparable.
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1d76f1dbe30>

In [30]:
# Data-pipeline settings and reserved vocabulary ids.
BATCH_SIZE = 4           # sentence pairs per DataLoader batch
MAX_VOCAB_SIZE = 30000   # default cap on the number of distinct tokens kept by build_voc
PAD_IDX = 0              # id reserved for "<pad>"
UNK_IDX = 1              # id reserved for "<unk>"
# NOTE(review): the four values below are not referenced by the cells visible in
# this notebook (the model cell defines its own ENC_*/DEC_* constants and the
# training cell its own N_EPOCHS) -- confirm whether they are still needed.
EMB_DIM = 256
HIDDEN_SIZE = 512
DROPOUT = 0.5
EPOCHS = 10

In [31]:
def load_data(in_file):
    """Read a tab-separated parallel corpus and tokenize both sides.

    Each non-blank line must contain at least two tab-separated fields: the
    English sentence followed by the Chinese sentence. Any further fields
    (some sentence-pair dumps append extra columns such as attribution) are
    ignored.

    Args:
        in_file: path to the UTF-8 encoded corpus file.

    Returns:
        (contexts, targets): two parallel lists of token lists. English
        sentences are tokenized with nltk and lower-cased, Chinese sentences
        are segmented with jieba. Every token list is wrapped in "<bos>" and
        "<eos>".

    Raises:
        ValueError: if a non-blank line has fewer than two fields.
    """
    contexts = []
    targets = []
    with open(in_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at EOF) carry no pair.
                continue
            fields = stripped.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"{in_file}:{line_no}: expected 'source<TAB>target', got {stripped!r}"
                )
            # Only the first two columns are the sentence pair.
            context, target = fields[0], fields[1]
            context_list = [token.lower() for token in nltk.word_tokenize(context)]
            target_list = list(jieba.cut(target, cut_all=False, HMM=True))
            contexts.append(["<bos>"] + context_list + ["<eos>"])
            targets.append(["<bos>"] + target_list + ["<eos>"])
    return contexts,targets

In [32]:
# Load the parallel corpus; `contexts` holds the tokenized English sentences and
# `targets` the tokenized Chinese sentences (see load_data).
file_path = "./cmn.txt"
contexts,targets= load_data(file_path)

In [33]:
def build_voc(sentences, max_words=MAX_VOCAB_SIZE):
    """Build the token<->id lookup tables for one language.

    Args:
        sentences: iterable of token lists.
        max_words: keep at most this many of the most frequent tokens.

    Returns:
        (id2wd, wd2id, total_wds) where
        * id2wd is a list such that id2wd[i] is the token with id i,
        * wd2id is the inverse dict (token -> id),
        * total_wds is the vocabulary size including "<pad>" and "<unk>".

    "<pad>" and "<unk>" occupy ids PAD_IDX and UNK_IDX; the remaining tokens
    follow in descending frequency order starting at id 2.
    """
    word_count = Counter()
    for sentence in sentences:
        word_count.update(sentence)

    # most_common(n) returns the n most frequent (token, count) pairs in
    # descending count order; if n exceeds the number of distinct tokens it
    # returns all of them.
    frequent_words = [w for w, _ in word_count.most_common(max_words)]

    # The reserved tokens must sit at the *front* of id2wd, in the slots named
    # by PAD_IDX / UNK_IDX, so that id2wd[wd2id[w]] == w for every token.
    # (Raises IndexError if the reserved ids are not 0 and 1.)
    reserved = [None, None]
    reserved[PAD_IDX] = "<pad>"
    reserved[UNK_IDX] = "<unk>"
    id2wd = reserved + frequent_words

    wd2id = {}
    for i, w in enumerate(id2wd):
        # setdefault keeps the reserved id if the corpus itself happens to
        # contain a literal "<pad>" / "<unk>" token.
        wd2id.setdefault(w, i)

    total_wds = len(id2wd)
    return id2wd,wd2id,total_wds
id2en,en2id,total_en = build_voc(contexts)
id2cn,cn2id,total_cn = build_voc(targets)

In [34]:
class MyDataset(Dataset):
    """Parallel-sentence dataset that stores every sentence as a LongTensor of ids.

    Args:
        contexts: list of source-language token lists.
        targets: list of target-language token lists (parallel to `contexts`).
        en2id: token -> id dict for the source language.
        cn2id: token -> id dict for the target language.
        sort_by_len: if True, order the pairs by ascending source length.

    Each item is a dict with keys 'context', 'target' (1-D LongTensors) and
    'context_len', 'target_len' (Python ints).
    """

    def __init__(self, contexts, targets, en2id, cn2id, sort_by_len=True):
        super().__init__()
        self.en2id = en2id
        self.cn2id = cn2id
        self.sort_by_len = sort_by_len

        # Out-of-vocabulary tokens must map to "<unk>", never to the padding
        # id: padding positions are masked out of attention and ignored by the
        # loss, which would silently drop the unknown words.
        # The fallback of 1 is the conventional "<unk>" id used in this file.
        en_unk = en2id.get("<unk>", 1)
        cn_unk = cn2id.get("<unk>", 1)

        self.contexts = [torch.LongTensor([en2id.get(w, en_unk) for w in sent]) for sent in contexts]
        self.targets = [torch.LongTensor([cn2id.get(w, cn_unk) for w in sent]) for sent in targets]

        if self.sort_by_len:
            # Permutation that orders the pairs by source-sentence length.
            self.sorted_idx = sorted(range(len(self.contexts)), key=lambda x: len(self.contexts[x]))
            self.contexts = [self.contexts[i] for i in self.sorted_idx]
            self.targets = [self.targets[i] for i in self.sorted_idx]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.contexts)

    def __getitem__(self,idx):
        """Return the idx-th pair together with the un-padded lengths."""
        context = self.contexts[idx]
        target = self.targets[idx]
        context_len = torch.numel(context)
        target_len = torch.numel(target)
        return {'context': context, 'target': target, 'context_len': context_len, 'target_len': target_len}
def my_collate_fn(batch):
    """Merge a list of dataset items into one batch dict.

    `batch` is a list (length = batch size) of dicts as produced by
    MyDataset.__getitem__. For the 'context' and 'target' entries the
    variable-length tensors are zero-padded to a common length and stacked
    batch-first; every other entry is returned as a plain Python list.
    """
    sequence_fields = ("context", "target")
    collated = {}
    for field in batch[0]:
        values = [sample[field] for sample in batch]
        if field in sequence_fields:
            values = pad_sequence(values, batch_first=True)
        collated[field] = values
    return collated
# Build the dataset and split it 60% / 20% / 20% into train / validation / test.
full_dataset = MyDataset(contexts,targets,en2id,cn2id)
train_size = int(0.6 * len(full_dataset))
valid_size = int(0.2 * len(full_dataset))
# The test split takes whatever is left so the three sizes always sum to the total.
test_size = len(full_dataset) - train_size - valid_size
train_dataset,valid_dataset,test_dataset = torch.utils.data.random_split(full_dataset, [train_size,valid_size,test_size])
# Only the training loader is shuffled. Evaluation averages per-batch losses,
# so a fixed batch composition keeps validation/test numbers reproducible.
train_iterator = DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=my_collate_fn)
valid_iterator = DataLoader(dataset=valid_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=my_collate_fn)
test_iterator = DataLoader(dataset=test_dataset,batch_size=BATCH_SIZE,shuffle=False,collate_fn=my_collate_fn)

In [35]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [36]:
class Encoder(nn.Module):
    """Bidirectional single-layer GRU encoder.

    Args:
        input_dim: size of the source vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of each GRU direction.
        dec_hid_dim: hidden size of the decoder (size of the returned hidden state).
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        # Projects the concatenated final forward/backward states to the decoder size.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        """Encode a padded batch.

        Args:
            src: LongTensor [src sent len, batch size] of token ids.
            src_len: un-padded length of every sequence, [batch size]
                (tensor or list of ints).

        Returns:
            outputs: [src sent len, batch size, enc hid dim * 2]; positions
                past a sequence's length are zero.
            hidden: [batch size, dec hid dim], initial decoder hidden state.
        """
        embedded = self.dropout(self.embedding(src))

        #embedded = [src sent len, batch size, emb dim]

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # data itself lives on the GPU.
        if torch.is_tensor(src_len):
            src_len = src_len.cpu()

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len, enforce_sorted=False)

        packed_outputs, hidden = self.rnn(packed_embedded)

        #packed_outputs is a packed sequence containing all hidden states
        #hidden is now from the final non-padded element in the batch

        # total_length pins the time dimension to that of `src`, so the outputs
        # line up with a padding mask built from `src` even if every sequence
        # in the batch is shorter than the padded length.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, total_length=src.shape[0])

        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]

        #hidden [-2, :, : ] is the last of the forwards RNN
        #hidden [-1, :, : ] is the last of the backwards RNN

        #initial decoder hidden is final hidden state of the forwards and backwards
        #  encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        #outputs = [sent len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]

        return outputs, hidden

In [37]:
class Attention(nn.Module):
    """Additive attention over the encoder states.

    Scores every source position with v . tanh(W [hidden; encoder_output])
    and normalises the scores with a softmax, ignoring masked positions.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, encoder_outputs, mask):
        """Compute attention weights.

        Args:
            hidden: [batch size, dec hid dim] current decoder state.
            encoder_outputs: [src sent len, batch size, enc hid dim * 2].
            mask: [batch size, src sent len]; positions equal to 0 are excluded.

        Returns:
            [batch size, src sent len] weights, each row summing to 1.
        """
        src_len, batch_size = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # One copy of the decoder state per source position.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # -> [batch size, src sent len, dec hid dim]

        enc_batch_first = encoder_outputs.permute(1, 0, 2)
        # -> [batch size, src sent len, enc hid dim * 2]

        features = torch.cat((tiled_hidden, enc_batch_first), dim = 2)
        energy = torch.tanh(self.attn(features)).permute(0, 2, 1)
        # -> [batch size, dec hid dim, src sent len]

        v_batched = self.v.repeat(batch_size, 1).unsqueeze(1)
        # -> [batch size, 1, dec hid dim]

        scores = torch.bmm(v_batched, energy).squeeze(1)
        # -> [batch size, src sent len]

        # A very negative score drives the softmax weight of masked positions to ~0.
        scores = scores.masked_fill(mask == 0, -1e10)

        return F.softmax(scores, dim = 1)

In [38]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        enc_hid_dim: hidden size of each encoder direction.
        dec_hid_dim: decoder hidden size.
        dropout: dropout probability applied to the embeddings.
        attention: module called as attention(hidden, encoder_outputs, mask)
            returning weights of shape [batch size, src sent len].
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """Run one decoding step.

        Args:
            input: [batch size] ids of the previous target tokens.
            hidden: [batch size, dec hid dim] previous decoder state.
            encoder_outputs: [src sent len, batch size, enc hid dim * 2].
            mask: [batch size, src sent len] source padding mask.

        Returns:
            (logits [batch size, output dim],
             new hidden [batch size, dec hid dim],
             attention weights [batch size, src sent len])
        """
        # Add a length-1 time axis: [1, batch size]
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))
        # embedded = [1, batch size, emb dim]

        attn_weights = self.attention(hidden, encoder_outputs, mask).unsqueeze(1)
        # attn_weights = [batch size, 1, src sent len]

        enc_batch_first = encoder_outputs.permute(1, 0, 2)
        # Attention-weighted sum of encoder states, moved back to time-major.
        context = torch.bmm(attn_weights, enc_batch_first).permute(1, 0, 2)
        # context = [1, batch size, enc hid dim * 2]

        rnn_out, new_hidden = self.rnn(torch.cat((embedded, context), dim = 2), hidden.unsqueeze(0))

        # One time step, one layer, one direction: the GRU output and its
        # final hidden state are the same tensor values.
        assert (rnn_out == new_hidden).all()

        features = torch.cat(
            (rnn_out.squeeze(0), context.squeeze(0), embedded.squeeze(0)),
            dim = 1,
        )
        logits = self.out(features)
        # logits = [batch size, output dim]

        return logits, new_hidden.squeeze(0), attn_weights.squeeze(1)

In [39]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper handling teacher forcing and greedy inference.

    Args:
        encoder: module called as encoder(src, src_len) -> (outputs, hidden).
        decoder: module called as decoder(input, hidden, encoder_outputs, mask)
            -> (logits, hidden, attention); must expose `output_dim`.
        pad_idx: padding id in the source vocabulary (used for the mask).
        sos_idx: start-of-sentence id in the target vocabulary.
        eos_idx: end-of-sentence id in the target vocabulary.
        device: device on which the result buffers are allocated.
    """

    def __init__(self, encoder, decoder, pad_idx, sos_idx, eos_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.device = device

    def create_mask(self, src):
        """Return a [batch size, src sent len] mask that is True at non-pad positions."""
        mask = (src != self.pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5, max_decode_len = 100):
        """Translate a batch.

        Args:
            src: [src sent len, batch size] source ids.
            src_len: [batch size] un-padded source lengths.
            trg: [trg sent len, batch size] target ids, or None for inference.
            teacher_forcing_ratio: probability of feeding the gold token at each
                step (e.g. 0.75 -> teacher forcing 75% of the time). Must be 0
                when `trg` is None.
            max_decode_len: maximum number of decoding steps during inference.

        Returns:
            (outputs, attentions): logits [T, batch size, output dim] and
            attention weights [T, batch size, src sent len]. Row 0 is always
            zero (it corresponds to the start token). During inference T is
            the step at which every sequence in the batch has produced
            `eos_idx` (the EOS step itself is not included); sequences that
            finish earlier keep being decoded until the last one finishes.
        """
        inference = trg is None
        if inference:
            assert teacher_forcing_ratio == 0, "Must be zero during inference"
            # Dummy target: only row 0 (the start tokens) is ever read.
            trg = torch.full((max_decode_len, src.shape[1]), self.sos_idx,
                             dtype=torch.long, device=self.device)

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        #tensor to store attention
        attentions = torch.zeros(max_len, batch_size, src.shape[0]).to(self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src, src_len)

        #first input to the decoder is the <sos> tokens
        output = trg[0,:]

        mask = self.create_mask(src)
        #mask = [batch size, src sent len]

        # Per-sequence flag: has this sequence emitted EOS yet? Tracking a flag
        # per sequence (instead of calling .item()) lets inference run with
        # any batch size.
        finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)

        for t in range(1, max_len):
            output, hidden, attention = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            attentions[t] = attention
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            if inference:
                finished = finished | (output == self.eos_idx)
                if bool(finished.all()):
                    return outputs[:t], attentions[:t]

        return outputs, attentions

In [40]:
# Model dimensions.
INPUT_DIM = total_en     # source (English) vocabulary size
OUTPUT_DIM = total_cn    # target (Chinese) vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
PAD_IDX = 0
# Seq2Seq uses these ids on the *target* side (first decoder input and the
# stop condition), so read them from the target vocabulary instead of assuming
# a particular frequency rank for "<bos>" / "<eos>".
BOS_IDX = cn2id["<bos>"]
EOS_IDX = cn2id["<eos>"]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, PAD_IDX, BOS_IDX, EOS_IDX, device).to(device)

optimizer = optim.Adam(model.parameters())

# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [43]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch.

    Args:
        model: called as model(src, src_len, trg) and returning
            (logits [trg sent len, batch size, output dim], attention).
        iterator: sized iterable of batch dicts with keys 'context', 'target'
            (batch-first LongTensors) and 'context_len' (list of ints).
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (logits [N, output dim], targets [N]).
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        Mean per-batch loss over the epoch (float).
    """
    model.train()

    # Batches come out of the DataLoader on the CPU; they must be moved to
    # wherever the model's weights live before the forward pass.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        # [batch size, sent len] -> [sent len, batch size]
        src = torch.transpose(batch["context"], 0, 1).to(model_device)
        trg = torch.transpose(batch["target"], 0, 1).to(model_device)
        # Lengths stay on the CPU (required by pack_padded_sequence).
        src_len = torch.LongTensor(batch["context_len"])
        # src_len = [batch size]

        optimizer.zero_grad()

        output, _ = model(src, src_len, trg)

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        # Drop position 0 (the start token) and flatten time and batch.
        output = output[1:].reshape(-1, output.shape[-1])
        trg = trg[1:].reshape(-1)

        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [44]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    Args:
        model: called as model(src, src_len, trg, 0) and returning
            (logits [trg sent len, batch size, output dim], attention).
        iterator: sized iterable of batch dicts with keys 'context', 'target'
            (batch-first LongTensors) and 'context_len' (list of ints).
        criterion: loss taking (logits [N, output dim], targets [N]).

    Returns:
        Mean per-batch loss (float).
    """
    model.eval()

    # Batches come out of the DataLoader on the CPU; they must be moved to
    # wherever the model's weights live before the forward pass.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            # [batch size, sent len] -> [sent len, batch size]
            src = torch.transpose(batch["context"], 0, 1).to(model_device)
            trg = torch.transpose(batch["target"], 0, 1).to(model_device)
            # Lengths stay on the CPU (required by pack_padded_sequence).
            src_len = torch.LongTensor(batch["context_len"])
            # src_len = [batch size]

            output, attention = model(src, src_len, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            # Drop position 0 (the start token) and flatten time and batch.
            output = output[1:].reshape(-1, output.shape[-1])
            trg = trg[1:].reshape(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [45]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and remaining whole seconds.

    Returns:
        (minutes, seconds) as ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [47]:
# Training driver: train for N_EPOCHS epochs, validate after each one and keep
# the weights with the lowest validation loss on disk.
N_EPOCHS = 1   # number of passes over the training set (independent of EPOCHS defined earlier)
CLIP = 1       # max gradient norm handed to clip_grad_norm_ inside train()

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

KeyboardInterrupt: 

In [48]:
def translate_sentence(model, sentence):
    """Greedily translate one English sentence.

    Args:
        model: a Seq2Seq model; called as model(src, src_len, None, 0).
        sentence: raw English text.

    Returns:
        List of predicted target-language tokens (looked up in `id2cn`),
        without the leading placeholder position that corresponds to the
        start token.
    """
    model.eval()
    tokenized = nltk.word_tokenize(sentence)
    tokenized = ["<bos>"] + [token.lower() for token in tokenized] + ["<eos>"]
    # Words missing from the source vocabulary become "<unk>".
    numericalized = [en2id.get(w, UNK_IDX) for w in tokenized]
    # The length tensor deliberately stays on the CPU: pack_padded_sequence
    # rejects lengths that live on the GPU.
    sentence_length = torch.LongTensor([len(numericalized)])
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)
    # tensor = [sen len,1]
    with torch.no_grad():  # inference only, no autograd graph needed
        translation_tensor_logits, attention = model(tensor, sentence_length, None, 0)
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    # translation_tensor = [trg len]
    translation = [id2cn[int(token_id)] for token_id in translation_tensor]
    # Position 0 is the all-zero row reserved for the start token; drop it.
    return translation[1:]

In [49]:
# Smoke test: translate one sample sentence with the current model weights.
sentence = "I can drive a car."
print(translate_sentence(model,sentence))

['了', '该', '他', '的']
