In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# import spacy

import random
import math
import time

import collections
import pickle
import os
import tensorflow as tf
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
import torch.nn.utils.rnn as rnn_utils

In [2]:
# Project root directory (not referenced by the cells visible in this notebook).
proj_dir = './'
# Directory prefix for the tab-separated source/target text files.
data_dir = './data/'
class Vocabulary(object):
    """Two-way mapping between word tokens and integer ids.

    Ids 0-3 are reserved for the special tokens ``<SOS>``, ``<EOS>``,
    ``<PAD>`` and ``<UNK>``. Every other word receives the next free id in
    the order in which :meth:`build_vocab` first sees it.
    """

    def __init__(self):
        self.word2idx = {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}
        self.idx2word = {0: '<SOS>', 1: '<EOS>', 2: '<PAD>', 3: '<UNK>'}
        self.num_words = 4
        # Out-of-vocabulary bookkeeping, filled in by sequence_to_indices().
        self.OOV_list = []
        self.OOV = 0

    def build_vocab(self, data_path):
        """Add every word of the source column of ``data_path`` to the vocabulary.

        Each line must consist of exactly two tab-separated fields; only the
        whitespace-separated words of the first field are indexed. The token
        lists of the first five lines are printed as a sanity check.

        :param data_path: path of a UTF-8 text file
        :raises AssertionError: if a line does not split into two fields
        """
        with open(data_path, 'r', encoding='utf-8') as dataset:
            for line_no, line in enumerate(dataset):
                parts = line.strip('\n').strip().split('\t')
                assert len(parts) == 2, 'Not 2 parts'+line
                words = parts[0].split()
                if line_no < 5:
                    print(words)
                for word in words:
                    if word not in self.word2idx:
                        self.word2idx[word] = self.num_words
                        self.idx2word[self.num_words] = word
                        self.num_words += 1

    def sequence_to_indices(self, sequence, add_eos=False, add_sos=False):
        """Map an iterable of word tokens to a list of vocabulary ids.

        Unknown tokens map to the ``<UNK>`` id; each one also increments
        ``self.OOV`` and is appended to ``self.OOV_list``.

        :param sequence: iterable of word tokens
        :param add_eos: if true, append the <EOS> id after the last token
        :param add_sos: if true, put the <SOS> id before the first token
        :return: list of integer ids
        """
        index_sequence = [self.word2idx['<SOS>']] if add_sos else []

        for word in sequence:
            if word in self.word2idx:
                index_sequence.append(self.word2idx[word])
            else:
                self.OOV += 1
                self.OOV_list.append(word)
                index_sequence.append(self.word2idx['<UNK>'])

        if add_eos:
            index_sequence.append(self.word2idx['<EOS>'])

        return index_sequence

    def indices_to_sequence(self, indices, print_signal=False):
        """Map an iterable of ids back to word tokens.

        Decoding stops at the first ``<PAD>`` (which is not emitted). Unless
        ``print_signal`` is true it also stops right after the first
        ``<EOS>``, which is kept in the result.

        :param indices: iterable of integer ids present in ``idx2word``
        :param print_signal: if true, do not stop at <EOS>
        :return: list of word tokens
        """
        sequence = []
        for idx in indices:
            word = self.idx2word[idx]
            if word == '<PAD>':
                break
            sequence.append(word)
            if word == "<EOS>" and not print_signal:
                break
        return sequence

    def __str__(self):
        # Build the listing with join() rather than rebinding the builtin
        # name ``str`` and concatenating inside the loop.
        entries = ["word: %s Index: %d\n" % (word, idx)
                   for idx, word in self.idx2word.items()]
        return "Vocab information:\n" + "".join(entries)

In [10]:
class MyData(Dataset):
    """Parallel source/target dataset read from a tab-separated text file.

    Every line of ``path`` must be ``source<TAB>target``; both sides are
    split on whitespace and converted to ids with
    ``vocab.sequence_to_indices``. Source and target id sequences are each
    padded with the ``<PAD>`` id to the longest sequence on their side.

    Sentences that do not start with ``<SOS>`` are printed as a sanity check,
    and a one-line summary is printed once loading is complete.

    :param path: path of the UTF-8 data file
    :param vocab: object exposing ``word2idx``, ``num_words`` and
        ``sequence_to_indices``
    :raises AssertionError: if a line does not have exactly two fields
    """

    def __init__(self, path,vocab):
        self.src_indices_seq = []
        self.trg_indices_seq = []
        self.vocab = vocab
        self.PAD_ID = self.vocab.word2idx["<PAD>"]
        self.SOS_ID = self.vocab.word2idx["<SOS>"]
        self.vocab_size = self.vocab.num_words
        self.max_length = -1
        self.raw_src_sent_data = [] #src sentence
        self.raw_trg_sent_data = [] #target sentence

        # The context manager closes the file even if a line is malformed.
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                pparts = line.strip('\n').split('\t')
                assert len(pparts) == 2, 'Error!!'
                words, trg_sentence = pparts

                trg_sentence = trg_sentence.split()
                self.raw_trg_sent_data.append(trg_sentence)
                # An empty side is reported like any other sentence without
                # a leading <SOS>, instead of raising IndexError.
                if not trg_sentence or trg_sentence[0] != '<SOS>':
                    print('trg:',trg_sentence)

                src_sent = words.strip('\n').split()
                self.raw_src_sent_data.append(src_sent)
                if not src_sent or src_sent[0] != '<SOS>':
                    print('src:',src_sent)

                self.max_length = max(self.max_length, len(trg_sentence), len(src_sent))

        assert len(self.raw_trg_sent_data)==len(self.raw_src_sent_data),'Error 2!'+line

        # Encode source before target for every pair, so any OOV bookkeeping
        # done by the vocabulary sees the tokens in file order.
        for src_sent, trg_sent in zip(self.raw_src_sent_data, self.raw_trg_sent_data):
            src_ids = self.vocab.sequence_to_indices(src_sent, add_eos=False)
            self.src_indices_seq.append(torch.tensor(src_ids, dtype=torch.long))
            trg_ids = self.vocab.sequence_to_indices(trg_sent, add_eos=False)
            self.trg_indices_seq.append(torch.tensor(trg_ids, dtype=torch.long))

        self.src_indices_seq = rnn_utils.pad_sequence(self.src_indices_seq, batch_first=True, padding_value=self.PAD_ID)
        self.trg_indices_seq = rnn_utils.pad_sequence(self.trg_indices_seq, batch_first=True, padding_value=self.PAD_ID)
        print("## J: Total examples: %d, unique words:%d, Max seq length: %d"%(len(self.src_indices_seq),self.vocab_size,self.max_length))

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_indices_seq)

    def __getitem__(self, idx):
        """Return copies of the padded ``(source_ids, target_ids)`` rows at ``idx``."""
        # clone().detach() still returns independent copies, but without the
        # copy-construct UserWarning that torch.tensor(<tensor>) emits.
        return self.src_indices_seq[idx].clone().detach(), self.trg_indices_seq[idx].clone().detach()


In [4]:
# Prefer the second GPU (index 1, as originally configured). Fall back to the
# first GPU, then to the CPU, so that `.to(device)` calls do not fail on
# machines with fewer than two CUDA devices.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

In [5]:
class Encoder(nn.Module):
    """GRU encoder that summarises a batch of source sequences.

    Token ids are embedded, layer-normalised and passed through dropout
    before they reach a ``batch_first`` multi-layer GRU. Only the final
    hidden state is returned.

    :param input_dim: number of rows of the embedding table
    :param emb_dim: embedding size
    :param hid_dim: GRU hidden size
    :param n_layers: number of stacked GRU layers
    :param dropout: dropout probability applied to the normalised embeddings
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes because Seq2Seq checks them against the decoder.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.layerN = nn.LayerNorm(emb_dim)

    def forward(self, src):
        """Encode ``src`` and return the last GRU hidden state.

        :param src: token ids; with ``batch_first=True`` the GRU treats the
            first axis as the batch, i.e. ``[batch size, src sent len]``
        :return: hidden state ``[n layers, batch size, hid dim]``
        """
        token_vectors = self.embedding(src)
        normalised = self.layerN(token_vectors)
        embedded = self.dropout(normalised)

        # The per-step outputs are not needed, only the final state.
        _, hidden = self.gru(embedded)
        return hidden

In [6]:
class Decoder(nn.Module):
    """Single-step GRU decoder.

    One call consumes one token per batch element together with the previous
    hidden state and produces vocabulary logits plus the new hidden state.

    :param output_dim: vocabulary size (embedding rows and logit width)
    :param emb_dim: embedding size
    :param hid_dim: GRU hidden size
    :param n_layers: number of stacked GRU layers
    :param dropout: dropout probability applied to the normalised embeddings
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
        self.out = nn.Linear(hid_dim, output_dim)

        self.layerN_emb = nn.LayerNorm(emb_dim)
        self.layerN = nn.LayerNorm(hid_dim)
        self.dropout = nn.Dropout(dropout)
        # Fixed-rate dropout in front of the output projection.
        self.dp_dense = nn.Dropout(0.1)

    def forward(self, input, hidden):
        """Run one decoding step.

        :param input: current token ids, ``[batch size]``
        :param hidden: previous hidden state, ``[n layers, batch size, hid dim]``
        :return: ``(prediction, hidden)`` where prediction is
            ``[batch size, output dim]`` and hidden has the same shape as
            the incoming state
        """
        # Add a length-1 time axis: [batch size] -> [batch size, 1].
        step_tokens = input.unsqueeze(1)

        embedded = self.embedding(step_tokens)
        embedded = self.layerN_emb(embedded)
        embedded = self.dropout(embedded)

        # output: [batch size, 1, hid dim] because the GRU is batch_first.
        output, hidden = self.gru(embedded, hidden)

        features = self.layerN(output.squeeze(1))
        prediction = self.out(self.dp_dense(features))

        return prediction, hidden
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with optional teacher forcing.

    :param encoder: module mapping ``src`` to the initial decoder hidden
        state; must expose ``hid_dim`` and ``n_layers``
    :param decoder: module called as ``decoder(input, hidden)`` returning
        ``(logits, hidden)``; must expose ``hid_dim``, ``n_layers`` and
        ``output_dim``
    :param device: stored as ``self.device`` for callers; buffers created in
        ``forward`` follow the device of the incoming batch instead
    :raises AssertionError: if encoder and decoder disagree on hidden size or
        number of layers
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode ``trg.shape[1] - 1`` steps starting from ``trg[:, 0]``.

        :param src: source token ids, ``[batch size, src sent len]``
        :param trg: target token ids, ``[batch size, trg sent len]``; column 0
            is the first decoder input
        :param teacher_forcing_ratio: probability, drawn once per step for the
            whole batch, of feeding the ground-truth token instead of the
            model's own prediction (0 disables teacher forcing)
        :return: ``(outputs, outputs_idx)`` where ``outputs`` is
            ``[batch size, trg sent len, output dim]`` logits (position 0
            stays zero) and ``outputs_idx`` is a CPU float tensor
            ``[batch size, trg sent len - 1]`` of arg-max token ids
        """
        batch_size, max_len = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the device of the batch. Relying on the stored
        # self.device breaks when the module has been moved or re-mapped
        # (e.g. loaded with map_location) since it was constructed.
        outputs = torch.zeros(batch_size, max_len, trg_vocab_size, device=trg.device)
        # Kept as a default (CPU, float) tensor so the return value is unchanged.
        outputs_idx = torch.zeros(batch_size, max_len-1)

        # Last hidden state of the encoder initialises the decoder.
        hidden = self.encoder(src)

        # First decoder input is column 0 of the target (the <SOS> tokens).
        input = trg[:,0]

        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:,t,:] = output

            # One draw per time step decides for the whole batch.
            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)
            outputs_idx[:,t-1] = top1

            input = trg[:,t] if teacher_force else top1

        return outputs,outputs_idx

In [7]:
from tqdm import tqdm, trange
def decode_string(output,dataset):
    """Turn predicted id sequences into sentences and score exact matches.

    Each row of ``output`` is decoded with
    ``dataset.vocab.indices_to_sequence``, prefixed with ``<SOS>`` and
    compared as a space-joined string with the reference at the same
    position in ``dataset.raw_trg_sent_data``. The first ten predictions and
    references are printed for inspection.

    :param output: sequence of id sequences, aligned with the dataset order
    :param dataset: object exposing ``vocab`` and ``raw_trg_sent_data``
    :return: ``(results, accuracy)``, the decoded sentences and the fraction
        that match their reference exactly (0.0 when ``output`` is empty)
    """
    results = []
    trg = []
    correct = 0
    for i,seq in enumerate(output):
        results.append(' '.join(['<SOS>']+dataset.vocab.indices_to_sequence(seq)))
        trg.append(' '.join(dataset.raw_trg_sent_data[i]))
        if results[-1] == trg[-1]:
            correct+=1

    print('-----exm-----')
    print(results[:10])
    print(trg[:10])

    # Guard the division: with no predictions the accuracy is defined as 0.0
    # instead of raising ZeroDivisionError.
    total = len(output)
    accuracy = correct/total if total else 0.0
    return results, accuracy

def evaluate(model,criterion,dataset,batch_size=512):
    """Run the model over ``dataset`` without teacher forcing and score it.

    Batches are moved to the notebook-level ``device``. The first time step
    is dropped from both the logits and the targets before the loss is
    computed.

    :param model: callable as ``model(src, trg, teacher_forcing_ratio)``
        returning ``(logits, predicted_ids)``
    :param criterion: loss taking ``[N, output dim]`` logits and ``[N]`` targets
    :param dataset: dataset yielding ``(src, trg)`` pairs; also passed to
        ``decode_string`` for the reference sentences
    :param batch_size: evaluation batch size (defaults to the previously
        hard-coded 512)
    :return: ``(mean loss per batch, decoded sentences, exact-match accuracy)``
    """
    prediction = []
    model.eval()

    epoch_loss = 0
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    # Named `progress` so the imported tqdm.trange is not shadowed.
    progress = tqdm(enumerate(data_loader), total=len(data_loader),desc='valid')
    with torch.no_grad():
        for step, batch in progress:
            src = batch[0].to(device)
            trg = batch[1].to(device)

            output,output_idx = model(src, trg, 0) #turn off teacher forcing
            prediction.append(output_idx.to('cpu'))

            #output = [batch size, trg sent len, output dim]
            #trg = [batch size, trg sent len]
            output = output[:,1:].reshape(-1, output.shape[-1])
            trg = trg[:,1:].reshape(-1)

            #output = [batch size * (trg sent len - 1), output dim]
            #trg = [batch size * (trg sent len - 1)]
            loss = criterion(output, trg)

            epoch_loss += loss.item()

    prediction = torch.cat(prediction).detach().numpy().astype(int)
    prediction,acc = decode_string(prediction,dataset)
    return epoch_loss / len(data_loader),prediction,acc

In [11]:
vocab = Vocabulary()
# NOTE(review): both loads below unpickle arbitrary Python objects; only run
# them on checkpoint files from a trusted source.
with open('./output_final/vocab_final.pkl','rb') as vocab_file:
    vocab_dic = pickle.load(vocab_file)
vocab.word2idx = vocab_dic['word2idx']; vocab.idx2word = vocab_dic['idx2word']
vocab.num_words = vocab_dic['num_words']
# map_location re-homes the stored tensors onto the notebook's `device`, so
# the checkpoint does not have to be loaded on the GPU it was saved from.
with open('./output_final/model_final_9336.pkl','rb') as model_file:
    model_load = torch.load(model_file, map_location=device)
test_set = MyData(data_dir+'final_data/val.txt',vocab=vocab)
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = test_set.PAD_ID)
valid_loss,valid_predict,valid_acc = evaluate(model_load, criterion,test_set)
print(len(valid_predict),valid_acc)
# f_out = open('./output/task2_1_1_sample_predictions.txt','w')
# for row in valid_predict:
#     f_out.write(row+'\n')
# f_out.close()

valid:   0%|          | 0/48 [00:00<?, ?it/s]

## J: Total examples: 24510, unique words:198, Max seq length: 19


valid: 100%|██████████| 48/48 [00:01<00:00, 28.19it/s]


-----exm-----
['<SOS> do , <EOS>', '<SOS> tom <EOS>', '<SOS> to mary so <EOS>', '<SOS> said would do , mary <EOS>', '<SOS> do so <EOS>', '<SOS> asked asked , do would said <EOS>', '<SOS> much time i <EOS>', '<SOS> do <EOS>', '<SOS> how more time <EOS>', '<SOS> i time <EOS>']
["<SOS> n't that <EOS>", '<SOS> tom <EOS>', '<SOS> mary i , <EOS>', '<SOS> said would do , i <EOS>', '<SOS> do , <EOS>', '<SOS> mary i , do would said <EOS>', '<SOS> much time i <EOS>', '<SOS> do <EOS>', '<SOS> how more do <EOS>', '<SOS> i time <EOS>']
24510 0.9336597307221542


In [33]:
# Show the number of decoded sentences and the exact-match accuracy of the
# most recent evaluate() call, one value per line.
print(len(valid_predict), valid_acc, sep='\n')

98040
0.7814973480212158
