In [72]:
import torch
import numpy as np

from torch.autograd import Variable


class Vocabulary(object):

    def __init__(self):
        self.char2idx = {'SOS': 0, 'EOS': 1, 'PAD': 2, 'UNK': 3}
        self.idx2char = {0: 'SOS', 1: 'EOS', 2: 'PAD', 3: 'UNK'}
        self.num_chars = 4
        self.max_length = 0
        self.word_list = []

    def build_vocab(self, data_path):
        """Construct the relation between words and indices"""
        with open(data_path, 'r', encoding='utf-8') as dataset:
            for word in dataset:
                word = word.strip('\n')

                self.word_list.append(word)
                if self.max_length < len(word):
                    self.max_length = len(word)

                chars = self.split_sequence(word)
                for char in chars:
                    if char not in self.char2idx:
                        self.char2idx[char] = self.num_chars
                        self.idx2char[self.num_chars] = char
                        self.num_chars += 1

    def sequence_to_indices(self, sequence, add_eos=False, add_sos=False):
        """Transform a char sequence to index sequence
            :param sequence: a string composed with chars
            :param add_eos: if true, add the <EOS> tag at the end of given sentence
            :param add_sos: if true, add the <SOS> tag at the beginning of given sentence
        """
        index_sequence = [self.char2idx['SOS']] if add_sos else []

        for char in self.split_sequence(sequence):
            if char not in self.char2idx:
                index_sequence.append((self.char2idx['UNK']))
            else:
                index_sequence.append(self.char2idx[char])

        if add_eos:
            index_sequence.append(self.char2idx['EOS'])

        return index_sequence

    def indices_to_sequence(self, indices):
        """Transform a list of indices
            :param indices: a list
        """
        sequence = ""
        for idx in indices:
            char = self.idx2char[idx]
            if char == "EOS":
                break
            else:
                sequence += char
        return sequence

    def split_sequence(self, sequence):
        """Vary from languages and tasks. In our task, we simply return chars in given sentence
        For example:
            Input : alphabet
            Return: [a, l, p, h, a, b, e, t]
        """
        return [char for char in sequence]

    def __str__(self):
        str = "Vocab information:\n"
        for idx, char in self.idx2char.items():
            str += "Char: %s Index: %d\n" % (char, idx)
        return str


class DataTransformer(object):

    def __init__(self, path, use_cuda):
        self.indices_sequences = []
        self.use_cuda = use_cuda

        # Load and build the vocab
        self.vocab = Vocabulary()
        self.vocab.build_vocab(path)
        self.PAD_ID = self.vocab.char2idx["PAD"]
        self.SOS_ID = self.vocab.char2idx["SOS"]
        self.vocab_size = self.vocab.num_chars
        self.max_length = self.vocab.max_length

        self._build_training_set(path)

    def _build_training_set(self, path):
        # Change sentences to indices, and append <EOS> at the end of all pairs
        for word in self.vocab.word_list:
            indices_seq = self.vocab.sequence_to_indices(word, add_eos=True)
            # input and target are the same in auto-encoder
            self.indices_sequences.append([indices_seq, indices_seq[:]])
            if indices_seq==indices_seq[:]:
                print([indices_seq, indices_seq[:]])

    def mini_batches(self, batch_size):
        input_batches = []
        target_batches = []

        np.random.shuffle(self.indices_sequences)

        #print(self.indices_sequences[0:0+batch_size])
        mini_batches = [
            self.indices_sequences[k: k + batch_size]
            for k in range(0, len(self.indices_sequences), batch_size)
        ]      
        print('\nmini:')
        print(mini_batches[0:2])

        for batch in mini_batches:
            seq_pairs = sorted(batch, key=lambda seqs: len(seqs[0]), reverse=True)  # sorted by input_lengths
            input_seqs = [pair[0] for pair in seq_pairs]
            target_seqs = [pair[1] for pair in seq_pairs]

            input_lengths = [len(s) for s in input_seqs]
            in_max = input_lengths[0]
            input_padded = [self.pad_sequence(s, in_max) for s in input_seqs]

            target_lengths = [len(s) for s in target_seqs]
            out_max = target_lengths[0]
            target_padded = [self.pad_sequence(s, out_max) for s in target_seqs]

            input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)  # time * batch
            target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1)  # time * batch

            if self.use_cuda:
                input_var = input_var.cuda()
                target_var = target_var.cuda()

            yield (input_var, input_lengths), (target_var, target_lengths)

    def pad_sequence(self, sequence, max_length):
        sequence += [self.PAD_ID for i in range(max_length - len(sequence))]
        return sequence

    def evaluation_batch(self, words):
        """
        Prepare a batch of var for evaluating
        :param words: a list, store the testing data 
        :return: evaluation_batch
        """
        evaluation_batch = []

        for word in words:
            indices_seq = self.vocab.sequence_to_indices(word, add_eos=True)
            evaluation_batch.append([indices_seq])

        seq_pairs = sorted(evaluation_batch, key=lambda seqs: len(seqs[0]), reverse=True)
        input_seqs = [pair[0] for pair in seq_pairs]
        input_lengths = [len(s) for s in input_seqs]
        in_max = input_lengths[0]
        input_padded = [self.pad_sequence(s, in_max) for s in input_seqs]

        input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)  # time * batch

        if self.use_cuda:
            input_var = input_var.cuda()

        return input_var, input_lengths

if __name__ == '__main__':
    vocab = Vocabulary()
    vocab.build_vocab('Google-10000-English.txt')
    print(vocab)

    test = "helloworld"
    print("Sequence before transformed:", test)
    ids = vocab.sequence_to_indices(test)
    print("Indices sequence:", ids)
    sent = vocab.indices_to_sequence(ids)
    print("Sequence after transformed:",sent,"\n")

    data_transformer = DataTransformer('Google-10000-English.txt', use_cuda=False)

    for ib, tb in data_transformer.mini_batches(batch_size=3):
        print("\n\nB0-0")
        print(ib)#,'\n\n', tb)
        
        print('------------')
        embedding = nn.Embedding(30,3)
        input=ib[0]
        embedded=embedding(input)
        print(embedded)
        
        print('============================================')
        leng = ib[1]
        print(type(leng))
        print(leng)
        packed = pack_padded_sequence(embedded, leng)
        print('\n\n')
        print(packed)
        break

Vocab information:
Char: SOS Index: 0
Char: EOS Index: 1
Char: PAD Index: 2
Char: UNK Index: 3
Char: t Index: 4
Char: h Index: 5
Char: e Index: 6
Char: o Index: 7
Char: f Index: 8
Char: a Index: 9
Char: n Index: 10
Char: d Index: 11
Char: i Index: 12
Char: r Index: 13
Char: s Index: 14
Char: b Index: 15
Char: y Index: 16
Char: w Index: 17
Char: u Index: 18
Char: m Index: 19
Char: l Index: 20
Char: v Index: 21
Char: c Index: 22
Char: p Index: 23
Char: g Index: 24
Char: k Index: 25
Char: x Index: 26
Char: j Index: 27
Char: z Index: 28
Char: q Index: 29

Sequence before transformed: helloworld
Indices sequence: [5, 6, 20, 20, 7, 17, 7, 13, 20, 11]
Sequence after transformed: helloworld 


mini:
[[[[27, 7, 15, 14, 1], [27, 7, 15, 14, 1]], [[10, 6, 4, 14, 22, 9, 23, 6, 1], [10, 6, 4, 14, 22, 9, 23, 6, 1]], [[9, 20, 7, 10, 24, 1], [9, 20, 7, 10, 24, 1]]], [[[17, 6, 11, 1], [17, 6, 11, 1]], [[24, 7, 21, 6, 13, 10, 19, 6, 10, 4, 14, 1], [24, 7, 21, 6, 13, 10, 19, 6, 10, 4, 14, 1]], [[7, 23, 23

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedding = nn.Embedding(51,3)
input = torch.LongTensor([[9,5,9,1,2],[4,1,2,2,2],[40,40,40,40,1],[4,4,1,2,2]])
embedded=embedding(input)
print(embedded)


leng = torch.tensor([4,2,5,3])
packed = pack_padded_sequence(embedded, leng, batch_first=True, enforce_sorted=False)
print('\n\n')
print(packed)

In [28]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

for ib, tb in data_transformer.mini_batches(batch_size=3):
        embedding = nn.Embedding(30,3)
        input=ib[0]
        embedded=embedding(input)
        print(embedded)
        
        leng = ib[1]
        #print(type(leng))
        #print(leng)
        packed = pack_padded_sequence(embedded, leng)
        print('\n\n')
        print(packed)
        break


mini:
[[[[23, 5, 9, 10, 4, 7, 19, 1], [23, 5, 9, 10, 4, 7, 19, 1]], [[24, 9, 15, 13, 12, 6, 20, 1], [24, 9, 15, 13, 12, 6, 20, 1]], [[14, 6, 6, 11, 1], [14, 6, 6, 11, 1]]], [[[22, 7, 13, 13, 18, 23, 4, 12, 7, 10, 1], [22, 7, 13, 13, 18, 23, 4, 12, 7, 10, 1]], [[6, 23, 12, 14, 7, 11, 6, 14, 1], [6, 23, 12, 14, 7, 11, 6, 14, 1]], [[6, 26, 23, 20, 7, 13, 6, 1], [6, 26, 23, 20, 7, 13, 6, 1]]]]
tensor([[[-0.5840,  0.2594, -0.3392],
         [ 0.9600, -0.8475, -0.9238],
         [-0.3586,  1.8042, -0.0465]],

        [[-0.1190,  0.7071, -0.5437],
         [-0.1076, -1.4298, -0.8118],
         [-0.6915, -1.4748, -0.8687]],

        [[-0.1076, -1.4298, -0.8118],
         [ 0.8873,  0.4930,  1.2483],
         [-0.6915, -1.4748, -0.8687]],

        [[-0.5564,  0.9398,  0.5524],
         [-0.9800, -0.2611,  0.5359],
         [ 0.0237,  0.5845, -0.3780]],

        [[ 1.5539, -0.4662,  1.2247],
         [-0.1590, -2.0524,  0.5597],
         [ 0.3480,  0.8226,  0.3421]],

        [[-0.9431,  0.3944

In [69]:
a = [[[1,2,3,4],[1,2,3,4]],[[4,5,6,7,8,9],[4,5,6,7,8,9]],[[7,8,9],[7,8,9]],[[10,11],[10,11]], [[1,3],[1,3]],[[44,45],[44,45]]]
batch_size=3
print(len(a))
b = [
        a[k: k + batch_size]
        for k in range(0, len(a), batch_size)
    ]
print(b)
print('----------------------------------')
for batch in b:
    print(batch)
    seq_pairs = sorted(batch, key=lambda seqs: len(seqs[0]), reverse=True)  # sorted by input_lengths
    print(seq_pairs)
    input_seqs = [pair[0] for pair in seq_pairs]
    print(input_seqs)
    target_seqs = [pair[1] for pair in seq_pairs]
    print(target_seqs)
    print('\n\n')

6
[[[[1, 2, 3, 4], [1, 2, 3, 4]], [[4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 8, 9]], [[7, 8, 9], [7, 8, 9]]], [[[10, 11], [10, 11]], [[1, 3], [1, 3]], [[44, 45], [44, 45]]]]
----------------------------------
[[[1, 2, 3, 4], [1, 2, 3, 4]], [[4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 8, 9]], [[7, 8, 9], [7, 8, 9]]]
[[[4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 8, 9]], [[1, 2, 3, 4], [1, 2, 3, 4]], [[7, 8, 9], [7, 8, 9]]]
[[4, 5, 6, 7, 8, 9], [1, 2, 3, 4], [7, 8, 9]]
[[4, 5, 6, 7, 8, 9], [1, 2, 3, 4], [7, 8, 9]]



[[[10, 11], [10, 11]], [[1, 3], [1, 3]], [[44, 45], [44, 45]]]
[[[10, 11], [10, 11]], [[1, 3], [1, 3]], [[44, 45], [44, 45]]]
[[10, 11], [1, 3], [44, 45]]
[[10, 11], [1, 3], [44, 45]]





In [75]:
decoder_input = Variable(torch.LongTensor([[2 * 128])))

SyntaxError: invalid syntax (<ipython-input-75-71764765c2cc>, line 1)

In [77]:
decoder_input = Variable(torch.LongTensor([[1] * 128]))
print(decoder_input)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])


In [79]:
decoder_outputs = Variable(torch.zeros(
            8,
            3,        #128
            4  #30
        ))
print(decoder_outputs)

tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])
