In [10]:
import torch
import numpy as np
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable


class Vocabulary(object):
    """Character-level vocabulary: maps characters to integer indices and back.

    Four special tokens are registered up front: ``SOS`` (0), ``EOS`` (1),
    ``PAD`` (2) and ``UNK`` (3). Every other character receives the next free
    index the first time ``build_vocab`` encounters it.
    """

    def __init__(self):
        self.char2idx = {'SOS': 0, 'EOS': 1, 'PAD': 2, 'UNK': 3}
        self.idx2char = {0: 'SOS', 1: 'EOS', 2: 'PAD', 3: 'UNK'}
        self.num_chars = 4    # number of registered symbols (special tokens included)
        self.max_length = 0   # length of the longest word seen so far
        self.word_list = []   # every word read, in file order: ['word1', 'word2', ...]

    def build_vocab(self, data_path):
        """Read one word per line from ``data_path`` and register its characters.

        Updates ``word_list``, ``max_length``, ``char2idx``, ``idx2char`` and
        ``num_chars``. Blank lines are skipped so that they do not become
        empty "words" in ``word_list``.

        :param data_path: path of a UTF-8 text file with one word per line
        """
        with open(data_path, 'r', encoding='utf-8') as dataset:
            for line in dataset:
                word = line.strip('\n')
                if not word:
                    continue

                self.word_list.append(word)
                self.max_length = max(self.max_length, len(word))

                for char in self.split_sequence(word):
                    if char not in self.char2idx:
                        self.char2idx[char] = self.num_chars
                        self.idx2char[self.num_chars] = char
                        self.num_chars += 1

    def sequence_to_indices(self, sequence, add_eos=False, add_sos=False):
        """Transform a char sequence into a list of indices, e.g. ``[28, 22, 13, 1]``.

        Characters that are not in the vocabulary map to the ``UNK`` index.

        :param sequence: a string composed of chars
        :param add_eos: if true, append the ``EOS`` index at the end
        :param add_sos: if true, prepend the ``SOS`` index at the beginning
        :return: a new list of int indices
        """
        unk = self.char2idx['UNK']
        index_sequence = [self.char2idx['SOS']] if add_sos else []
        index_sequence.extend(
            self.char2idx.get(char, unk) for char in self.split_sequence(sequence)
        )
        if add_eos:
            index_sequence.append(self.char2idx['EOS'])
        return index_sequence

    def indices_to_sequence(self, indices):
        """Transform an iterable of indices back into a string, e.g. ``'apple'``.

        Decoding stops at the first ``EOS`` index (which is not emitted). Other
        special tokens are emitted by name (e.g. ``'PAD'``).

        :param indices: iterable of indices; elements may be Python ints or
            anything ``int()`` accepts, such as the 0-d tensors obtained by
            iterating over a 1-D ``torch`` tensor
        :return: the decoded string
        :raises KeyError: if an index is not in the vocabulary
        """
        chars = []
        for idx in indices:
            # int() normalises tensor / numpy scalars so the dict lookup works.
            char = self.idx2char[int(idx)]
            if char == "EOS":
                break
            chars.append(char)
        return "".join(chars)

    def split_sequence(self, sequence):
        """Split a sequence into its tokens.

        Varies between languages and tasks; here every character is a token.
        For example:
            Input : alphabet
            Return: [a, l, p, h, a, b, e, t]
        """
        return list(sequence)

    def __str__(self):
        lines = ["Vocab information:\n"]
        lines.extend(
            "Char: %s Index: %d\n" % (char, idx) for idx, char in self.idx2char.items()
        )
        return "".join(lines)


class DataTransformer(object):
    """Turns a word-list file into padded index tensors for an auto-encoder.

    On construction a ``Vocabulary`` is built from ``path`` and every word is
    converted into an ``[input, target]`` pair of index lists (identical
    content, each terminated by ``EOS``).
    """

    def __init__(self, path, use_cuda):
        """
        :param path: path of the word-list file passed to ``Vocabulary.build_vocab``
        :param use_cuda: if true, produced tensors are moved to the GPU via ``.cuda()``
        """
        self.indices_sequences = []
        self.use_cuda = use_cuda

        # Load and build the vocab
        self.vocab = Vocabulary()
        self.vocab.build_vocab(path)
        self.PAD_ID = self.vocab.char2idx["PAD"]
        self.SOS_ID = self.vocab.char2idx["SOS"]
        self.vocab_size = self.vocab.num_chars
        self.max_length = self.vocab.max_length

        self._build_training_set(path)

    def _build_training_set(self, path):
        """Fill ``indices_sequences`` from ``vocab.word_list``.

        ``path`` is not read here; the parameter is kept so the signature stays
        unchanged.
        """
        for word in self.vocab.word_list:
            indices_seq = self.vocab.sequence_to_indices(word, add_eos=True)
            # Input and target are the same in an auto-encoder; the target is a
            # separate copy, e.g. [[4, 5, 6, 1], [4, 5, 6, 1]].
            self.indices_sequences.append([indices_seq, indices_seq[:]])

    def mini_batches(self, batch_size):
        """Shuffle the training pairs and yield them batch by batch.

        Within a batch the pairs are sorted by input length, longest first.

        :param batch_size: number of pairs per batch (the last batch may be smaller)
        :return: generator of ``((input_var, input_lengths), (target_var, target_lengths))``
            where each ``*_var`` is a LongTensor of shape (max_len, batch) and
            each ``*_lengths`` is the list of unpadded lengths
        """
        np.random.shuffle(self.indices_sequences)
        # batch_size = 3 -> [[pair1, pair2, pair3], [pair4, pair5, pair6], ...]
        batches = [
            self.indices_sequences[start: start + batch_size]
            for start in range(0, len(self.indices_sequences), batch_size)
        ]

        for batch in batches:
            seq_pairs = sorted(batch, key=lambda pair: len(pair[0]), reverse=True)
            input_seqs = [pair[0] for pair in seq_pairs]
            target_seqs = [pair[1] for pair in seq_pairs]

            input_lengths = [len(seq) for seq in input_seqs]
            target_lengths = [len(seq) for seq in target_seqs]

            # pad_sequence returns new lists, so the stored training pairs keep
            # their true lengths across epochs.
            input_var = self._to_time_major(
                [self.pad_sequence(seq, input_lengths[0]) for seq in input_seqs]
            )
            # Targets are ordered by *input* length, so take the real maximum
            # instead of assuming the first target is the longest.
            target_var = self._to_time_major(
                [self.pad_sequence(seq, max(target_lengths)) for seq in target_seqs]
            )

            yield (input_var, input_lengths), (target_var, target_lengths)

    def _to_time_major(self, padded):
        """Convert equal-length index lists into a (time, batch) LongTensor."""
        tensor = torch.LongTensor(padded).transpose(0, 1)
        return tensor.cuda() if self.use_cuda else tensor

    def pad_sequence(self, sequence, max_length):
        """Return a copy of ``sequence`` right-padded with ``PAD_ID`` to ``max_length``.

        The argument itself is left untouched. Sequences already at least
        ``max_length`` long are returned (as a copy) without truncation.
        """
        return list(sequence) + [self.PAD_ID] * (max_length - len(sequence))

    def evaluation_batch(self, words):
        """Prepare a batch of inputs for evaluation.

        The words are encoded (with ``EOS`` appended) and sorted by encoded
        length, longest first, so the returned columns follow that sorted
        order rather than the order of ``words``.

        :param words: a list of strings to encode
        :return: ``(input_var, input_lengths)`` with ``input_var`` a LongTensor
            of shape (max_len, batch)
        :raises IndexError: if ``words`` is empty
        """
        input_seqs = sorted(
            (self.vocab.sequence_to_indices(word, add_eos=True) for word in words),
            key=len,
            reverse=True,
        )
        input_lengths = [len(seq) for seq in input_seqs]
        in_max = input_lengths[0]
        input_var = self._to_time_major(
            [self.pad_sequence(seq, in_max) for seq in input_seqs]
        )
        return input_var, input_lengths

"""
if __name__ == '__main__':
    vocab = Vocabulary()
    vocab.build_vocab('Google-10000-English.txt')
    print(vocab)

    test = "helloworld"
    print("Sequence before transformed:", test)
    ids = vocab.sequence_to_indices(test)
    print("Indices sequence:", ids)
    sent = vocab.indices_to_sequence(ids)
    print("Sequence after transformed:",sent,"\n")

    data_transformer = DataTransformer('Google-10000-English.txt', use_cuda=False)

    for ib, tb in data_transformer.mini_batches(batch_size=3):
        print("\n\nB0-0")
        print(ib)#,'\n\n', tb)
        
        print('------------')
        embedding = nn.Embedding(30,3)
        input=ib[0]
        embedded=embedding(input)
        print(embedded)
        
        print('============================================')
        leng = ib[1]
        print(type(leng))
        print(leng)
        packed = pack_padded_sequence(embedded, leng)
        print('\n\n')
        print(packed)
        break
"""
print('OK')

OK


## show vocab dict

In [4]:
# Build the character vocabulary from the word-list file and print the
# resulting char -> index table (special tokens first).
vocab = Vocabulary()
vocab.build_vocab('Google-10000-English.txt')
print(vocab)

Vocab information:
Char: SOS Index: 0
Char: EOS Index: 1
Char: PAD Index: 2
Char: UNK Index: 3
Char: t Index: 4
Char: h Index: 5
Char: e Index: 6
Char: o Index: 7
Char: f Index: 8
Char: a Index: 9
Char: n Index: 10
Char: d Index: 11
Char: i Index: 12
Char: r Index: 13
Char: s Index: 14
Char: b Index: 15
Char: y Index: 16
Char: w Index: 17
Char: u Index: 18
Char: m Index: 19
Char: l Index: 20
Char: v Index: 21
Char: c Index: 22
Char: p Index: 23
Char: g Index: 24
Char: k Index: 25
Char: x Index: 26
Char: j Index: 27
Char: z Index: 28
Char: q Index: 29



## sequence to indices example

In [3]:
# Round-trip a word through the vocabulary: chars -> indices -> chars.
# Uses the `vocab` object built in the vocabulary cell.
test = "helloworld"
print("Sequence before transformed:", test)
ids = vocab.sequence_to_indices(test)  # add_sos / add_eos default to False, so no special tokens are added
print("Indices sequence:", ids)
sent = vocab.indices_to_sequence(ids)
print("Sequence after transformed:",sent,"\n")

Sequence before transformed: helloworld
Indices sequence: [5, 6, 20, 20, 7, 17, 7, 13, 20, 11]
Sequence after transformed: helloworld 



## embedding & pack_padded_sequence example

In [8]:
data_transformer = DataTransformer('Google-10000-English.txt', use_cuda=False)

# Only the first mini-batch is inspected; the loop is left after one pass.
for ib, tb in data_transformer.mini_batches(batch_size=3):
    print("\n1st batch:")
    print('input:\n',ib,'\n\n','target:\n', tb)

    # Size the embedding table from the vocabulary instead of hard-coding 30,
    # so every index produced by the transformer has a row in the table.
    vocab_size = data_transformer.vocab_size
    print('\n------------\nafter embedding(%d,3):\n' % vocab_size)
    embedding = nn.Embedding(vocab_size, 3)
    input=ib[0]                    # (time, batch) LongTensor of char indices
    embedded=embedding(input)      # (time, batch, 3)
    print(embedded)

    print('============================================')
    leng = ib[1]                   # unpadded lengths, longest first
    # Time-major input with lengths sorted in decreasing order, as the default
    # arguments of pack_padded_sequence expect.
    packed = pack_padded_sequence(embedded, leng)
    print('\n\n')
    print(packed)
    break


1st batch:
input:
 (tensor([[11,  9, 18],
        [12, 20, 10],
        [14, 22, 12],
        [23,  7,  4],
        [20,  5,  1],
        [ 9,  7,  2],
        [16, 20,  2],
        [ 6,  1,  2],
        [11,  2,  2],
        [ 1,  2,  2]]), [10, 8, 5]) 

 target:
 (tensor([[11,  9, 18],
        [12, 20, 10],
        [14, 22, 12],
        [23,  7,  4],
        [20,  5,  1],
        [ 9,  7,  2],
        [16, 20,  2],
        [ 6,  1,  2],
        [11,  2,  2],
        [ 1,  2,  2]]), [10, 8, 5])

------------
after embedding(30,3):

tensor([[[-0.3962, -0.3535, -2.0223],
         [ 1.4845,  1.2232,  0.3405],
         [ 1.2795,  0.8027, -1.3001]],

        [[-0.5129,  0.2063,  1.1299],
         [ 0.2351,  0.2509,  0.5732],
         [ 0.6289, -0.7532, -0.7337]],

        [[ 1.3063,  0.3306, -1.2231],
         [ 1.6357,  0.7090, -0.0940],
         [-0.5129,  0.2063,  1.1299]],

        [[-0.8622, -0.8680, -0.4603],
         [ 0.3462,  0.1498,  1.1750],
         [ 1.6731,  0.6317, -1.0507]

## embedding & pack_padded_sequence example 2

In [11]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [13]:
# Lookup table with 51 rows of 3-dimensional vectors.
embedding = nn.Embedding(num_embeddings=51, embedding_dim=3)

# A batch of 4 index sequences, each padded to length 5.
input = torch.LongTensor([
    [9, 5, 9, 1, 2],
    [4, 1, 2, 2, 2],
    [40, 40, 40, 40, 1],
    [4, 4, 1, 2, 2],
])

# Each index is replaced by its row of the table -> shape (4, 5, 3).
embedded = embedding(input)
print(embedded)

tensor([[[ 1.4367, -1.2376, -0.7855],
         [ 0.1500,  0.1474, -0.2132],
         [ 1.4367, -1.2376, -0.7855],
         [ 1.1864,  0.5637, -1.1194],
         [ 0.3134,  0.5307, -0.2227]],

        [[ 0.8141, -0.1185, -0.0485],
         [ 1.1864,  0.5637, -1.1194],
         [ 0.3134,  0.5307, -0.2227],
         [ 0.3134,  0.5307, -0.2227],
         [ 0.3134,  0.5307, -0.2227]],

        [[-0.0411,  0.3431,  0.2645],
         [-0.0411,  0.3431,  0.2645],
         [-0.0411,  0.3431,  0.2645],
         [-0.0411,  0.3431,  0.2645],
         [ 1.1864,  0.5637, -1.1194]],

        [[ 0.8141, -0.1185, -0.0485],
         [ 0.8141, -0.1185, -0.0485],
         [ 1.1864,  0.5637, -1.1194],
         [ 0.3134,  0.5307, -0.2227],
         [ 0.3134,  0.5307, -0.2227]]], grad_fn=<EmbeddingBackward>)


## pack_padded_sequence

In [15]:
# Number of entries before the trailing 2s in each row of the `input` tensor
# that produced `embedded` (rows: 4, 2, 5 and 3 such entries).
leng = torch.tensor([4,2,5,3])
# batch_first=True: `embedded` is laid out (batch, time, features).
# enforce_sorted=False: the lengths need not be pre-sorted in decreasing order.
packed = pack_padded_sequence(embedded, leng, batch_first=True, enforce_sorted=False)
print(packed)

PackedSequence(data=tensor([[-0.0411,  0.3431,  0.2645],
        [ 1.4367, -1.2376, -0.7855],
        [ 0.8141, -0.1185, -0.0485],
        [ 0.8141, -0.1185, -0.0485],
        [-0.0411,  0.3431,  0.2645],
        [ 0.1500,  0.1474, -0.2132],
        [ 0.8141, -0.1185, -0.0485],
        [ 1.1864,  0.5637, -1.1194],
        [-0.0411,  0.3431,  0.2645],
        [ 1.4367, -1.2376, -0.7855],
        [ 1.1864,  0.5637, -1.1194],
        [-0.0411,  0.3431,  0.2645],
        [ 1.1864,  0.5637, -1.1194],
        [ 1.1864,  0.5637, -1.1194]], grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([4, 4, 3, 2, 1]), sorted_indices=tensor([2, 0, 3, 1]), unsorted_indices=tensor([1, 3, 0, 2]))


![Alt text](https://yifdu.github.io/2019/03/28/Pytorch-tutorials-%E5%AD%A6%E4%B9%A0%EF%BC%88%E5%85%AD%EF%BC%89/pic4.png "Optional title")

## NLLLoss

In [77]:
input = torch.randn(3,3)  # 2-D example: 3 rows (samples) x 3 columns (scores, one per class)
input

tensor([[ 0.2598, -0.0980, -1.2113],
        [-2.5211,  0.1555,  0.3407],
        [ 0.4294, -0.0681,  0.2158]])

In [78]:
# Normalise over the last axis, i.e. the per-class scores. With dim=-1 the same
# module is correct for the 2-D (samples, classes) tensor here and for 3-D
# tensors whose last axis holds the class scores.
sm = nn.Softmax(dim=-1)
sm(input)

tensor([[0.5184, 0.3625, 0.1191],
        [0.0303, 0.4401, 0.5296],
        [0.4139, 0.2517, 0.3344]])

In [80]:
# Log-probabilities: natural log of the softmax output.
torch.log(sm(input))

tensor([[-0.6569, -1.0147, -2.1281],
        [-3.4973, -0.8208, -0.6356],
        [-0.8820, -1.3795, -1.0956]])

NLLLoss 的結果就是把上面的輸出中與 Label 對應的那個值拿出來，去掉負號，再求均值。
例如：ground truth 是 (0, 2, 1)，那 NLLLoss 的算法是：

In [81]:
# Negated log-probabilities at (row 0, class 0), (row 1, class 2) and
# (row 2, class 1), averaged over the 3 rows.
(0.6569+0.6356+1.3795)/3

0.8906666666666666

In [85]:
# Same computation with nn.NLLLoss; it expects log-probabilities as input.
loss=nn.NLLLoss()
target=torch.tensor([0,2,1])  # one class label per row of `input`
loss(torch.log(sm(input)),target)

tensor(0.8907)

## batch (3 dim)

In [111]:
# Shape (3, 2, 5); presumably (sequence length 3, batch 2, 5 character classes) — TODO confirm.
input = torch.randn(3,2,5)
input

tensor([[[-0.2760,  0.4598,  0.2189,  0.4483,  0.9722],
         [ 0.8616, -1.2340,  1.1155,  1.8708,  0.5987]],

        [[-1.0815, -0.0203, -0.1701, -0.8232,  1.5321],
         [ 1.6136,  0.6235, -0.9264, -0.0197,  1.6300]],

        [[ 0.6959,  0.1608,  0.4912,  0.4067,  0.4182],
         [ 0.6458, -0.0346, -0.4760, -0.9156, -0.3314]]])

In [112]:
# NOTE(review): `sm` must normalise over the last axis (the 5 class scores) for
# this cell to reproduce the recorded output; with dim=1 on this 3-D tensor it
# would normalise across the size-2 axis instead.
torch.log(sm(input))

tensor([[[-2.3297, -1.5939, -1.8348, -1.6054, -1.0815],
         [-1.7790, -3.8746, -1.5252, -0.7699, -2.0419]],

        [[-3.0597, -1.9984, -2.1482, -2.8013, -0.4461],
         [-0.9792, -1.9693, -3.5192, -2.6124, -0.9628]],

        [[-1.3627, -1.8978, -1.5675, -1.6520, -1.6404],
         [-0.8831, -1.5634, -2.0049, -2.4445, -1.8602]]])

In [122]:
# Manual check: for each of the two columns of the middle axis, average the
# negated log-probabilities at classes 0, 2 and 1 (one per first-axis step),
# then average the two results.
((2.3297+2.1482+1.8978)/3 + (1.7790+3.5192+1.5634)/3)/2


2.2062166666666667

In [120]:
loss = nn.NLLLoss()

# One class label per position of the two leading axes of `input`, shape (3, 2).
target = torch.tensor([[0, 0], [2, 2], [1, 1]])
targets = target.contiguous().view(-1)  # flatten to one label per row: (3*2,)

# Log-probabilities over the last (class) axis, computed here directly so the
# result does not depend on how a previously defined softmax module was
# configured. Then flatten the leading axes in the same order as `targets`,
# giving (3*2, num_classes).
log_probs = nn.LogSoftmax(dim=-1)(input)
decoder_outputs = log_probs.view(-1, input.size(-1))

loss(decoder_outputs, targets)

tensor(2.2062)