In [None]:
import torch
import matplotlib.pyplot as plt
import sys
sys.path.append('..')
import myd2l

In [2]:
def read_data_nmt(path='E:/Datasets/Tatoeba-fra-eng/fra-eng/fra.txt'):
    """Return the full contents of the text file at ``path`` as one string.

    The file is decoded as UTF-8. Note that the default ``path`` is an
    absolute, machine-specific location; pass an explicit path when the
    dataset lives elsewhere.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    
# Load the whole corpus into memory using the default dataset path.
raw_text = read_data_nmt()
# Show only the first 75 characters as a quick look at the raw file contents.
print(raw_text[: 75])
    

Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



In [3]:
def preprocess_nmt(text):
    """Normalize raw corpus text before tokenization.

    Steps performed:
      * replace narrow no-break spaces (``\\u202f``) and no-break spaces
        (``\\xa0``) with ordinary spaces;
      * lowercase everything;
      * insert a space in front of each of ``, . ! ?`` unless the preceding
        character is already a space (the very first character is never
        prefixed).

    Args:
        text: The raw text as a single string.

    Returns:
        The normalized text as a single string.
    """
    # Built once per call instead of once per character of the corpus.
    punctuation = frozenset(',.!?')

    def no_space(char, prev_char):
        # True when `char` is punctuation that is glued to the previous char.
        return char in punctuation and prev_char != ' '

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # `text[i - 1]` looks at the *normalized input*, not at the output being
    # built, so an inserted space never influences the next decision.
    out = [
        ' ' + char if i > 0 and no_space(char, text[i - 1]) else char
        for i, char in enumerate(text)
    ]

    return ''.join(out)

# Normalize the raw corpus (spacing, case, punctuation separation).
text = preprocess_nmt(raw_text)
# Print the first 80 characters to check the result of the normalization.
print(text[:80])

go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !


In [4]:
def tokenize_nmt(text, num_examples=None):
    """Split preprocessed text into tokenized source/target sentence pairs.

    Each line of ``text`` is expected to hold a source and a target sentence
    separated by a single tab. Lines that do not split into exactly two
    tab-separated parts are skipped. Sentences are tokenized by splitting on
    single spaces.

    Args:
        text: Preprocessed corpus, one sentence pair per line.
        num_examples: Maximum number of sentence pairs to return. A falsy
            value (``None`` or ``0``) means "no limit".

    Returns:
        A ``(source, target)`` tuple of equally long lists; each element is
        the list of tokens of one sentence.
    """
    source, target = [], []
    for line in text.split('\n'):
        # Count accepted pairs rather than raw line indices so that exactly
        # `num_examples` pairs are returned (the previous `i > num_examples`
        # check let one extra line through and also counted skipped lines).
        if num_examples and len(source) >= num_examples:
            break

        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    return source, target

# Tokenize the entire corpus (no example limit) and show the first six pairs.
source, target = tokenize_nmt(text)
print(source[:6], target[:6], sep='\n')

# Check whether the token 'cuff' occurs in any source sentence.
print(any('cuff' in sentence for sentence in source))

[['go', '.'], ['hi', '.'], ['run', '!'], ['run', '!'], ['who', '?'], ['wow', '!']]
[['va', '!'], ['salut', '!'], ['cours', '!'], ['courez', '!'], ['qui', '?'], ['ça', 'alors', '!']]
True


In [None]:
# Build the source-language vocabulary with three reserved special tokens.
# NOTE(review): the positional argument 2 is presumably a minimum token
# frequency — confirm against myd2l.Vocab.
src_vocab = myd2l.Vocab(source, 2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
# Vocabulary size.
print(len(src_vocab))
# Index lookup for 'cuff', a token that is present in the source corpus.
# NOTE(review): the recorded output is 0, presumably the unknown-token index
# for words filtered out of the vocabulary — confirm against myd2l.Vocab.
print(src_vocab['cuff'])
# Index assigned to the reserved end-of-sequence token.
print(src_vocab['<eos>'])

10012
0
3


In [35]:
def truncate_pad(line, num_steps, padding_token):
    """Return a copy of ``line`` cut or padded to exactly ``num_steps`` items.

    Args:
        line: List of items (e.g. token indices) for one sequence.
        num_steps: Desired length of the result.
        padding_token: Item appended as many times as needed when ``line`` is
            shorter than ``num_steps``.

    Returns:
        A new list of length ``num_steps``. The input list is never modified
        (the previous ``line += ...`` extended the caller's list in place).
    """
    if len(line) > num_steps:
        return line[:num_steps]

    # `+` builds a fresh list, so the caller's sequence is left untouched.
    return line + [padding_token] * (num_steps - len(line))

In [36]:
# Map the first tokenized source sentence to indices via src_vocab, then
# pad/truncate it to 5 entries using the index of the '<pad>' token.
truncate_pad(src_vocab[source[0]], 5, src_vocab['<pad>'])

[47, 4, 1, 1, 1]

In [37]:
def build_arr_nmt(lines, vocab, num_steps):
    """Encode tokenized sentences as a fixed-width index tensor.

    Every sentence is converted to indices through ``vocab``, gets the
    ``'<eos>'`` index appended, and is then truncated or padded with the
    ``'<pad>'`` index to ``num_steps`` entries.

    Args:
        lines: List of tokenized sentences (each a list of tokens).
        vocab: Vocabulary object; indexed both with a token list and with the
            strings ``'<pad>'`` / ``'<eos>'``.
        num_steps: Number of entries per encoded sentence.

    Returns:
        ``(array, valid_len)`` where ``array`` holds one row of ``num_steps``
        indices per sentence and ``valid_len`` holds, per row, the number of
        entries that differ from the ``'<pad>'`` index.
    """
    pad_id, eos_id = vocab['<pad>'], vocab['<eos>']

    rows = []
    for tokens in lines:
        # '<eos>' is appended *before* truncation, so a sentence longer than
        # num_steps - 1 tokens loses its end marker.
        encoded = vocab[tokens] + [eos_id]
        rows.append(truncate_pad(encoded, num_steps, pad_id))

    array = torch.tensor(rows)
    not_padding = (array != pad_id).type(torch.int32)
    valid_len = not_padding.sum(dim=1)

    return array, valid_len

# Encode every source sentence to a fixed width of 8 indices.
array, valid_len = build_arr_nmt(source, src_vocab, 8)
# Second encoded sentence (row index 1), including '<eos>' and padding.
print(array[1])
# Per-sentence count of non-padding entries.
print(valid_len)

tensor([2944,    4,    3,    1,    1,    1,    1,    1])
tensor([3, 3, 3,  ..., 8, 8, 8])


In [None]:
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Build a shuffled mini-batch iterator plus vocabularies for the corpus.

    The corpus is read from the default path of ``read_data_nmt``,
    preprocessed, tokenized (limited by ``num_examples``) and encoded to
    ``num_steps`` indices per sentence.

    Args:
        batch_size: Number of sentence pairs per mini-batch.
        num_steps: Number of indices per encoded sentence.
        num_examples: Limit forwarded to ``tokenize_nmt``.

    Returns:
        ``(data_iter, src_vocab, tgt_vocab)``. Each batch produced by
        ``data_iter`` is ``(src_array, src_valid_len, tgt_array,
        tgt_valid_len)``.
    """
    corpus = preprocess_nmt(read_data_nmt())
    src_lines, tgt_lines = tokenize_nmt(corpus, num_examples)

    # Each vocabulary gets its own list of reserved tokens.
    special_tokens = ('<pad>', '<bos>', '<eos>')
    src_vocab = myd2l.Vocab(src_lines, 2, reserved_tokens=list(special_tokens))
    tgt_vocab = myd2l.Vocab(tgt_lines, 2, reserved_tokens=list(special_tokens))

    tensors = (
        *build_arr_nmt(src_lines, src_vocab, num_steps),
        *build_arr_nmt(tgt_lines, tgt_vocab, num_steps),
    )
    dataset = torch.utils.data.TensorDataset(*tensors)
    data_iter = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    return data_iter, src_vocab, tgt_vocab

In [41]:
# Build a small iterator (batch size 2, 8 steps per sentence) and inspect the
# first mini-batch as a sanity check.
# NOTE: this rebinds the module-level `src_vocab` defined in an earlier cell
# to the vocabulary returned by load_data_nmt.
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X: ', X.type(torch.int32))
    print('X\'s valid length: ', X_valid_len)
    print('Y: ', Y.type(torch.int32))
    # Label fixed to match the X label (was "Y' valid length").
    print('Y\'s valid length: ', Y_valid_len)
    break  # one batch is enough; batches are shuffled, so output varies per run


X:  tensor([[30, 38,  4,  3,  1,  1,  1,  1],
        [68, 25,  5,  3,  1,  1,  1,  1]], dtype=torch.int32)
X's valid length:  tensor([4, 4])
Y:  tensor([[ 0,  5,  3,  1,  1,  1,  1,  1],
        [89,  5,  3,  1,  1,  1,  1,  1]], dtype=torch.int32)
Y' valid length:  tensor([3, 3])
