In [1]:
import os
import random
import data_tokenize
import torch
import Utility

In [2]:
#@save
def _read_xhj(token='char'):
    """Load the conversation corpus, tokenize it, and return it shuffled.

    Args:
        token: Tokenization mode forwarded unchanged to
            ``data_tokenize.tokenize`` (defaults to ``'char'``).

    Returns:
        The object produced by ``data_tokenize.tokenize`` after it has been
        shuffled in place with ``random.shuffle``.

    Note:
        No random seed is set here, so the order differs between calls
        unless the caller seeds the ``random`` module beforehand.
    """
    tokenized = data_tokenize.tokenize(data_tokenize.get_convs(), token)
    # Shuffling mutates the list returned by the tokenizer; no copy is made.
    random.shuffle(tokenized)
    return tokenized
# Load the shuffled corpus once with the default 'char' tokens and preview
# the first five entries (each entry holds two token lists; see output below).
xhj_data = _read_xhj()
xhj_data[:5]

Using 'char' as tokens


[[['只', '知', '道', '坡', '姐'],
  ['因',
   '为',
   '我',
   '讨',
   '厌',
   '自',
   '己',
   '那',
   '种',
   '跟',
   '别',
   '人',
   '在',
   '一',
   '起',
   '心',
   '里',
   '却',
   '闪',
   '现',
   '出',
   '另',
   '一',
   '个',
   '人',
   '的',
   '感',
   '觉',
   '我',
   '不',
   '喜',
   '欢',
   '无',
   '法',
   '全',
   '部',
   '投',
   '入',
   '的',
   '我']],
 [['那', '你', '骂', '我', '傻', '逼'], ['拜', '拜']],
 [['现', '在', '变', '冷', '了'], ['哎', '呀', '~', '我', '喜', '欢']],
 [['爱', '我', '别', '走'], ['如', '果', '你', '说', '你', '不', '爱', '我']],
 [['我', '饿', '你', '给', '我', '啃', '个', '鸡', '腿', '嘛'],
  ['主', '人', '我', '错', '了', '呜', '呜', '呜']]]

In [3]:
# Build the vocabulary over the loaded corpus with three reserved special tokens.
# NOTE(review): min_freq=2 presumably drops tokens seen fewer than twice — confirm in data_tokenize.Vocab.
vocab = data_tokenize.Vocab(xhj_data, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])

In [4]:
# Display the size reported by the vocabulary object.
len(vocab)

5408

In [5]:
#@save
def truncate_pad(line, num_steps, padding_token):
    """Force a sequence to exactly ``num_steps`` items.

    Args:
        line: List of items (token indices in this notebook).
        num_steps: Target length.
        padding_token: Value appended to sequences that are too short.

    Returns:
        The first ``num_steps`` items when ``line`` is longer than
        ``num_steps``; otherwise a new list made of ``line`` followed by as
        many ``padding_token`` values as needed to reach ``num_steps``.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep only the leading num_steps items.
        return line[:num_steps]
    # Exact length or too short: append `shortfall` padding values (possibly zero).
    return line + [padding_token] * shortfall

# Demo: convert the first token list of the first entry to indices, then
# pad (or truncate) it to length 10 using the index of '<pad>'.
truncate_pad(vocab[xhj_data[0][0]], 10, vocab['<pad>'])

[99, 51, 48, 2301, 175, 1, 1, 1, 1, 1]

In [6]:
#@save
def build_array_xhj_for_Transformer(lines, vocab, num_steps):
    """Turn tokenized (ask, answer) pairs into fixed-length index tensors.

    Args:
        lines: Sequence of pairs; element 0 of each pair is the "ask" token
            list and element 1 is the "answer" token list.
        vocab: Vocabulary object indexed with ``vocab[...]``.
            NOTE(review): assumes indexing with a nested token list returns
            nested index lists — confirm against data_tokenize.Vocab.
        num_steps: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(ask_array, ask_valid_len, answer_array, answer_valid_len)``.
        Each ``*_array`` has one row of ``num_steps`` indices per pair, and
        each ``*_valid_len`` counts the entries per row that differ from the
        ``'<pad>'`` index.

    Note:
        ``'<eos>'`` is appended before truncation, so a sequence that already
        has ``num_steps`` or more tokens loses its ``'<eos>'`` marker.
    """
    pad_id = vocab['<pad>']
    eos_id = vocab['<eos>']
    indexed_pairs = [vocab[pair] for pair in lines]

    def _side_to_tensor(side):
        # Append <eos>, clip/pad to num_steps, then stack the rows into one tensor.
        rows = []
        for pair in indexed_pairs:
            rows.append(truncate_pad(pair[side] + [eos_id], num_steps, pad_id))
        array = torch.tensor(rows)
        # Valid length = number of non-padding positions in each row.
        valid_len = (array != pad_id).type(torch.int32).sum(1)
        return array, valid_len

    ask_array, ask_valid_len = _side_to_tensor(0)
    answer_array, answer_valid_len = _side_to_tensor(1)
    return ask_array, ask_valid_len, answer_array, answer_valid_len

In [7]:
#@save
def load_data_xhj_for_Transformer(batch_size, num_steps, num_examples=600):
    """Build the batched data loader and vocabulary for the conversation corpus.

    Args:
        batch_size: Forwarded to ``Utility.load_array``.
        num_steps: Fixed sequence length used when building the index tensors.
        num_examples: Accepted for interface compatibility.
            NOTE(review): this argument is never read, so the whole corpus
            is always used — decide whether it should cap the data.

    Returns:
        ``(data_iter, vocab)``: the object returned by ``Utility.load_array``
        over ``(ask_array, ask_valid_len, answer_array, answer_valid_len)``,
        and the vocabulary built from the freshly loaded corpus.
    """
    pairs = _read_xhj()
    vocab = data_tokenize.Vocab(
        pairs, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    # Four-tuple: ask indices, ask lengths, answer indices, answer lengths.
    tensors = build_array_xhj_for_Transformer(pairs, vocab, num_steps)
    return Utility.load_array(tensors, batch_size), vocab

In [8]:
# Smoke test: build a loader with tiny batches and short sequences.
# This rebinds the notebook-level `vocab` to the one built inside the loader.
train_iter, vocab = load_data_xhj_for_Transformer(batch_size=2, num_steps=8)
for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X.type(torch.int32))
    print('valid lengths for X:', X_valid_len)
    print('Y:', Y.type(torch.int32))
    print('valid lengths for Y:', Y_valid_len)
    # Only the first batch is inspected.
    break

Using 'char' as tokens
X: tensor([[ 78, 330, 588,   3,   1,   1,   1,   1],
        [  4,  56,  26,  79,   3,   1,   1,   1]], dtype=torch.int32)
valid lengths for X: tensor([4, 5])
Y: tensor([[ 18,  35,  10,  70, 189,  19,  67,   5],
        [111, 111,  25, 270, 382,   7, 189, 369]], dtype=torch.int32)
valid lengths for Y: tensor([8, 8])
