In [55]:
import collections
import re
import torch
import d2l.torch as d2l
from typing import List, Tuple

# 预处理文本

In [56]:
# Register the Time Machine text in d2l's dataset hub as a (URL, checksum) pair.
# The URL has to end with the real remote file name, 'timemachine.txt' (the same
# file name the loader below opens locally); the previous suffix 'time_machine'
# does not name the text file, so fetching this hub entry would fail.
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine(path='../data/timemachine.txt'):
    """Load the Time Machine text as a list of cleaned, lower-cased lines.

    Every run of characters other than ASCII letters is collapsed into a
    single space, then the line is stripped and lower-cased. Blank lines in
    the file are kept as empty strings, so line indices match the file.

    Args:
        path: Location of the text file. Defaults to the path this notebook
            has always used, so existing zero-argument calls are unaffected.

    Returns:
        A list with one cleaned string per line of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    # (which is not UTF-8 on some systems). Undecodable bytes are replaced
    # rather than raising: they are non-letters and are turned into spaces
    # by the regex below anyway.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        # Iterate the file object directly; no need to materialise a second
        # list of raw lines with readlines().
        return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in f]

# Load the cleaned text once and show two sample lines, one per output row.
lines = read_time_machine()
print(lines[0], lines[233], sep='\n')

the time machine by h g wells
unaccountable thing he took one of the small octagonal tables that


# 文本分词

In [57]:
def tokenize(lines: List[str], token: str) -> List[List[str]]:
    """Split every text line into a list of tokens.

    Args:
        lines: Text lines to tokenize.
        token: Granularity of the split: 'word' splits each line on
            whitespace, 'char' splits each line into single characters
            (spaces included).

    Returns:
        One token list per input line, in the same order.

    Raises:
        ValueError: If ``token`` is neither 'word' nor 'char'. Previously an
            unknown value silently produced an empty list, which hid typos
            and led to an empty corpus further down the pipeline.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    raise ValueError(f"unknown token type {token!r}; expected 'word' or 'char'")

# Tokenize the corpus at both granularities and compare the same two sample
# lines side by side (one token list per output row).
tokens_word = tokenize(lines, 'word')
tokens_char = tokenize(lines, 'char')
print(tokens_word[0], tokens_char[0], tokens_word[233], tokens_char[233], sep='\n')

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'b', 'y', ' ', 'h', ' ', 'g', ' ', 'w', 'e', 'l', 'l', 's']
['unaccountable', 'thing', 'he', 'took', 'one', 'of', 'the', 'small', 'octagonal', 'tables', 'that']
['u', 'n', 'a', 'c', 'c', 'o', 'u', 'n', 't', 'a', 'b', 'l', 'e', ' ', 't', 'h', 'i', 'n', 'g', ' ', 'h', 'e', ' ', 't', 'o', 'o', 'k', ' ', 'o', 'n', 'e', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 's', 'm', 'a', 'l', 'l', ' ', 'o', 'c', 't', 'a', 'g', 'o', 'n', 'a', 'l', ' ', 't', 'a', 'b', 'l', 'e', 's', ' ', 't', 'h', 'a', 't']


# 构建词表

In [58]:
class Vocab:
    """Bidirectional mapping between tokens and integer ids.

    Id 0 is always the unknown token, stored as the literal string 'unk'.
    The reserved tokens follow in the order given, then the corpus tokens
    ordered by descending frequency (ties keep first-seen order).

    Attributes set by the constructor:
        token_freq: list of (token, count) pairs, most frequent first.
        unk: id returned for tokens that are not in the vocabulary (0).
        id_to_token: list mapping id -> token.
        token_to_id: dict mapping token -> id.
    """

    def __init__(self, tokens: List[List[str]], min_freq: int, reversed_tokens: List[str]):
        """Build the vocabulary from tokenized lines.

        Args:
            tokens: Tokenized corpus, one list of tokens per line.
            min_freq: Corpus tokens occurring fewer than this many times are
                left out of the vocabulary.
            reversed_tokens: Extra tokens that receive ids right after 'unk',
                whether or not they occur in the corpus.
        """
        flat_tokens = [token for line in tokens for token in line]
        counter = collections.Counter(flat_tokens)
        self.token_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.unk = 0
        self.id_to_token = []
        self.token_to_id = {}
        for token in ['unk', *reversed_tokens]:
            self._add_token(token)
        for token, freq in self.token_freq:
            if freq < min_freq:
                # token_freq is sorted by descending count, so every
                # remaining token is below the threshold as well.
                break
            # A corpus token that is already registered (a reserved token or
            # the literal 'unk') must keep its existing id; appending it
            # again would leave a stale duplicate in id_to_token and inflate
            # len(vocab).
            self._add_token(token)

    def _add_token(self, token):
        """Assign the next free id to ``token`` unless it already has one."""
        if token not in self.token_to_id:
            self.token_to_id[token] = len(self.id_to_token)
            self.id_to_token.append(token)

    def __len__(self):
        """Return the number of distinct tokens, including 'unk'."""
        return len(self.id_to_token)

    def __getitem__(self, item):
        """Convert between tokens and ids.

        * ``str``  -> its id, or ``self.unk`` if the token is unknown.
        * ``int``  -> the token stored at that id (normal list indexing, so
          an out-of-range id raises ``IndexError``).
        * ``list`` / ``tuple`` -> a list with each element converted
          recursively, so nested sequences are supported.

        Any other type yields an empty list.
        """
        if isinstance(item, (list, tuple)):
            return [self[element] for element in item]
        if isinstance(item, str):
            return self.token_to_id.get(item, self.unk)
        if isinstance(item, int):
            return self.id_to_token[item]
        return []


In [59]:
# Word-level vocabulary with no frequency cut-off and no reserved tokens;
# round-trip a nested batch in both directions (tokens -> ids, ids -> tokens).
vocab = Vocab(tokens_word, min_freq=0, reversed_tokens=[])
print(vocab[[['the', 'apple'], ['for', 'man']]], vocab[[[1, 2], [3, 4]]], sep='\n')

[[1, 3139], [16, 65]]
[['the', 'i'], ['and', 'of']]


# 文本编码

In [63]:
def load_corpus_time_machine(max_tokens=-1):
    """Return the character-level id sequence of the text and its vocabulary.

    The text is read, tokenized per character, and every character of every
    line is mapped to its vocabulary id; the ids of all lines are
    concatenated into one flat list.

    Args:
        max_tokens: If positive, keep only the first ``max_tokens`` ids.
            Any other value keeps the whole corpus.

    Returns:
        A ``(corpus, vocab)`` tuple: the flat list of ids and the ``Vocab``
        built from the character tokens.
    """
    char_lines = tokenize(read_time_machine(), 'char')
    vocab = Vocab(char_lines, 0, [])

    corpus = []
    for chars in char_lines:
        for ch in chars:
            corpus.append(vocab[ch])

    if max_tokens > 0:
        return corpus[:max_tokens], vocab
    return corpus, vocab

# Build the full character-level corpus, then report its size, the vocabulary
# size, and the first ten ids.
corpus, vocab = load_corpus_time_machine()
print(f'{len(corpus)} {len(vocab)}')
print(corpus[0:10])

170580 28
[3, 9, 2, 1, 3, 5, 13, 2, 1, 13]
