In [1]:
from matplotlib import pyplot as plt
import torch
from torch.utils.data import TensorDataset,DataLoader
import numpy as np
import pandas as pd
import warnings
import time
import seaborn
from torch import nn
from typing import Optional,Union,List
from torchvision import transforms
import re

#### 文本读取

In [2]:
# Load the corpus once and keep two views of it:
#   text_list - one string per line (line endings preserved)
#   text_str  - the whole file as a single string
# A file object can only be consumed once per open(), so instead of opening the
# file a second time the full text is rebuilt by joining the lines; the result
# is exactly what file.read() would have returned.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("timemachine.txt", 'r', encoding="utf-8") as file:
    text_list = file.readlines()
text_str = ''.join(text_list)


text_list[10],text_str[:100]

('twinkled, and his usually pale face was flushed and animated. The\n',
 'The Time Machine, by H. G. Wells [1898]\n\n\n\n\nI\n\n\nThe Time Traveller (for so it will be convenient to ')

#### 文本预处理

In [3]:
# Normalise the text with a regular expression: every run of characters that
# are not ASCII letters collapses into one space, surrounding whitespace is
# trimmed, and everything is lower-cased.
def _normalize_text(raw):
    """Return `raw` with non-letter runs replaced by a space, stripped and lower-cased."""
    collapsed = re.sub('[^A-Za-z]+', ' ', raw)
    return collapsed.strip().lower()

text_list = list(map(_normalize_text, text_list))
text_str = _normalize_text(text_str)
text_list[10], text_str[:100]

('twinkled and his usually pale face was flushed and animated the',
 'the time machine by h g wells i the time traveller for so it will be convenient to speak of him was ')

#### 构建词元

In [4]:
def tokenize(text:Union[List[str],str], token='word'):
    """Split text into tokens, either whole words or single characters.

    Args:
        text: One string, or a list of strings (one per line). Only an exact
            ``list`` is treated as multi-line input; anything else is handled
            as a single string.
        token: ``'word'`` splits each string on whitespace; ``'char'`` yields
            every character of each string.

    Returns:
        A flat token list for a single string, or one token list per line for
        list input.

    Raises:
        TypeError: If ``token`` is neither ``'word'`` nor ``'char'``.
    """
    # Pick the per-string splitter first, so an unknown granularity is
    # rejected before any text is processed.
    if token == 'word':
        def split_one(piece):
            return piece.split()
    elif token == 'char':
        def split_one(piece):
            return list(piece)
    else:
        raise TypeError("未知词元类型")

    if type(text) is list:
        return [split_one(line) for line in text]
    return split_one(text)

# Character-level tokens for both corpus views.
tokens_list_bychar = tokenize(text_list, token='char')
tokens_str_bychar = tokenize(text_str, token='char')
# Consistency check: the first 29 characters of the whole-text tokens should
# equal the tokens of the first line.
tokens_str_bychar[:29] == tokens_list_bychar[0]

True

In [5]:
# Word-level tokens for both corpus views ('word' is tokenize's default).
tokens_list_byword = tokenize(text_list, token='word')
tokens_str_byword = tokenize(text_str, token='word')
# Consistency check: the first 7 words of the whole-text tokens should equal
# the tokens of the first line.
tokens_str_byword[:7] == tokens_list_byword[0]

True

#### 构建词汇表,词元和索引之间的映射关系

In [15]:
from collections import Counter
class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index layout: the reserved tokens come first (in the order given), then
    ``'<unk>'``, then the corpus tokens in descending frequency order (ties
    keep the order in which the tokens first appeared). Every token occupies
    exactly one index, so ``vocab[vocab.to_tokens(i)] == i`` for every valid
    index ``i``.

    Attributes are documented here rather than inline:
        token_freqs: list of ``(token, count)`` pairs, most frequent first.
        index_to_token: list mapping index -> token.
        token_to_index: dict mapping token -> index.
    """

    def __init__(self, tokens:List, min_freq=0, reserved_tokens:Optional[List]=None) -> None:
        """Count the tokens and assign an index to each one that is kept.

        Args:
            tokens: A flat list of tokens, or a list of token lists (one per
                line); the nested form is flattened before counting.
            min_freq: Tokens that occur fewer than this many times get no
                index of their own and therefore map to ``'<unk>'``.
            reserved_tokens: Extra tokens (list or tuple) placed at the start
                of the index space. ``None`` means no reserved tokens.
        """
        # Flatten a 2-D token list (one list per line) into a single sequence.
        if tokens and isinstance(tokens[0],list):
            tokens = [token for line in tokens for token in line]
        # Frequency of every distinct token, most frequent first.
        counter = Counter(tokens)
        self.token_freqs = sorted(counter.items(),key=lambda x:x[1], reverse=True)

        self.index_to_token = []
        self.token_to_index = {}
        # Reserved tokens first, then the unknown-token placeholder. Going
        # through _register keeps the table free of duplicates even when the
        # caller lists '<unk>' (or repeats a token) in reserved_tokens.
        for token in list(reserved_tokens or []) + ['<unk>']:
            self._register(token)

        for token, freq in self.token_freqs:
            # token_freqs is sorted by descending count, so once one token is
            # too rare every remaining token is too.
            if freq < min_freq:
                break
            self._register(token)

    def _register(self, token):
        """Append `token` to the index table unless it already has an index."""
        if token not in self.token_to_index:
            self.token_to_index[token] = len(self.index_to_token)
            self.index_to_token.append(token)

    def __len__(self):
        """Return the number of indexed tokens, including reserved ones and '<unk>'."""
        return len(self.index_to_token)

    @property
    def unk(self):
        """Index of the ``'<unk>'`` placeholder."""
        return self.token_to_index['<unk>']

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        A single token returns its index, or the ``'<unk>'`` index when the
        token is not in the vocabulary. A list or tuple returns a list with
        the same nesting.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, a slice, or a list/tuple of indices back to tokens.

        An integer returns one token; a slice returns the corresponding
        sub-list of tokens; a list or tuple returns one token per index.
        """
        # Integers and slices can index the underlying list directly.
        if not isinstance(indices,(list,tuple)):
            return self.index_to_token[indices]
        return [self.index_to_token[indice] for indice in indices]

# Build the index-encoded corpus used for training and testing
def build(text, vocab=None, token='word', reserved_tokens=None):
    """Tokenize `text` and encode it as a flat list of vocabulary indices.

    Args:
        text: A single string or a list of strings (one per line).
        vocab: An existing vocabulary to encode with. When ``None`` a new
            ``Vocab`` is built from the tokens of `text`.
        token: Token granularity passed to ``tokenize`` (``'word'`` or ``'char'``).
        reserved_tokens: Reserved tokens for a newly built vocabulary; ignored
            when `vocab` is supplied. ``None`` means no reserved tokens.

    Returns:
        ``(corpus, vocab)`` where `corpus` is a flat list with one index per
        token, and `vocab` is the vocabulary used for the encoding.
    """
    tokens = tokenize(text, token)
    if vocab is None:
        vocab = Vocab(tokens, reserved_tokens=list(reserved_tokens or []))
    # For list input tokenize() returns one token list per line. Indexing the
    # vocabulary with such a list yields a nested list, so the lines are
    # flattened first to keep the corpus one-dimensional for every input form.
    if tokens and isinstance(tokens[0], list):
        tokens = [tk for line in tokens for tk in line]
    corpus = [vocab[tk] for tk in tokens]
    return corpus, vocab

    

        

# Smoke-test Vocab with characters as tokens
temp = Vocab(tokens_list_bychar)
print(temp.token_freqs)
# temp = Vocab(tokens_str_bychar,reserved_tokens=['<pad>','<bos>'])
# print(temp['<unk>','<pad>'])# getitem
# Look tokens up by a slice of indices, then by an explicit index list.
for probe in (slice(1, 5), [1, 4]):
    print(temp.to_tokens(probe))


[(' ', 29927), ('e', 17838), ('t', 13515), ('a', 11704), ('i', 10138), ('n', 9917), ('o', 9758), ('s', 8486), ('h', 8257), ('r', 7674), ('d', 6337), ('l', 6146), ('m', 4043), ('u', 3805), ('c', 3424), ('f', 3354), ('w', 3225), ('g', 3075), ('y', 2679), ('p', 2427), ('b', 1897), ('v', 1295), ('k', 1087), ('x', 236), ('z', 144), ('j', 97), ('q', 95)]
[' ', 'e', 't', 'a']
[' ', 'a']


In [9]:
# Smoke-test Vocab with words as tokens: show the tokens at indices 1-4
# followed by the vocabulary size.
temp = Vocab(tokens_list_byword)
print(temp.to_tokens(slice(1, 5)), len(temp), sep=' ')

['the', 'i', 'and', 'of'] 4580


In [17]:
# Smoke-test build(): encode the whole-text corpus with word tokens (the
# default granularity) and show the first ten indices. The vocabulary that
# build() also returns is discarded here.
X, _ = build(text_str, token='word')
X[:10]

[1, 19, 50, 40, 2183, 2184, 400, 2, 1, 19]