In [1]:
# 下载数据集
import requests
import os
import re

# 定义数据集的 URL 和校验信息
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'  # 假设这是 d2l.DATA_URL 的值
DATA_HUB = {}
DATA_HUB['time_machine'] = (DATA_URL + 'timemachine.txt', '090b5e7e70c295757f55df93cb0a180b9691891a')

def download_time_machine():
    # 使用定义的 URL 来获取文件内容
    url = DATA_HUB['time_machine'][0]
    response = requests.get(url)
    if response.status_code == 200:
        content = response.text
        data_dir = './DataSet'
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        file_path = os.path.join(data_dir, 'timemachine.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print('文件下载成功！')
    else:
        print('文件下载失败！')
download_time_machine() 


# 验证文件
try:
    with open('./DataSet/timemachine.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    print("文件读取成功")
except FileNotFoundError:
    print("文件未找到")
except UnicodeDecodeError:
    print("文件编码错误")



文件下载成功！
文件读取成功


In [None]:
# 读取数据集
import collections
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch


def read_time_machine():
    """将《时间机器》数据集加载到文本行列表中。"""
    with open("./DataSet/timemachine.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
    return [re.sub("[^A-Za-z]+", " ", line).strip().lower() for line in lines]


lines = read_time_machine()
print(f"# 文本总行数: {len(lines)}")
print(lines[0])
print(lines[10])

In [None]:
# 数据集分词
# 下面的`tokenize`函数将文本行列表（`lines`）作为输入，
# 列表中的每个元素是一个文本序列（如一条文本行）。
# [**每个文本序列又被拆分成一个词元列表**]，*词元*（token）是文本的基本单位。
# 最后，返回一个由词元列表组成的列表，其中的每个词元都是一个字符串（string）。


def tokenize(lines, token="word"):  # @save
    """将文本行拆分为单词或字符词元"""
    if token == "word":
        return [line.split() for line in lines]
    elif token == "char":
        return [list(line) for line in lines]
    else:
        print("错误：未知词元类型：" + token)

In [None]:
# 将数据集构建词汇表
# 词元的类型是字符串，而模型需要的输入是数字，因此这种类型不方便模型使用。
# 现在，让我们[**构建一个字典，通常也叫做*词表*（vocabulary），
# 用来将字符串类型的词元映射到从$0$开始的数字索引中**]。
# 我们先将训练集中的所有文档合并在一起，对它们的唯一词元进行统计，
# 得到的统计结果称之为*语料*（corpus）。
# 然后根据每个唯一词元的出现频率，为其分配一个数字索引。
# 很少出现的词元通常被移除，这可以降低复杂性。
# 另外，语料库中不存在或已删除的任何词元都将映射到一个特定的未知词元“&lt;unk&gt;”。
# 我们可以选择增加一个列表，用于保存那些被保留的词元，
# 例如：填充词元（“&lt;pad&gt;”）；
# 序列开始词元（“&lt;bos&gt;”）；
# 序列结束词元（“&lt;eos&gt;”）。

# 使用的设备
device = torch.device("cuda")


class Vocab:  # @save
    """文本词表"""

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # 按出现频率排序
        counter = count_corpus(tokens)
        self._token_freqs = sorted(
            counter.items(), key=lambda x: x[1], reverse=True)
        # 未知词元的索引为0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx,
                             token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # 未知词元的索引为0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

    @property
    def itos(self):
        """从索引到词元的映射列表"""
        return self.idx_to_token


def count_corpus(tokens):  # @save
    """统计词元的频率"""
    # 这里的tokens是1D列表或2D列表
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 将词元列表展平成一个列表
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)


# 加载语料库
def load_corpus_time_machine(max_tokens=-1):
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab

# 填充与截断
def pad_truncate(sequence, max_length, padding_index=0):
    """对序列进行填充或截断操作。"""
    if len(sequence) < max_length:
        return sequence + [padding_index] * (max_length - len(sequence))
    else:
        return sequence[:max_length]


# 创建数据集
class TimeMachineDataset(Dataset):
    def __init__(self, corpus, max_length):
        self.corpus = [pad_truncate(seq, max_length) for seq in corpus]

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        sequence = self.corpus[idx]
        input_seq = torch.tensor(sequence[:-1])
        target_seq = torch.tensor(sequence[1:])
        return input_seq, target_seq


# 数据预处理
corpus, vocab = load_corpus_time_machine()
# 假设将数据分割成长度为 50 的序列
max_length = 50
sequences = [corpus[i:i+max_length]
             for i in range(0, len(corpus), max_length) if len(corpus[i:i+max_length]) == max_length]


# 创建数据集和数据加载器
dataset = TimeMachineDataset(sequences, max_length)
# 设置 drop_last=True 丢弃最后一个不完整的批次
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

# 使用的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 打印数据加载器中的一个批次
for inputs, targets in dataloader:
    # 将输入和目标数据移动到 CUDA 设备
    inputs = inputs.to(device)
    targets = targets.to(device)
    print(f'输入形状: {inputs.shape}')
    print(f'目标形状: {targets.shape}')
    break

输入形状: torch.Size([32, 49])
目标形状: torch.Size([32, 49])


In [None]:
import torch
from torch.nn import Module
class RNN_Model(Module):
    def __init__(self, vocab_size, num_hiddens, device):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        self.device = device
        self.params = self.get_params()

    def get_params(self):
        num_inputs = num_outputs = self.vocab_size
        def normal(shape):
            return torch.randn(size=shape, device=self.device) * 0.01
        # 输入到隐藏层的权重
        W_xh = normal((num_inputs, self.num_hiddens))
        # 隐藏层到隐藏层的权重
        W_hh = normal((self.num_hiddens, self.num_hiddens))
        # 隐藏层的偏置
        b_h = torch.zeros(self.num_hiddens, device=self.device)
        # 隐藏层到输出层的权重
        W_hq = normal((self.num_hiddens, num_outputs))
        # 输出层的偏置
        b_q = torch.zeros(num_outputs, device=self.device)
        params = [W_xh, W_hh, b_h, W_hq, b_q]
        for param in params:
            param.requires_grad_(True)
        return params
    def init_rnn_state(self, batch_size):
        return (torch.zeros((batch_size, self.num_hiddens), device=self.device),)


    def rnn_forward(self, X, state):
        #print(f"State type: {type(state)}, State shape: {state[0].shape if isinstance(state, tuple) else state.shape}")
        # 从 params 中获取权重和偏置
        W_xh, W_hh, b_h, W_hq, b_q = self.params
        # 初始化 outputs 列表
        outputs = []
        # 初始化隐藏状态 H
        H, = state
        # 交换 X 的维度，使得形状为 (seq_length, batch_size)
        X = X.transpose(0, 1)
        for x in X:
            x = torch.nn.functional.one_hot(x, num_classes=self.vocab_size).float()
            # 确保 x 的数据类型和 W_xh 一致
            x = x.to(W_xh.dtype)
            #print(f"x shape: {x.shape}")
            # 确保 H 的数据类型和 W_hh 一致
            H = H.to(W_hh.dtype)
            H = torch.tanh(torch.mm(x, W_xh) + torch.mm(H, W_hh) + b_h)
            Y = torch.mm(H, W_hq) + b_q
            outputs.append(Y)
        output = torch.stack(outputs, dim=0).transpose(0, 1).contiguous().view(-1, self.vocab_size)
        #print(f"Output shape after rnn_forward: {output.shape}")
        return output, (H,)



    def forward(self, X, state):  # 修改 forward 方法以接受 state 参数
        output, new_state = self.rnn_forward(X, state)
        return output, new_state


def grad_clipping(model, theta):  # @save
    """裁剪梯度"""
    if isinstance(model, torch.nn.Module):
        params = [p for p in model.parameters() if p.requires_grad]
    else:
        params = model.params
    # 筛选出有梯度的参数
    grads = [p.grad for p in params if p.grad is not None]
    if len(grads) == 0:  # 如果没有梯度，直接返回
        return  
    # 使用 torch.stack 和 torch.sum 来计算梯度的平方和
    grad_squared_sum = torch.sum(torch.stack(
        [torch.sum(g ** 2) for g in grads]))
    norm = torch.sqrt(grad_squared_sum)
    if norm > theta:
        for param in params:
            if param.grad is not None:
                param.grad[:] *= theta / norm



import torch.optim as optim

def train(model, train_iter, lr, num_epochs, theta, device):
    optimizer = optim.SGD(model.params, lr=lr)
    loss = torch.nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        state = None
        metric = [0.0] * 2  # 训练损失之和, 词元数量
        for X, Y in train_iter:
            if state is None:
                state = model.init_rnn_state(X.shape[0])
            else:
                for s in state:
                    s.detach_()
            X, Y = X.to(device), Y.to(device)
            output, state = model(X, state)
            # print(f"Output shape before reshape: {output.shape}")
            # print(f"Target shape before reshape: {Y.shape}")
            y = Y.view(-1)  # 调整目标标签形状
            # print(f"Output shape after reshape: {output.shape}")
            # print(f"Target shape after reshape: {y.shape}")
            if output.size(0) != y.size(0):
                raise ValueError(
                    f"Output batch size ({output.size(0)}) does not match target batch size ({y.size(0)}).")
            l = loss(output, y.long()).mean()
            optimizer.zero_grad()
            l.backward()
            grad_clipping(model, theta)
            optimizer.step()
            metric[0] += l * y.numel()
            metric[1] += y.numel()
        if (epoch + 1) % 10 == 0:
            print(f'epoch {epoch + 1}/{num_epochs}, loss {metric[0] / metric[1]:.3f}')

def predict(model, prefix, num_preds, device):
    state = model.init_rnn_state(1)
    outputs = [vocab[prefix[0]]]  # 使用词汇表将字符转换为索引

    def get_input():
        return torch.tensor([outputs[-1]], device=device).reshape((1, 1))

    for y in prefix[1:]:  # 预热期
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # 预测 num_preds 步
        y, state = model(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.itos[index] for index in outputs])

In [None]:
# 1. 定义必要的参数
vocab_size = len(vocab)  # 使用前面加载语料库时生成的词汇表大小
num_hiddens = 256
lr = 1
num_epochs = 200
theta = 0.5
device = torch.device("cuda")

# 2. 实例化模型
model = RNN_Model(vocab_size, num_hiddens, device)

# 3. 调用训练函数
train(model, dataloader, lr, num_epochs, theta, device)


epoch 10/200, loss 2.081
epoch 20/200, loss 1.891
epoch 30/200, loss 1.759
epoch 40/200, loss 1.669
epoch 50/200, loss 1.607
epoch 60/200, loss 1.560
epoch 70/200, loss 1.528
epoch 80/200, loss 1.503
epoch 90/200, loss 1.486
epoch 100/200, loss 1.468
epoch 110/200, loss 1.458
epoch 120/200, loss 1.451
epoch 130/200, loss 1.441
epoch 140/200, loss 1.431
epoch 150/200, loss 1.428
epoch 160/200, loss 1.420
epoch 170/200, loss 1.417
epoch 180/200, loss 1.410
epoch 190/200, loss 1.406
epoch 200/200, loss 1.406


In [6]:
# 4. 调用预测函数
prefix = "time traveller"  # 预测的前缀
num_preds = 100  # 预测的字符数量
result = predict(model, prefix, num_preds, device)
print("预测结果:", result)

预测结果: time traveller s soon and the same the same s see at first the same s suggestion of the struck and conster and exc
