In [8]:
# 文本预处理处理教学
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# 示例文本
texts = [
    "I love programming!",
    "I hate bugs",
    "Programming is amazing",
    "C++ is awesome but I prefer python",
]

# 创建分词器，限制词汇表为前 5 个高频单词，选取词汇表中索引号严格小于5的单词组成新的索引表
tokenizer = Tokenizer(num_words=5, oov_token="<OOV>")

tokenizer.fit_on_texts(
    texts
)  # 构建完整词汇表，出现频率大的数字被分配为更小的索引号，出现频率相同的词，谁先出现谁索引号小，而OOV默认分配为1

# 查看完整的词汇表
print("完整词汇表：", tokenizer.word_index)
print("全部计数结果:", tokenizer.word_counts)

# 将文本转为序列
sequences = tokenizer.texts_to_sequences(texts)
print("序列：", sequences)

# 将序列还原为文本
text = tokenizer.sequences_to_texts(sequences)
print("文本：", text)

# 确保赐予序列的长度一致，便于后续的embedding向量化处理
padded = pad_sequences(sequences)  # 可设置 max_len 规定矩阵长度
print("词袋化后的结果：\n", padded)

完整词汇表： {'<OOV>': 1, 'i': 2, 'programming': 3, 'is': 4, 'love': 5, 'hate': 6, 'bugs': 7, 'amazing': 8, 'c': 9, 'awesome': 10, 'but': 11, 'prefer': 12, 'python': 13}
全部计数结果: OrderedDict([('i', 3), ('love', 1), ('programming', 2), ('hate', 1), ('bugs', 1), ('is', 2), ('amazing', 1), ('c', 1), ('awesome', 1), ('but', 1), ('prefer', 1), ('python', 1)])
序列： [[2, 1, 3], [2, 1, 1], [3, 4, 1], [1, 4, 1, 1, 2, 1, 1]]
文本： ['i <OOV> programming', 'i <OOV> <OOV>', 'programming is <OOV>', '<OOV> is <OOV> <OOV> i <OOV> <OOV>']
词袋化后的结果：
 [[0 0 0 0 2 1 3]
 [0 0 0 0 2 1 1]
 [0 0 0 0 3 4 1]
 [1 4 1 1 2 1 1]]


In [None]:
sentences = ["I love NLP", "RNN is awesome"]

tokenized_sentences = [sentence.lower().split() for sentence in sentences]
print(tokenized_sentences)
print(
    sum(tokenized_sentences, [])
)  # 以空列表为起始值，从tokenized_sentences中逐个取出数字添加


[['i', 'love', 'nlp'], ['rnn', 'is', 'awesome']]
['i', 'love', 'nlp', 'rnn', 'is', 'awesome']


In [None]:
import torch
import torch.nn as nn

# 示例句子
sentences = ["I love NLP", "RNN is awesome"]

# 1. 分词和构建词典
tokenized_sentences = [sentence.lower().split() for sentence in sentences]
vocab = {
    word: idx for idx, word in enumerate(set(sum(tokenized_sentences, [])), start=1)
}
vocab["<pad>"] = 0  # 添加填充符

# 2. 转换为索引
indexed_sentences = [
    [vocab[word] for word in sentence] for sentence in tokenized_sentences
]

# 3. 填充序列（补齐至最长长度）
max_length = max(len(seq) for seq in indexed_sentences)
padded_sentences = [
    seq + [vocab["<pad>"]] * (max_length - len(seq)) for seq in indexed_sentences
]

# 转换为张量 (batch_size, seq_length)
inputs = torch.tensor(padded_sentences)
print("转化后的张量为:", inputs)

embed_size = 4  # 嵌入向量的维度
embedding = nn.Embedding(
    len(vocab), embed_size
)  # 第一个参数是告诉Embedding需要为几个单词创建词向量，后续根据每个单词的索引去匹配它们对应的词向量
embedded_inputs = embedding(
    inputs
)  # (batch_size, seq_length, embed_size)  # 对每一个token都根据embed_size进行embedding的操作，转化为纵轴是单词，横轴是向量形式
print("向量化后结果是:\n", embedded_inputs)


转化后的张量为: tensor([[2, 3, 6],
        [4, 1, 5]])
向量化后结果是:
 tensor([[[ 0.5419, -0.0278, -0.0237,  0.4598],
         [ 0.6662,  0.9331, -0.6065, -1.9131],
         [ 1.5078,  0.5253,  0.4281, -0.2084]],

        [[-0.4010,  0.4451, -0.2506, -0.2782],
         [-0.0503,  2.3237,  0.1001, -0.1376],
         [-0.6257, -0.3146, -0.3975, -1.5889]]], grad_fn=<EmbeddingBackward0>)


In [None]:
# 定义单层 RNN
hidden_size = 5  # 隐藏状态的维度
rnn = nn.RNN(
    input_size=embed_size, hidden_size=hidden_size, batch_first=True
)  # 形状为 (batch_size, seq_len, input_size)，seq_len表示一句话的时间步数

# 初始化隐藏状态 (num_layers=1, batch_size=2, hidden_size=5)，每一层RNN(年级)使用的隐藏层不一样，每个批次(科目)的数据也是不同的隐藏层
h0 = torch.zeros(1, inputs.size(0), hidden_size)
print(h0)


tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])


In [18]:
# 前向传播
output, hn = rnn(embedded_inputs, h0)

# 打印结果
print("输出为:", output)
print("隐藏层为:", hn)

print("输入形状:", embedded_inputs.shape)  # (batch_size, seq_length, embed_size)
print("输出形状 (output):", output.shape)  # (batch_size, seq_length, hidden_size)
print("最后隐藏状态 (hn):", hn.shape)  # (num_layers, batch_size, hidden_size)


输出为: tensor([[[-0.4887,  0.4980, -0.0680, -0.0752, -0.3592],
         [-0.8153,  0.1253, -0.9064,  0.0532, -0.1003],
         [-0.1322,  0.7004, -0.4755, -0.1472, -0.5657]],

        [[-0.7067,  0.0172, -0.0317,  0.2895, -0.1139],
         [-0.6437, -0.4387, -0.6508,  0.6505, -0.6509],
         [-0.8207,  0.3801,  0.2546, -0.3612,  0.3170]]],
       grad_fn=<TransposeBackward1>)
隐藏层为: tensor([[[-0.1322,  0.7004, -0.4755, -0.1472, -0.5657],
         [-0.8207,  0.3801,  0.2546, -0.3612,  0.3170]]],
       grad_fn=<StackBackward0>)
输入形状: torch.Size([2, 3, 4])
输出形状 (output): torch.Size([2, 3, 5])
最后隐藏状态 (hn): torch.Size([1, 2, 5])


In [19]:
liner = nn.Linear(hidden_size, 2)
result = liner(output)
print(result)

tensor([[[-0.0312,  0.2926],
         [ 0.1681, -0.0372],
         [ 0.1057,  0.1251]],

        [[-0.1261,  0.3593],
         [ 0.0533,  0.1337],
         [-0.2788,  0.3478]]], grad_fn=<AddBackward0>)


In [2]:
# 多层RNN示例代码
import torch
import torch.nn as nn

# 示例句子和标签
sentences = ["I love NLP", "This movie is bad"]
labels = torch.tensor([1, 0])  # 最终句子的标签

# 分词和词典
tokenized_sentences = [sentence.lower().split() for sentence in sentences]
vocab = {
    word: idx for idx, word in enumerate(set(sum(tokenized_sentences, [])), start=1)
}
vocab["<pad>"] = 0

# 转换为索引
indexed_sentences = [
    [vocab[word] for word in sentence] for sentence in tokenized_sentences
]
max_length = max(len(seq) for seq in indexed_sentences)
padded_sentences = [seq + [0] * (max_length - len(seq)) for seq in indexed_sentences]
inputs = torch.tensor(padded_sentences)  # Shape: (batch_size, seq_length)


# 定义多层 RNN 模型
class MultiLayerRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(
            embed_size, hidden_size, num_layers=num_layers, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # 嵌入层
        output, hidden = self.rnn(x)  # RNN 输出
        print(f"output的值是:{output}")
        print(f"hidden的值是:{hidden}")
        out = self.fc(hidden[-1])  # 取最后一层的隐藏状态作为分类输入
        return out


# 模型初始化
vocab_size = len(vocab)  # 8
embed_size = 8
hidden_size = 16
num_classes = 2
num_layers = 3  # 三层 RNN

model = MultiLayerRNN(vocab_size, embed_size, hidden_size, num_classes, num_layers)

# 前向传播
outputs = model(inputs)
print("模型输出 (分类):", outputs)


output的值是:tensor([[[-0.6493,  0.1630,  0.0098, -0.1999, -0.1375,  0.1564, -0.1979,
           0.1462,  0.1414, -0.1055,  0.0123,  0.2600,  0.2173, -0.0925,
          -0.3207,  0.1899],
         [-0.5886,  0.1960,  0.2016, -0.1176, -0.1485, -0.1782, -0.2423,
           0.1649,  0.2419, -0.1088, -0.1592,  0.2728,  0.3155,  0.0618,
          -0.5090,  0.2603],
         [-0.6759,  0.3456,  0.3108, -0.4451,  0.0052,  0.5187, -0.2579,
           0.2178,  0.4540, -0.1011, -0.3092,  0.4792,  0.3551,  0.1834,
          -0.2979, -0.2227],
         [-0.5810,  0.4812,  0.1900, -0.3839,  0.0947,  0.3237, -0.5739,
           0.3276,  0.2959,  0.0030, -0.1377,  0.4063,  0.5563,  0.1789,
          -0.2867, -0.2339]],

        [[-0.5326,  0.1938,  0.0674, -0.1837, -0.1406,  0.0770, -0.0851,
           0.0753, -0.0088, -0.2840, -0.1261,  0.2203,  0.2584, -0.0847,
          -0.3063,  0.3108],
         [-0.6541,  0.1903,  0.2583, -0.2384, -0.1740,  0.0144, -0.3644,
           0.2328,  0.3021, -0.1352, -0.

In [11]:
import torch
import torch.nn as nn

# 示例句子
sentences = ["I love NLP", "This movie is bad", "I hate talking"]
tokenized_sentences = [sentence.lower().split() for sentence in sentences]
vocab = {
    word: idx for idx, word in enumerate(set(sum(tokenized_sentences, [])), start=1)
}
vocab["<pad>"] = 0

# 转换为索引
indexed_sentences = [
    [vocab[word] for word in sentence] for sentence in tokenized_sentences
]
max_length = max(
    len(seq) for seq in indexed_sentences
)  # 求出列表最大长度->进行填补操作
padded_sentences = [
    seq + [0] * (max_length - len(seq)) for seq in indexed_sentences
]  # 使用填充法，将全部的seq_length转化为一致
inputs = torch.tensor(padded_sentences)  # Shape: (batch_size, seq_length)

print("Token化后的形式为:\n", tokenized_sentences)
print("字符字典为:\n", vocab)
print("转化为索引矩阵:\n", indexed_sentences)
print("输入为:\n", inputs)

# 嵌入层
embed_size = 4  # 嵌入向量的维度
embedding = nn.Embedding(len(vocab), embed_size)
embedded_inputs = embedding(inputs)  # (batch_size, seq_length, embed_size)
print("embedding化后的结果是:\n", embedded_inputs)

Token化后的形式为:
 [['i', 'love', 'nlp'], ['this', 'movie', 'is', 'bad'], ['i', 'hate', 'talking']]
字符字典为:
 {'love': 1, 'i': 2, 'nlp': 3, 'talking': 4, 'movie': 5, 'bad': 6, 'hate': 7, 'this': 8, 'is': 9, '<pad>': 0}
转化为索引矩阵:
 [[2, 1, 3], [8, 5, 9, 6], [2, 7, 4]]
输入为:
 tensor([[2, 1, 3, 0],
        [8, 5, 9, 6],
        [2, 7, 4, 0]])
embedding化后的结果是:
 tensor([[[ 0.1061,  0.8229,  0.4422, -0.1745],
         [ 0.8715,  0.1458, -0.8946,  0.6747],
         [-1.2595,  1.6699, -0.1777,  2.6016],
         [ 0.8748, -0.6086, -0.2546,  1.2928]],

        [[ 0.3344,  0.5769,  0.2866, -1.2785],
         [ 0.0943, -0.7924,  1.3047, -0.7229],
         [ 0.4095,  1.9974, -0.4329, -0.8676],
         [-0.4056,  0.8265, -1.7672, -0.5232]],

        [[ 0.1061,  0.8229,  0.4422, -0.1745],
         [ 0.8379, -1.0641, -0.0122, -0.8653],
         [ 1.1144,  1.1977, -0.4135,  1.0867],
         [ 0.8748, -0.6086, -0.2546,  1.2928]]], grad_fn=<EmbeddingBackward0>)
