### 实现流程：

#### 1. 读取原始数据集（文本集）

#### 2. 文本预处理
* **2.1 清理无用的标点符号**
* **2.2 根据 换行符 \n 分割**
* **2.3 构建字典：单词 --> 索引**
* **2.4 标签 --> 1， 0 转换**
* **2.5 清理文本太短以及过长的样本**
* **2.6 将单词映射为整型**
* **2.7 设定统一的文本长度，对整个文本数据中的每条评论进行填充或截断**

#### 3. 特征工程
* **3.1 array --> tensor**
* **3.2 将数据集分离成：train, val, test 三部分，比例是： 0.8, 0.1, 0.1**
* **3.3 通过DataLoader按批处理数据**

#### 4. 定义网络模型结构

#### 5. 定义超参数

#### 6. 定义训练函数（训练 + 验证）

#### 7. 定义测试函数

#### 8. 定义预测函数


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 1. 加载文本和标签数据

In [3]:
# Read the raw review text; the whole file is loaded as one string.
# The encoding is pinned so the result does not depend on the platform default.
with open("reviews.txt", 'r', encoding="utf-8") as file:
    text = file.read()

In [4]:
len(text) # 共33678267个字符

In [5]:
type(text) # 类型

In [6]:
text[:10] # 显示前10个字符

In [7]:
# Read the label data; the whole file is loaded as one string.
# The encoding is pinned so the result does not depend on the platform default.
with open('labels.txt', 'r', encoding="utf-8") as file:
    labels = file.read()

In [8]:
len(labels) # 共225000个字符

In [8]:
type(labels) # 类型

In [9]:
labels[:10] # 显示前10个字符

### 2. 文本预处理

In [9]:
# 2.1 清理无用的标点符号
from string import punctuation

print("标点符号 : ", punctuation)

In [10]:
# Remove every punctuation character in one C-level pass instead of a
# per-character Python loop over the whole corpus.
clean_text = text.translate(str.maketrans('', '', punctuation))

In [12]:
len(clean_text) # 新的文本字符个数

In [11]:
# 2.2 根据 换行符 \n 分割
clean_text = clean_text.split('\n')

In [12]:
len(clean_text)

In [15]:
clean_text[0]

In [13]:
# 标签 根据 \n 分割
labels = labels.split('\n')

len(labels)

In [14]:
labels[:5]

In [15]:
# 2.3 字典： 单词 --> 索引

# 获取所有评论中的每个单词
words = [word.lower() for sentence in clean_text for word in sentence.split(' ')]

In [16]:
words[:10] # 显示前10个单词

In [17]:
various_words = list(set(words)) # 筛选出所有评论中不同的单词

In [18]:
various_words.remove('') # 清理空字符

In [19]:
len(various_words) # 不同的单词个数

In [20]:
# Build the index -> word dictionary (format: integer : word).
# enumerate starts at 1, so index 0 is never assigned to a word.

int_word = dict(enumerate(various_words, 1))

In [21]:
int_word

In [22]:
# Inverse dictionary (format: word : integer), built by swapping the keys and values of int_word
word_int = {w:int(i) for i, w in int_word.items()}

In [23]:
word_int

In [24]:
# 2.4 标签 --> 1， 0 转换
# positive : 1,  negative : 0

label_int = np.array([1 if x == 'positive' else 0 for x in labels])

In [25]:
len(label_int)

In [26]:
from collections import Counter

Counter(label_int)

In [27]:
# 2.5 清理文本太短以及过长的样本

# 统计文本中，每条评论的长度
sentence_length = [len(sentence.split()) for sentence in clean_text]

In [31]:
counts = Counter(sentence_length) # 统计不同长度的评论

In [32]:
# 最小评论长度
min_sen = min(sorted(counts.items()))

In [33]:
min_sen

In [34]:
# 最大评论长度
max_sen = max(sorted(counts.items()))

In [35]:
max_sen

In [36]:
# 获取 min 和 max 对应的索引

min_index = [i for i, length in enumerate(sentence_length) if length == min_sen[0]]

max_index = [i for i, length in enumerate(sentence_length) if length == max_sen[0]]

In [37]:
min_index

In [38]:
max_index

In [39]:
# 根据索引删除文本中过短或过长的评论

new_text = np.delete(clean_text, min_index)

print("原始文本数量： ", len(clean_text))
print("新文本数量: ", len(new_text))

In [40]:
# Remove the longest reviews as well. max_index holds positions in
# clean_text, so both index lists are deleted from clean_text in one call;
# deleting max_index from the already-shortened new_text could hit shifted
# positions and drop the wrong reviews.
new_text2 = np.delete(clean_text, min_index + max_index)

print("原始文本数量： ", len(new_text))
print("新文本数量: ", len(new_text2))

In [41]:
# Drop the labels that belong to the removed reviews.
# Both index lists refer to positions in the original arrays, so they are
# deleted in a single call; a second delete on the shortened array could
# hit shifted positions.

new_labels = np.delete(label_int, min_index + max_index)

print("原始标签数量： ", len(label_int))
print("新标签数量： ", len(new_labels))

In [42]:
new_text2[0]

In [43]:
# 2.6 将单词映射为整型

# Encode every review as a list of word indices.
# Words are lower-cased before the lookup because the vocabulary was built
# from lower-cased words; a raw-cased word would raise KeyError.
text_ints = [
    [word_int[word.lower()] for word in sentence.split()]
    for sentence in new_text2
]

In [44]:
text_ints[0] # 第一条评论

In [45]:
len(text_ints) # 总的评论数

In [46]:
# 2.7 设定统一的文本长度，对整个文本数据中的每条评论进行填充或截断
# 设定每条评论固定长度为200个单词，不足的评论用0填充，超过的直接截断

def reset_text(text, seq_len):
    """Pad or truncate every encoded review to exactly ``seq_len`` entries.

    Args:
        text: sequence of reviews, each a sequence of integer word indices.
        seq_len: number of columns of the returned matrix.

    Returns:
        A float ``numpy`` array of shape ``(len(text), seq_len)``. Reviews
        shorter than ``seq_len`` are right-padded with 0; longer ones are
        cut after ``seq_len`` entries.
    """
    padded = np.zeros((len(text), seq_len))
    for row, review in enumerate(text):
        keep = min(len(review), seq_len)  # number of tokens that fit in the row
        padded[row, :keep] = review[:keep]
    return padded

In [47]:
dataset = reset_text(text_ints, seq_len=200)

In [48]:
dataset.shape

In [49]:
dataset[0,:]

### 3 数据类型转换

In [50]:
type(dataset)

In [51]:
type(label_int)

In [52]:
import torch
import torch.nn as nn

# 3.1 数据类型转换
dataset_tensor = torch.from_numpy(dataset)
label_tensor = torch.from_numpy(new_labels)

In [53]:
dataset_tensor.shape

In [54]:
label_tensor.shape

In [55]:
# 3.2 Split the data into train / val / test

# Total number of samples
all_samples = len(dataset_tensor)
print("总样本数：",all_samples)

# Fraction of the samples used for training; the rest is halved into val and test
ratio = 0.8
train_size = int(all_samples * ratio) # number of training samples
print("训练样本数：",train_size)

rest_size = all_samples - train_size # samples left for val + test

val_size = int(rest_size * 0.5) # number of validation samples
print("验证样本数：", val_size)

# The test split takes everything after the validation slice, so its size is
# the remainder; int(rest_size * 0.5) would under-report it for an odd rest_size.
test_size = rest_size - val_size # number of test samples
print("测试样本数：", test_size)

In [56]:
# 获取train, val, test 样本

# train
train = dataset_tensor[:train_size]
train_labels = label_tensor[:train_size]

In [57]:
train.shape

In [58]:
train_labels.shape

In [59]:
# 剩余样本
rest_samples = dataset_tensor[train_size:]
rest_labels = label_tensor[train_size:]

In [60]:
# val
val = rest_samples[:val_size]
val_labels = rest_labels[:val_size]

In [61]:
val.shape

In [62]:
val_labels.shape

In [63]:
# test
test = rest_samples[val_size:]
test_labels = rest_labels[val_size:]

In [64]:
test.shape

In [65]:
test_labels.shape

In [66]:
# 3.3 通过DataLoader按批处理数据
from torch.utils.data import TensorDataset, DataLoader

# Wrap the tensors as (review, label) pairs
train_dataset = TensorDataset(train, train_labels)
val_dataset = TensorDataset(val, val_labels)
test_dataset = TensorDataset(test, test_labels)

batch_size = 128
# Batch the data. drop_last=True keeps every batch at exactly batch_size rows,
# so an LSTM hidden state sized for batch_size always fits.
# Only the training data is shuffled: with drop_last=True a shuffled
# evaluation loader would drop a different tail of samples on every run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True, drop_last=True)

In [67]:
# 获取train中的一批数据
data, label = next(iter(train_loader))

In [68]:
data.shape

In [69]:
label.shape

In [70]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

device

### 4. 定义网络模型结构

In [71]:
class sentiment(nn.Module):
    """LSTM sentiment classifier: embedding -> stacked LSTM -> linear -> sigmoid.

    Args:
        input_size: number of rows of the embedding table.
        embedding_dim: size of each word vector.
        hidden_dim: number of features in the LSTM hidden state.
        output_size: number of output units of the final linear layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, embedding_dim, hidden_dim, output_size, num_layers, dropout=0.5):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_dim)  # word-embedding layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of encoded reviews.

        Args:
            x: word indices of shape (batch_size, seq_len); cast to long here.
            hidden: ``(h_0, c_0)`` tuple, each of shape
                (num_layers, batch_size, hidden_dim).

        Returns:
            ``(probs, hidden)``: ``probs`` has shape (batch_size,) and holds
            the sigmoid of the last output unit at the last time step;
            ``hidden`` is the LSTM state after the sequence.
        """
        x = x.long()  # nn.Embedding needs integer indices
        embeds = self.embedding(x)  # (batch_size, seq_len, embedding_dim)
        out, hidden = self.lstm(embeds, hidden)  # out: (batch_size, seq_len, hidden_dim)
        # Only the last time step is returned, so the linear layer and the
        # sigmoid run on that step alone instead of on every step.
        # NOTE(review): reset_text in this file pads on the right, so for
        # short reviews the last step is a padding step - consider left-padding.
        last_step = out[:, -1, :]  # (batch_size, hidden_dim)
        probs = self.sigmoid(self.linear(last_step))  # (batch_size, output_size)
        return probs[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h_0, c_0)`` pair for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's
        parameters, so no global ``device`` variable is needed.
        """
        weight = next(self.parameters()).detach()
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [72]:
# 初始化超参数
input_size = len(word_int) + 1 # 输入（不同的单词个数）
output_size = 1 # 输出
embedding_dim = 400 # 词嵌入维度
hidden_dim = 128 # 隐藏层节点个数
num_layers = 2 # lstm的层数

In [73]:
# 创建模型
model = sentiment(input_size, embedding_dim, hidden_dim, output_size, num_layers)

model

In [74]:
criterion = torch.nn.BCELoss() # 损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # 优化器
num_epochs = 50 # 循环次数

In [75]:
model = model.to(device)

In [76]:
# 定义训练模型
def _run_epoch(model, device, loader, criterion, optimizer=None):
    """Run one pass over ``loader``; update weights only if ``optimizer`` is given.

    Returns:
        ``(losses, n_correct, n_seen)``: the per-batch loss values, the number
        of correctly classified samples and the number of samples processed.
    """
    training = optimizer is not None
    losses, n_correct, n_seen = [], 0, 0
    hs = None
    with torch.set_grad_enabled(training):
        for data, target in loader:
            data = data.to(device)
            target = target.to(device)
            # Create the hidden state for the first batch, and again whenever
            # the batch size changes (e.g. a shorter final batch).
            if hs is None or hs[0].size(1) != data.size(0):
                hs = model.init_hidden(data.size(0))
            if training:
                optimizer.zero_grad()
            output, hs = model(data, hs)
            # Detach so the next batch does not backpropagate through this one.
            hs = tuple(h.detach() for h in hs)
            loss = criterion(output, target.float())
            if training:
                loss.backward()
                optimizer.step()
            losses.append(loss.item())
            # The model outputs probabilities; round them to 0/1 before
            # comparing with the labels.
            n_correct += int((torch.round(output) == target).sum().item())
            n_seen += target.size(0)
    return losses, n_correct, n_seen


def train(model, device, data_loader, criterion, optimizer, num_epochs, val_loader):
    """Train ``model`` and validate it after every epoch.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and
            ``forward(data, hidden) -> (probabilities, hidden)``.
        device: device the batches are moved to.
        data_loader: iterable of ``(data, target)`` training batches.
        criterion: loss taking ``(probabilities, float targets)``.
        optimizer: optimizer updating the parameters of ``model``.
        num_epochs: number of passes over ``data_loader``.
        val_loader: iterable of ``(data, target)`` validation batches.

    Returns:
        A list with one dict per epoch holding ``train_loss``, ``train_acc``,
        ``val_loss`` and ``val_acc``.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss, train_correct, train_seen = _run_epoch(
            model, device, data_loader, criterion, optimizer)

        # Validation: no optimizer, so no gradients and no weight updates.
        model.eval()
        val_loss, val_correct, val_seen = _run_epoch(
            model, device, val_loader, criterion)

        history.append({
            'train_loss': float(np.mean(train_loss)),
            'train_acc': train_correct / max(train_seen, 1),
            'val_loss': float(np.mean(val_loss)),
            'val_acc': val_correct / max(val_seen, 1),
        })
        print(f'Epoch {epoch}/{num_epochs} --- train loss {np.round(np.mean(train_loss), 5)} --- val loss {np.round(np.mean(val_loss),5)}')
    return history

In [77]:
train(model, device, train_loader, criterion, optimizer, num_epochs, val_loader)

In [78]:
# 测试

def test(model, data_loader, device, criterion):
    """Evaluate ``model`` on ``data_loader`` and print per-batch and overall results.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and
            ``forward(data, hidden) -> (probabilities, hidden)``.
        data_loader: iterable of ``(data, target)`` batches.
        device: device the batches are moved to.
        criterion: loss taking ``(probabilities, float targets)``.

    Returns:
        ``(mean_loss, accuracy)`` over the samples that were actually
        processed.
    """
    test_losses = []
    num_correct = 0
    num_seen = 0
    hs = None
    model.eval()
    with torch.no_grad():  # evaluation only: no gradients are needed
        for i, (data, target) in enumerate(data_loader):
            data = data.to(device)
            target = target.to(device)
            # Create the hidden state for the first batch, and again whenever
            # the batch size changes (e.g. a shorter final batch).
            if hs is None or hs[0].size(1) != data.size(0):
                hs = model.init_hidden(data.size(0))
            output, hs = model(data, hs)
            loss = criterion(output, target.float())
            pred = torch.round(output)  # probabilities -> 0 or 1
            test_losses.append(loss.item())
            result = int(pred.eq(target.float().view_as(pred)).sum().item())
            num_correct += result
            num_seen += len(data)
            print(f'Batch {i}')
            print(f'loss : {np.round(np.mean(loss.item()), 3)}')
            print(f'accuracy : {np.round(result / len(data), 3) * 100} %')
            print()
    mean_loss = float(np.mean(test_losses)) if test_losses else float('nan')
    # Divide by the samples actually evaluated: a loader built with
    # drop_last=True skips part of the dataset, so len(dataset) would
    # under-report the accuracy.
    accuracy = num_correct / num_seen if num_seen else 0.0
    print("总的测试损失 test loss : {:.2f}".format(mean_loss))
    print("总的测试准确率 test accuracy : {:.2f}".format(accuracy))
    return mean_loss, accuracy


# The name starts with "test"; opt out of pytest collection in case this
# code is ever imported by a test module.
test.__test__ = False

In [79]:
test(model, test_loader, device, criterion)

### 预测（测试）

In [80]:
# 案例1
text = 'this movie is so amazing. the plot is attractive. and I really like it.'

In [81]:
# 第一步：文本转索引（整数）
from string import punctuation

def converts(text, vocab=None):
    """Turn a raw review string into a list of vocabulary indices.

    Punctuation is removed, the text is split on whitespace and each word is
    lower-cased before the lookup. Words missing from the vocabulary are
    skipped, so arbitrary user text no longer raises KeyError.

    Args:
        text: the review to encode.
        vocab: word -> index mapping; defaults to the module-level
            ``word_int`` dictionary.

    Returns:
        The list of indices of the known words, in order of appearance.
    """
    if vocab is None:
        vocab = word_int
    # Remove punctuation
    new_text = ''.join(char for char in text if char not in punctuation)
    print("new text :\n", new_text)
    # Map the known words to their indices
    lowered = (word.lower() for word in new_text.split())
    text_ints = [vocab[word] for word in lowered if word in vocab]
    print("文本映射为索引：\n", text_ints)
    return text_ints

In [82]:
text_ints = converts(text)

In [83]:
text_ints

In [84]:
# 文本对齐，sequence_length = 200
new_text_ints = reset_text([text_ints], seq_len=200) # 注意这里要添加一个[]，因为，reset_text处理的二维数据

In [85]:
new_text_ints

In [86]:
new_text_ints.shape

In [87]:
# numpy --> tensor
text_tensor = torch.from_numpy(new_text_ints)

print(text_tensor.shape)

In [88]:
# 定义预测函数
def predict(model, text_tensor, device):
    """Classify one encoded review and print the result.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and
            ``forward(data, hidden) -> (probabilities, hidden)``.
        text_tensor: encoded review; ``.item()`` is called on the model
            output, so it must produce a single prediction.
        device: device the input is moved to.

    Returns:
        The predicted class as an int: 1 for positive, 0 otherwise.
    """
    batch_size = text_tensor.size(0)
    model.eval()  # switch off dropout so the prediction is deterministic
    with torch.no_grad():  # inference only: no gradients are needed
        hs = model.init_hidden(batch_size)
        text_tensor = text_tensor.to(device)
        pred, hs = model(text_tensor, hs)
    print("概率值：", pred.item())
    # Convert the probability to 0 or 1
    pred = torch.round(pred)
    print("类别值：", pred.item())
    label = int(pred.item())
    if label == 1:
        print("评论正面")
    else:
        print("评论反面")
    return label

In [89]:
predict(model, text_tensor, device)