In [None]:
# 导入本节所需要的模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time
import copy
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors


In [None]:
# Tokenizer: splitting on whitespace is sufficient for this data
def mytokenize(x):
    """Split a string into a list of tokens on whitespace."""
    return x.split()


# Field describing how the review text is processed.
TEXT = data.Field(
    sequential=True,
    tokenize=mytokenize,
    include_lengths=True,  # batch.text becomes a pair; element [0] is fed to the model
    use_vocab=True,
    batch_first=True,
    fix_length=200,  # fixed sequence length of 200 tokens per example
)
# Field describing the label column: already numeric, so no vocabulary and
# no padding / unknown tokens.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    pad_token=None,
    unk_token=None,
)
# Map each CSV column (in file order) to the field that processes it
train_test_fields = [
    ("label", LABEL),  # label column
    ("text", TEXT),    # text column
]
# Read the training and test CSV files.
# NOTE: the classmethod that loads several files at once is `splits`;
# `split` is an instance method that divides an already loaded dataset by
# ratio and does not accept these arguments.
traindata, testdata = data.TabularDataset.splits(
    path="./data/chap6",
    format="csv",
    train="imdb_train.csv",
    fields=train_test_fields,
    test="imdb_test.csv",
    skip_header=True,
)

In [None]:
# Load the pre-trained GloVe word vectors stored under ./data
vec = Vectors(name="glove.6B.100d.txt", cache="./data")

# Build the vocabulary from the training set (capped at 20000 words) and
# attach the pre-trained word vectors to it
TEXT.build_vocab(traindata, max_size=20000, vectors=vec)
LABEL.build_vocab(traindata)

# Wrap the training set and the test set in batch iterators
BATCH_SIZE = 32
train_iter, test_iter = (
    data.BucketIterator(dataset, batch_size=BATCH_SIZE)
    for dataset in (traindata, testdata)
)

In [None]:
class GRUNet(nn.Module):
    """Text classifier: embedding layer -> GRU -> fully connected head."""

    def __init__(self,vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim):
        """
        Args:
            vocab_size: number of entries in the vocabulary.
            embedding_dim: dimensionality of each word vector.
            hidden_dim: number of hidden units in the GRU.
            layer_dim: number of stacked GRU layers.
            output_dim: size of the final output (number of classes).
        """
        super().__init__()
        self.hidden_dim = hidden_dim  # GRU hidden units
        self.layer_dim = layer_dim    # number of GRU layers

        # Token ids -> word vectors
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        # Recurrent encoder; inputs are laid out as (batch, time_step, feature)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=layer_dim,
            batch_first=True,
        )
        # Classification head applied to the GRU output
        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.fc1 = nn.Sequential(*head_layers)

    def forward(self,x):
        """Return the class scores computed from the last time step of the GRU.

        Args:
            x: tensor of token ids passed to the embedding layer.
        """
        embedded = self.embedding(x)
        # sequence_out has shape (batch, time_step, hidden_size); the final
        # hidden state is not used. Omitting the initial hidden state makes
        # the GRU start from zeros.
        sequence_out, _ = self.gru(embedded)
        # Keep only the output at the last time step
        last_step = sequence_out[:, -1]
        return self.fc1(last_step)

In [None]:
# Instantiate the network
vocab_size = len(TEXT.vocab)  # size of the vocabulary built from the training set
embedding_dim = vec.dim       # dimensionality of the pre-trained word vectors
hidden_dim, layer_dim, output_dim = 128, 1, 2
grumodel = GRUNet(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    layer_dim=layer_dim,
    output_dim=output_dim,
)
grumodel

In [None]:
# Use the pre-trained word vectors as the initial value of embedding.weight.
# Tensor.copy_ (trailing underscore) copies in place; tensors have no `copy`
# method, so the previous call raised AttributeError.
grumodel.embedding.weight.data.copy_(TEXT.vocab.vectors)
# Zero the vectors of the special '<unk>' and '<pad>' tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
grumodel.embedding.weight.data[UNK_IDX] = torch.zeros(vec.dim)
grumodel.embedding.weight.data[PAD_IDX] = torch.zeros(vec.dim)



In [None]:
# Training procedure: fit the model and record per-epoch metrics
def train_model(model,traindataloader,testdataloader,criterion,optimizer,num_epochs=25):
    """Train ``model`` and evaluate it on the test data after every epoch.

    Args:
        model: network to train; called as ``model(batch.text[0])``.
        traindataloader: iterable of training batches exposing ``.text``
            (indexed with ``[0]`` to obtain the model input) and ``.label``.
        testdataloader: iterable of evaluation batches with the same layout.
        criterion: loss function, called as ``criterion(output, target)``.
        optimizer: optimizer that updates the parameters of ``model``.
        num_epochs: number of passes over the training data.

    Returns:
        ``(model, train_process)`` where ``train_process`` is a DataFrame with
        one row per epoch and the columns ``epoch``, ``train_loss_all``,
        ``train_acc_all``, ``test_loss_all``, ``test_acc_all`` and
        ``learn_rate``. A metric is NaN for an epoch whose loader yielded no
        samples.
    """
    train_loss_all=[]
    train_acc_all=[]
    test_loss_all=[]
    test_acc_all=[]
    learn_rate=[]
    # Step decay: every 5 epochs the learning rate is multiplied by 0.1
    scheduler=optim.lr_scheduler.StepLR(optimizer,step_size=5,gamma=0.1)
    for epoch in range(num_epochs):
        # Read the learning rate actually in use from the optimizer;
        # scheduler.get_lr() is not meant to be called outside step() and can
        # report a value scaled by gamma twice at decay epochs.
        learn_rate.append(optimizer.param_groups[0]["lr"])
        print('-'*10)
        print('Epoch {}/{} ,Lr:{} '.format(epoch,num_epochs-1,learn_rate[-1]))

        # ---- training phase ----
        train_loss=0.0
        train_corrects=0
        train_num=0
        model.train()
        for batch in traindataloader:
            textdata,target=batch.text[0],batch.label
            out=model(textdata)
            pre_lab=torch.argmax(out,1)  # predicted labels
            loss=criterion(out,target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # criterion value is weighted by batch size so that the epoch
            # figure is a per-sample mean even with a smaller last batch
            train_loss+=loss.item()*len(target)
            train_corrects+=torch.sum(pre_lab==target).item()
            train_num+=len(target)
        # Loss and accuracy of this epoch on the training data
        train_loss_all.append(train_loss/train_num if train_num else float("nan"))
        train_acc_all.append(train_corrects/train_num if train_num else float("nan"))
        print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch,train_loss_all[-1],train_acc_all[-1]))
        scheduler.step()  # update the learning rate once per epoch

        # ---- evaluation phase ----
        test_loss=0.0
        test_corrects=0
        test_num=0
        model.eval()
        with torch.no_grad():  # no parameter update here, so skip autograd bookkeeping
            for batch in testdataloader:
                textdata,target=batch.text[0],batch.label
                out=model(textdata)
                pre_lab=torch.argmax(out,1)
                loss=criterion(out,target)
                test_loss+=loss.item()*len(target)
                test_corrects+=torch.sum(pre_lab==target).item()
                test_num+=len(target)
        # Loss and accuracy of this epoch on the test data
        test_loss_all.append(test_loss/test_num if test_num else float("nan"))
        test_acc_all.append(test_corrects/test_num if test_num else float("nan"))
        print('{} Test Loss: {:.4f} Test Acc: {:.4f}'.format(epoch,test_loss_all[-1],test_acc_all[-1]))

    train_process=pd.DataFrame(
        data={
            "epoch":range(num_epochs),
            "train_loss_all":train_loss_all,
            "train_acc_all":train_acc_all,
            "test_loss_all":test_loss_all,
            "test_acc_all":test_acc_all,
            "learn_rate":learn_rate
        })
    return model,train_process

In [None]:
# Loss function: cross-entropy
loss_func = nn.CrossEntropyLoss()
# Optimizer: RMSprop over all model parameters
optimizer = optim.RMSprop(params=grumodel.parameters(), lr=0.003)
# Train the model for 10 epochs over the whole training set
grumodel, train_process = train_model(
    model=grumodel,
    traindataloader=train_iter,
    testdataloader=test_iter,
    criterion=loss_func,
    optimizer=optimizer,
    num_epochs=10,
)


In [None]:
# Visualise the training process: loss on the left, accuracy on the right
plt.figure(figsize=(18, 6))
epochs = train_process.epoch
# (train column, test column, train legend, test legend, y-axis label)
panels = [
    ("train_loss_all", "test_loss_all", "Train loss", "Test loss", "Loss value"),
    ("train_acc_all", "test_acc_all", "Train acc", "Test acc", "Acc"),
]
for position, (train_col, test_col, train_label, test_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs, getattr(train_process, train_col), "r.-", label=train_label)
    plt.plot(epochs, getattr(train_process, test_col), "bs-", label=test_label)
    plt.legend()
    plt.xlabel("Epoch number", size=13)
    plt.ylabel(y_label, size=13)
plt.show()

In [None]:
# Predict on the test set and compute the accuracy
grumodel.eval()  # switch the model to evaluation mode
test_y_batches = []   # true labels, one tensor per batch
pre_lab_batches = []  # predicted labels, one tensor per batch
with torch.no_grad():  # inference only: do not build autograd graphs
    for batch in test_iter:
        textdata, target = batch.text[0], batch.label.view(-1)
        out = grumodel(textdata)
        test_y_batches.append(target)
        pre_lab_batches.append(torch.argmax(out, 1))
# Concatenate once at the end instead of re-copying the growing tensors on
# every batch
test_y_all = torch.cat(test_y_batches)    # labels of the test set
pre_lab_all = torch.cat(pre_lab_batches)  # predicted labels of the test set
acc = accuracy_score(test_y_all, pre_lab_all)
print("在测试集上的预测精度为:", acc)



