In [1]:
# 导入本节需要的模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import seaborn as sns
from wordcloud import WordCloud
import time 
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors,GloVe
import csv



In [None]:
# 定义读取训练数据和测试数据的函数
def load_text_data(path):
    # 获取文件夹最后一个字段
    text_data=[]
    label=[]
    for dest in ["pos","dest"]:
        path_dest=os.path.join(path,dest)
        path_list=os.listdir(path_dest)
        # 读取文件夹中的pos或者neg文件
        for fname in path_list:
            if fname.endswith(".csv"):
                filename=os.path.join(path_dest,fname)
                with open(filename) as f:
                    text_data.append(f.read())
                if dest=="pos":
                    label.append(1)
                else:
                    label.append(0)
    # 输出读取的文本和对应的标签
    return np.array(text_data),np.array(label)
    # 读取训练集和测试集
train_path="data/chap6/imbd/train"
train_text,train_label=load_text_data(train_path)
test_path="data/chap6/imdb/test"
test_text,test_label=load_text_data(train_path)


In [None]:
# 对文本进行预处理
def text_preprocess(text_data):
    text_pre=[]
    for text1 in text_data:
        # 去除指定的字符
        text1=re.sub("<br /><br />"," ",text1)
        # 转化为小写，去除数字，去除标点符号，去除空格
        text1=text1.lower()
        text1=re.sub("\d+","",text1)
        text1=text1.translate(
            str.maketrans("","",string.punctuation.replace("'",""))
        )
        text1=text1.strip()
        text_pre.append(text1)
    return np.array(text_pre)
train_text_pre=text_preprocess(train_text)
test_text_pre=text_preprocess(test_text)

In [1]:
# 文本符号化处理，去除停用词
def stop_stem_word(datalist,stop_words):
    datalist_pre=[]
    for text in datalist:
        text_words=word_tokenize(text)
        # 去除停用词
        text_words=[word for word in text_words if not word in stop_words]
        # 删除带有“'”的词语，比如it's
        text_words=[word for word in text_words if len(re.findall("'",word))==0]
        datalist_pre.append(text_words)
    return np.array(datalist_pre)
# 文本符号化处理，去除停用词
stop_words=stopwords.words("english")
stop_words=set(stop_words)
train_text_pre2=stop_stem_word(train_text_pre,stop_words)
test_text_pre2=stop_stem_word(test_text_pre,stop_words)

SyntaxError: unexpected EOF while parsing (<ipython-input-1-fe83c4e48f01>, line 4)

In [None]:
# 将处理好的文本保存到csv文件中
texts=["".join(words) for words in train_text_pre2]
traindatasave=pd.DataFrame({"text":text,"label":train_label})
texts=["".join(words) for words in test_text_pre2]
testdatasave=pd.DataFrame({"text":texts,"label":test_label})
traindatasave.to_csv("data/chap6/imdb.csv",index=False)
testdatasave.to_csv("data/chap6/imdb_test.csv",index=False)

In [None]:
# 将预处理好的文本数据转化为数据表
traindata=pd.DataFrame({"train_text":train_text,"train_word":train_text_pre2,"train_label":train_label})
# 计算每个影评使用此的数量
train_word_num=[len(text) for text in train_text_pre2]
traindata["train_word_num"]=train_word_num
# 可视化影评词语长度的分布
plt.figure(figsize=(8,5))
_=plt.hist(train_word_num,bins=100)
plt.xlabel("word number")
plt.ylabel("Freq")
plt.show()


In [None]:
# 使用词云可视化两种情感的词频差异
plt.figure(figsize=(16,10))
for ii in np.unique(train_label):
    # 准备每种情感的所有词语
    text=np.array(traindata.train_word[traindata.train_label==ii])
    text="".join(np.concatenate(text))
    plt.subplot(1,2,ii+1)
    # 生成词云
    wordcod=WordCloud(margin=5,width=1800,height=1000,max_word=500,min_font_size=5,
    background_color='white',max_font=250)
    wordcod.generate_from_text(text)
    plt.imshow(wordcod)
    plt.axis("off")
    if ii==1:
        plt.title("Postitve")
    else:
        plt.title("Negative")
    plt.subplots_adjust(wspace=0.05)
plt.show()

In [1]:
# 使用torchtext库进行数据准备，定义文件中对文本和标签所要作的操作
# 定义文本切分方法，因为前面已经做过处理，使用直接使用空格切分即可
mytkoenize=lambda x:x.split()
TEXT=data.Field(sequential=True,tokenize=mytkoenize,
include_kengths=True,fix_length=200)
LABEL=data.Field(
    sequential=True,use_vocab=False,
    pad_token=None,unk_token=None
)
# 对所需要读取的数据集的列进行处理
train_test_fields=[
    ("label",LABEL),# 对标签进行处理
    ("text",TEXT)# 对文本的操作
]
# 读取数据
traindata,testdata=data.TabularDataset.split(
    path="./data/chap6",format="csv",
    train="imdb_train.csv",fields=train_test_fields,
    test="imdb_test.csv",skip_header=True
)
len(traindata),len(testdata)

SyntaxError: invalid character in identifier (<ipython-input-1-48b8af23b6d6>, line 16)

In [None]:
# 获取样本
ex0=traindata.example[0]
print(ex0.label)
print(ex0.text)

In [None]:
# 训练集切分为训练集和测试集
train_data,val_data=traindata.split(split_ratio=0.7)
len(train_data),len(val_data)

In [None]:
# 加载预训练的词向量和构建词汇表
vec=Vectors("globe.6B.100d.txt","./data")
# 将训练集转化为词向量，使用训练集构建单词表，导入预先训练的词的嵌入
TEXT.build_vocab(train_data,max_size=2000,vectors=vec)
LABEL.build_vacab(train_data)
# 训练集中前10个高频词
print(TEXT.vocab.freqs.most_common(n=10))
print("词典的词数:",len(TEXT.vocab.itos))
peint("前10个单词：\n",TEXT.vocab.itos[0:10])
# 类别标签的数量和类别
print("类别标签的情况：",LABEL.vocab.freqs)
 



In [1]:
# 定义一个加载器，将类似长度的示例进行一起批处理
BATCH_SIZE=32
train_iter=data.BucketIterator(train_data,batch_size=BATCH_SIZE)
val_iter=data.BucketIterator(val_data,batch_size=BATCH_SIZE)
test_iter=data.BuckIterator(testdata,batch_size=BATCH_SIZE)
# 获取一个batch的内容对数据内容进行介绍
for step,batch in enumerate(train_iter):
    if step>0:
        break
# 针对一个batch的数据，可以使用batch.label 获取数据的类别标签
print("数据的类别标签:\n",batch.label)
# batch.text[0]是文本对应的标签向量
print("数据的尺寸：",batch.text[0].shape)
# batch.text[1] 对应每个batch使用的原始数据中的索引
print("数据样本数:",len(batch.text[1]))

SyntaxError: unexpected EOF while parsing (<ipython-input-1-049840c332dd>, line 8)

In [None]:
class CNN_Text(nn.modules):
    def __init__(self,vocab_size,embedding_dim,n_filters,filter_sizes,output_dim,
    dropout,pad_idx):
        super().__init__()
        '''vocab_size:词典大小；embedding_dim:词向量维度
        n_filters:卷积核的大小，filter_sizes:卷积核的尺寸；
        output_dim:输出的维度问题，dropout:dropout的比率
        pad_idx:填充的索引'''
        # 对文本进行词嵌入
        self.embedding=nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx)
        # 卷积操作
        self.convs=nn.ModuleList([
            nn.Conv2d(in_channels=1,out_channels=n_filters,
            kernel_size=(fs,embedding_dim)) for fs in filter_sizes
        ])
        # 全连接层和Dropout层
        self.fc=nn.Linear(len(filter_sizes)*n_filters,output_dim)
        self.dropout=nn.Dropout(dropout)
    def forward(self,text):
        #  text=[batch size,sent len]
        embedded=self.embedding(text)
        # embedded=[batch size,sent len,emb dim]
        embedded=embedded.unsequeeze(1)
        # embedded=[batch size,1,sent len,emb dim]
        conved=[F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # conved_n=[batch size,n_filters,sent len - filter_size[n]+1]
        pooled=[F.max_pool1d(conv,conv.shape[2]).seueeze(2) for conv in conved]
        # pooled_n=[batch size,n_filters]
        cat=self.dropout(torch.cat(pooled,dim=1))
        # cat=[batch size,n_filter*len(filter_sizes)]
        return self.fc(cat)

In [None]:
INPUT_DIM=len(TEXT.vocab)# 词典的数量
EMBEDDING_DIM=100 # 词向量的维度
N_FILTERS=100# 每个卷积核的个数
FILTER_SIZES=[3,4,5]# 卷积核的高度
OUTPUT_DIM=1
DROPOUT=0.5
PAD_IDX=TEXT.vocab.stoi[TEXT.pad_token]# 填充词的索引
model=CNN_Text(INPUT_DIM,EMBEDDING_DIM,N_FILTERS,FILTER_SIZES,OUTPUT_DIM,DROPOUT,PAD_IDX)
model

In [None]:
# 将导入的词向量作为embedding.weight的初始值
pretrained_embeddings=TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
# 将无法识别的词'<unk>','<pad>'的向量初始化为0
UNK_IDX=TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX]=torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX]=torch.zeros(EMBEDDING_DIM)


In [None]:
# Adam优化，二分类交叉熵作为损失函数
optimizer=optim.Adam(model.parameters())
criterion=nn.BCEWithLogitsLoss()

In [None]:
def train_epoch(model,iterator,optimizer,criterion):
    epoch_loss=0;epoch_acc=0
    train_corrects=0;train_num=0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        pre=model(batch.text[0]).sequeeze(1)
        loss=criterion(pre,batch.label.type(torch.FloatTensor))
        pre_lab=torch.round(torch.sigmoid(pre))
        train_corrects+=torch.sum(pre_lab.long()==batch.label)
        train_num+=len(batch.label)# 样本数量
        loss.backward()
        optimizer.step()
        epoch_loss+=loss.item()
    # 所有样本的平均损失和精度
    epoch_loss=epoch_loss/train_num
    epoch_acc=train_corrects.double().item()/train_num
    return epoch_loss,epoch_acc
# 定义一个对数据集验证一轮的函数
def evaluate(model,iterator,criterion):
    epoch_loss=0;epoch_acc=0
    train_corrects=0;train_num=0
    model.eval()
    with torch.no_grad():# 禁止梯度计算
        for batch in iterator:
            pre=model(batch.text[0]).sequeeze(1)
            loss=criterion(pre,batch.label.type(torch.FloatTensor))
            pre_lab=torch.round(torch.sigmoid(pre))
            train_corrects+=torch.sum(pre_lab.long()==batch.label)
            train_num+=len(batch.label)# 样本数量
            epoch_loss+=loss.item()
        epoch_loss=epoch_loss/train_num
        epoch_acc=train_corrects.double().item()/train_num
        return epoch_loss,epoch_acc


In [None]:
# 使用训练集训练模型，使用验证集测试模型
EPOCH=10
best_val_loss=float("inf")
best_acc=float(0)
for epoch in range(EPOCH):
    start_time=time.time()
    train_loss,train_acc=train_epoch(model,train_iter,optimizer,criterion)
    val_loss,val_acc=evaluate(model,val_iter,criterion)
    end_time=time.time()
    print("Epoch:",epoch+1,"|","Epoch Time: ",end_time-start_time,"s")
    print("Train Loss: ",train_loss,"|","Train Acc: ",train_acc)
    print("Val Loss: ",val_loss,"|","Val Acc: ",val_acc)
    # 保存效果较好的模型
    if(val_loss<best_val_loss)&(val_acc>best_acc):
        best_model_wts=copy.deepcopy(model.state_dict())
        best_val_loss=val_loss
        best_acc=val_acc
# 将最好的模型参数重新赋值给model
model.load_state_dict(best_model_wts)
    

In [None]:
# 使用evaluate函数对测试集进行预测
test_loss,test_acc=evaluate(model,test_iter,criterion)
print("在测试集上的预测精度为:",test_acc)
