In [1]:
import re
import random
import tarfile
import requests
import numpy as np
import paddle
from paddle.nn import Embedding
import paddle.nn.functional as F
from paddle.nn import LSTM, Embedding, Dropout, Linear

In [2]:
def download():
    # 通过python的requests类，下载存储在
    # https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz的文件
    corpus_url = "https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz"
    web_request = requests.get(corpus_url)
    corpus = web_request.content

    # 将下载的文件写在当前目录的aclImdb_v1.tar.gz文件内
    with open("./aclImdb_v1.tar.gz", "wb") as f:
        f.write(corpus)
    f.close()

download()

In [3]:
def load_imdb(is_training):
    data_set = []


    
    for label in ["pos", "neg"]:
        with tarfile.open("./aclImdb_v1.tar.gz") as tarf:
            path_pattern = "aclImdb/train/" + label + "/.*\.txt$" if is_training \
                else "aclImdb/test/" + label + "/.*\.txt$"
            path_pattern = re.compile(path_pattern)
            tf = tarf.next()
            while tf != None:
                if bool(path_pattern.match(tf.name)):
                    sentence = tarf.extractfile(tf).read().decode()
                    sentence_label = 0 if label == 'neg' else 1
                    data_set.append((sentence, sentence_label)) 
                tf = tarf.next()

    return data_set

train_corpus = load_imdb(True)
test_corpus = load_imdb(False)

for i in range(5):
    print("sentence %d, %s" % (i, train_corpus[i][0]))    
    print("sentence %d, label %d" % (i, train_corpus[i][1]))

sentence 0, Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn't really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I'd have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.
sentence 0, label 1
sentence 1, Zentropa is the most original movie I've seen in years. If you like unique thrillers that are influenced by film noir, then this is just the right cure for all of those Hollywood summer blockbusters clogging the theaters these 

In [4]:
def data_preprocess(corpus):
    data_set = []
    for sentence, sentence_label in corpus:

        sentence = sentence.strip().lower()
        sentence = sentence.split(" ")
        
        data_set.append((sentence, sentence_label))

    return data_set

train_corpus = data_preprocess(train_corpus)
test_corpus = data_preprocess(test_corpus)
print(train_corpus[:5])
print(test_corpus[:5])

[(['zentropa', 'has', 'much', 'in', 'common', 'with', 'the', 'third', 'man,', 'another', 'noir-like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'europe.', 'like', 'ttm,', 'there', 'is', 'much', 'inventive', 'camera', 'work.', 'there', 'is', 'an', 'innocent', 'american', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', "doesn't", 'really', 'understand,', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>but', "i'd", 'have', 'to', 'say', 'that', 'the', 'third', 'man', 'has', 'a', 'more', 'well-crafted', 'storyline.', 'zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect.', 'perhaps', 'this', 'is', 'intentional:', 'it', 'is', 'presented', 'as', 'a', 'dream/nightmare,', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect.', '<br', '/><br', '/>this', 'movie', 'is', 'unrelentingly', 'grim--"noir"', 'in', 'more', 'than', 'one', 'sense;', 'one', 

In [5]:
# 构造词典，统计每个词的频率，并根据频率将每个词转换为一个整数id
def build_dict(corpus):
    word_freq_dict = dict()
    for sentence, _ in corpus:
        for word in sentence:
            if word not in word_freq_dict:
                word_freq_dict[word] = 0
            word_freq_dict[word] += 1

    word_freq_dict = sorted(word_freq_dict.items(), key = lambda x:x[1], reverse = True)
    
    word2id_dict = dict()
    word2id_freq = dict()

    
    word2id_dict['[oov]'] = 0
    word2id_freq[0] = 1e10

    word2id_dict['[pad]'] = 1
    word2id_freq[1] = 1e10

    for word, freq in word_freq_dict:
        word2id_dict[word] = len(word2id_dict)
        word2id_freq[word2id_dict[word]] = freq

    return word2id_freq, word2id_dict

word2id_freq, word2id_dict = build_dict(train_corpus)
vocab_size = len(word2id_freq)
print("there are totoally %d different words in the corpus" % vocab_size)
for _, (word, word_id) in zip(range(10), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" % (word, word_id, word2id_freq[word_id]))

there are totoally 252173 different words in the corpus
word [oov], its id 0, its word freq 10000000000
word [pad], its id 1, its word freq 10000000000
word the, its id 2, its word freq 322174
word a, its id 3, its word freq 159949
word and, its id 4, its word freq 158556
word of, its id 5, its word freq 144459
word to, its id 6, its word freq 133965
word is, its id 7, its word freq 104170
word in, its id 8, its word freq 90521
word i, its id 9, its word freq 70477


In [6]:
# 把语料转换为id序列
def convert_corpus_to_id(corpus, word2id_dict):
    data_set = []
    for sentence, sentence_label in corpus:
        # 将句子中的词逐个替换成id，如果句子中的词不在词表内，则替换成oov
        # 这里需要注意，一般来说我们可能需要查看一下test-set中，句子oov的比例，
        # 如果存在过多oov的情况，那就说明我们的训练数据不足或者切分存在巨大偏差，需要调整
        sentence = [word2id_dict[word] if word in word2id_dict \
                    else word2id_dict['[oov]'] for word in sentence]    
        data_set.append((sentence, sentence_label))
    return data_set

train_corpus = convert_corpus_to_id(train_corpus, word2id_dict)
test_corpus = convert_corpus_to_id(test_corpus, word2id_dict)
print("%d tokens in the corpus" % len(train_corpus))
print(train_corpus[:5])
print(test_corpus[:5])

25000 tokens in the corpus
[([22216, 41, 76, 8, 1136, 17, 2, 874, 979, 167, 69425, 24, 283, 707, 2, 19881, 5, 16628, 11952, 37, 100421, 52, 7, 76, 5733, 415, 912, 52, 7, 32, 1426, 299, 36, 195, 2299, 644, 17, 3, 282, 27, 141, 61, 7447, 4, 555, 25364, 7, 35, 2, 51, 3590, 8, 2691, 17, 2, 69426, 13, 688, 428, 26, 6, 142, 11, 2, 874, 160, 41, 3, 51, 14841, 4458, 22216, 7, 3, 218, 6262, 8, 10, 6919, 382, 10, 7, 100422, 12, 7, 1394, 15, 3, 100423, 4, 242, 12, 104, 5041, 54, 2368, 2, 4828, 109, 13, 255, 20, 7, 32280, 100424, 8, 51, 68, 30, 29571, 30, 102, 1010, 2, 4142, 18952, 11069, 18, 11636, 4, 12644], 1), ([22216, 7, 2, 78, 225, 20, 190, 119, 8, 1043, 46, 25, 37, 1008, 4578, 11, 22, 4379, 31, 24, 9244, 96, 10, 7, 39, 2, 246, 5601, 16, 35, 5, 136, 385, 1901, 11953, 69427, 2, 3689, 124, 2351, 2666, 17339, 100425, 37, 2732, 2, 6821, 26, 1702, 51, 35630, 18, 10, 7, 61, 21, 116, 912, 12, 7, 7006, 191, 99, 6263, 4, 1485, 2, 439, 2239, 5, 1221, 4, 513, 2598, 44, 104, 97, 27, 761, 32281, 5417, 66

In [7]:
# 编写一个迭代器，每次调用这个迭代器都会返回一个新的batch，用于训练或者预测
def build_batch(word2id_dict, corpus, batch_size, epoch_num, max_seq_len, shuffle = True, drop_last = True):

    # 模型将会接受的两个输入：
    # 1. 一个形状为[batch_size, max_seq_len]的张量，sentence_batch，代表了一个mini-batch的句子。
    # 2. 一个形状为[batch_size, 1]的张量，sentence_label_batch，每个元素都是非0即1，代表了每个句子的情感类别（正向或者负向）
    sentence_batch = []
    sentence_label_batch = []

    for _ in range(epoch_num): 

        #每个epoch前都shuffle一下数据，有助于提高模型训练的效果
        #但是对于预测任务，不要做数据shuffle
        if shuffle:
            random.shuffle(corpus)

        for sentence, sentence_label in corpus:
            sentence_sample = sentence[:min(max_seq_len, len(sentence))]
            if len(sentence_sample) < max_seq_len:
                for _ in range(max_seq_len - len(sentence_sample)):
                    sentence_sample.append(word2id_dict['[pad]'])
            
            
            sentence_sample = [[word_id] for word_id in sentence_sample]

            sentence_batch.append(sentence_sample)
            sentence_label_batch.append([sentence_label])

            if len(sentence_batch) == batch_size:
                yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")
                sentence_batch = []
                sentence_label_batch = []
    if not drop_last and len(sentence_batch) > 0:
        yield np.array(sentence_batch).astype("int64"), np.array(sentence_label_batch).astype("int64")

for batch_id, batch in enumerate(build_batch(word2id_dict, train_corpus, batch_size=3, epoch_num=3, max_seq_len=30)):
    print(batch)
    break

(array([[[63923],
        [44729],
        [    2],
        [ 1171],
        [43012],
        [    7],
        [   30],
        [    5],
        [  136],
        [  400],
        [  170],
        [  123],
        [   43],
        [  412],
        [52426],
        [ 1011],
        [    3],
        [  320],
        [  707],
        [    2],
        [   88],
        [  123],
        [    5],
        [   83],
        [  840],
        [    4],
        [   55],
        [    2],
        [  627],
        [  234]],

       [[34913],
        [26050],
        [27037],
        [ 6440],
        [   57],
        [ 1931],
        [    3],
        [   24],
        [   11],
        [26629],
        [  800],
        [   19],
        [    2],
        [19162],
        [  667],
        [  187],
        [    2],
        [15944],
        [    4],
        [   21],
        [  629],
        [11625],
        [31786],
        [   16],
        [    3],
        [   75],
        [  189],
        [   15],
        [  

In [8]:
# 定义一个用于情感分类的网络实例，SentimentClassifier
class SentimentClassifier(paddle.nn.Layer):
    
    def __init__(self, hidden_size, vocab_size, embedding_size, class_num=2, num_steps=128, num_layers=1, init_scale=0.1, dropout_rate=None):
        
        # 参数含义如下：
        # 1.hidden_size，表示embedding-size，hidden和cell向量的维度
        # 2.vocab_size，模型可以考虑的词表大小
        # 3.embedding_size，表示词向量的维度
        # 4.class_num，情感类型个数，可以是2分类，也可以是多分类
        # 5.num_steps，表示这个情感分析模型最大可以考虑的句子长度
        # 6.num_layers，表示网络的层数
        # 7.dropout_rate，表示使用dropout过程中失活的神经元比例
        # 8.init_scale，表示网络内部的参数的初始化范围,长短时记忆网络内部用了很多Tanh，Sigmoid等激活函数，\
        # 这些函数对数值精度非常敏感，因此我们一般只使用比较小的初始化范围，以保证效果
        super(SentimentClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.class_num = class_num
        self.num_steps = num_steps
        self.num_layers = num_layers
        self.dropout_rate = dropout_rate
        self.init_scale = init_scale
       
        # 声明一个LSTM模型，用来把每个句子抽象成向量
        self.simple_lstm_rnn = paddle.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers)

        # 声明一个embedding层，用来把句子中的每个词转换为向量
        self.embedding = paddle.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, sparse=False, 
                                    weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(low=-init_scale, high=init_scale)))
        
        # 声明使用上述语义向量映射到具体情感类别时所需要使用的线性层
        self.cls_fc = paddle.nn.Linear(in_features=self.hidden_size, out_features=self.class_num, 
                             weight_attr=None, bias_attr=None)
        
        # 一般在获取单词的embedding后，会使用dropout层，防止过拟合，提升模型泛化能力
        self.dropout_layer = paddle.nn.Dropout(p=self.dropout_rate, mode='upscale_in_train')

    # forwad函数即为模型前向计算的函数，它有两个输入，分别为：
    # input为输入的训练文本，其shape为[batch_size, max_seq_len]
    # label训练文本对应的情感标签，其shape维[batch_size, 1]
    def forward(self, inputs):
        # 获取输入数据的batch_size
        batch_size = inputs.shape[0]

        # 默认使用1层的LSTM，首先我们需要定义LSTM的初始hidden和cell，这里我们使用0来初始化这个序列的记忆
        init_hidden_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')
        init_cell_data = np.zeros(
            (self.num_layers, batch_size, self.hidden_size), dtype='float32')

        # 将这些初始记忆转换为飞桨可计算的向量，并且设置stop_gradient=True，避免这些向量被更新，从而影响训练效果
        init_hidden = paddle.to_tensor(init_hidden_data)
        init_hidden.stop_gradient = True
        init_cell = paddle.to_tensor(init_cell_data)
        init_cell.stop_gradient = True

        # 对应以上第2步，将输入的句子的mini-batch转换为词向量表示，转换后输入数据shape为[batch_size, max_seq_len, embedding_size]
        x_emb = self.embedding(inputs)
        x_emb = paddle.reshape(x_emb, shape=[-1, self.num_steps, self.embedding_size])
        # 在获取的词向量后添加dropout层
        if self.dropout_rate is not None and self.dropout_rate > 0.0:
            x_emb = self.dropout_layer(x_emb)
        
        # 对应以上第3步，使用LSTM网络，把每个句子转换为语义向量
        # 返回的last_hidden即为最后一个时间步的输出，其shape为[self.num_layers, batch_size, hidden_size]
        rnn_out, (last_hidden, last_cell) = self.simple_lstm_rnn(x_emb, (init_hidden, init_cell))
        # 提取最后一层隐状态作为文本的语义向量，其shape为[batch_size, hidden_size]
        last_hidden = paddle.reshape(last_hidden[-1], shape=[-1, self.hidden_size])

        # 对应以上第4步，将每个句子的向量表示映射到具体的情感类别上, logits的维度为[batch_size, 2]
        logits = self.cls_fc(last_hidden)
        
        return logits

In [9]:
paddle.seed(0)
random.seed(0)
np.random.seed(0)

# 定义训练参数
epoch_num = 5
batch_size = 128

learning_rate = 0.0001
dropout_rate = 0.2
num_layers = 1
hidden_size = 256
embedding_size = 256
max_seq_len = 128
vocab_size = len(word2id_freq)

# 实例化模型
sentiment_classifier = SentimentClassifier(hidden_size, vocab_size, embedding_size,  num_steps=max_seq_len, num_layers=num_layers, dropout_rate=dropout_rate)

# 指定优化策略，更新模型参数
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters= sentiment_classifier.parameters()) 

# 定义训练函数
# 记录训练过程中的损失变化情况，可用于后续画图查看训练情况
losses = []
steps = []

def train(model):
    # 开启模型训练模式
    model.train()
    
    # 建立训练数据生成器，每次迭代生成一个batch，每个batch包含训练文本和文本对应的情感标签
    train_loader = build_batch(word2id_dict, train_corpus, batch_size, epoch_num, max_seq_len)
    
    for step, (sentences, labels) in enumerate(train_loader):
        # 获取数据，并将张量转换为Tensor类型
        sentences = paddle.to_tensor(sentences)
        labels = paddle.to_tensor(labels)
        
        # 前向计算，将数据feed进模型，并得到预测的标签和损失
        logits = model(sentences)

        # 计算损失
        loss = F.cross_entropy(input=logits, label=labels, soft_label=False)
        loss = paddle.mean(loss)

        # 后向传播
        loss.backward()
        # 更新参数
        optimizer.step()
        # 清除梯度
        optimizer.clear_grad()

        if step % 100 == 0:
            # 记录当前步骤的loss变化情况
            losses.append(loss.numpy())
            steps.append(step)
            # 打印当前loss数值
            print("step %d, loss %.3f" % (step, loss.numpy()))

#训练模型
train(sentiment_classifier)

# 保存模型，包含两部分：模型参数和优化器参数
model_name = "sentiment_classifier"
# 保存训练好的模型参数
paddle.save(sentiment_classifier.state_dict(), "{}.pdparams".format(model_name))
# 保存优化器参数，方便后续模型继续训练
paddle.save(optimizer.state_dict(), "{}.pdopt".format(model_name))

step 0, loss 0.687
step 100, loss 0.687
step 200, loss 0.691
step 300, loss 0.698
step 400, loss 0.662
step 500, loss 0.481
step 600, loss 0.261
step 700, loss 0.271
step 800, loss 0.167
step 900, loss 0.093


In [10]:
def evaluate(model):
    # 开启模型测试模式，在该模式下，网络不会进行梯度更新
    model.eval()

    # 定义以上几个统计指标
    tp, tn, fp, fn = 0, 0, 0, 0

    # 构造测试数据生成器
    test_loader = build_batch(word2id_dict, test_corpus, batch_size, 1, max_seq_len)
    
    for sentences, labels in test_loader:
        # 将张量转换为Tensor类型
        sentences = paddle.to_tensor(sentences)
        labels = paddle.to_tensor(labels)
        
        # 获取模型对当前batch的输出结果
        logits = model(sentences)
        
        # 使用softmax进行归一化
        probs = F.softmax(logits)

        # 把输出结果转换为numpy array数组，比较预测结果和对应label之间的关系，并更新tp，tn，fp和fn
        probs = probs.numpy()
        for i in range(len(probs)):
            # 当样本是的真实标签是正例
            if labels[i][0] == 1:
                # 模型预测是正例
                if probs[i][1] > probs[i][0]:
                    tp += 1
                # 模型预测是负例
                else:
                    fn += 1
            # 当样本的真实标签是负例
            else:
                # 模型预测是正例
                if probs[i][1] > probs[i][0]:
                    fp += 1
                # 模型预测是负例
                else:
                    tn += 1

    # 整体准确率
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    # 输出最终评估的模型效果
    print("TP: {}\nFP: {}\nTN: {}\nFN: {}\n".format(tp, fp, tn, fn))
    print("Accuracy: %.4f" % accuracy)

# 加载训练好的模型进行预测，重新实例化一个模型，然后将训练好的模型参数加载到新模型里面
saved_state = paddle.load("./sentiment_classifier.pdparams")
sentiment_classifier = SentimentClassifier(hidden_size, vocab_size, embedding_size,  num_steps=max_seq_len, num_layers=num_layers, dropout_rate=dropout_rate)
sentiment_classifier.load_dict(saved_state)

# 评估模型
evaluate(sentiment_classifier)

TP: 10337
FP: 2343
TN: 10137
FN: 2143

Accuracy: 0.8203


In [17]:
import torch
from torch.utils.data import DataLoader, Dataset
import os
import re
import numpy as np
import pickle
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
 
data_base_path = r'aclImdb'
 
train_batch_size = 64
test_batch_size = 500
max_len = 50

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
# 分词的API
def tokenize(text):
    # fileters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    fileters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
                '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”',
                '“', ]
    text = re.sub("<.*?>", " ", text, flags=re.S)
    text = re.sub("|".join(fileters), " ", text, flags=re.S)
    return [i.strip() for i in text.split()]
 
 
# 自定义的数据集
class ImdbDataset(Dataset):
    def __init__(self, mode):
        super(ImdbDataset, self).__init__()
        if mode == "train":
            text_path = [os.path.join(data_base_path, i) for i in ["train/neg", "train/pos"]]
        else:
            text_path = [os.path.join(data_base_path, i) for i in ["test/neg", "test/pos"]]
 
        self.total_file_path_list = []
        for i in text_path:
            self.total_file_path_list.extend([os.path.join(i, j) for j in os.listdir(i)])
        # print(self.total_file_path_list)
 
    def __getitem__(self, idx):
        cur_path = self.total_file_path_list[idx]
        cur_filename = os.path.basename(cur_path)
        label = int(cur_filename.split("_")[-1].split(".")[0]) - 1
        text = tokenize(open(cur_path, encoding="utf-8").read().strip())
        return label, text
 
    def __len__(self):
        return len(self.total_file_path_list)
 
 
# 自定义的collate_fn方法
def collate_fn(batch):
    batch = list(zip(*batch))
    labels = torch.tensor(batch[0], dtype=torch.int32)
    texts = batch[1]
    texts = torch.tensor([ws.transform(i, max_len) for i in texts])
    del batch
    return labels.long(), texts.long()
 
 
# 获取数据的方法
def get_dataloader(train=True):
    if train:
        mode = 'train'
    else:
        mode = "test"
    dataset = ImdbDataset(mode)
    batch_size = train_batch_size if train else test_batch_size
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
 
 
# Word2Sequence
class Word2Sequence:
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1
 
    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.fited = False
        self.count = {}
 
    def to_index(self, word):
        return self.dict.get(word, self.UNK)
 
    def to_word(self, index):
        if index in self.inversed_dict:
            return self.inversed_dict[index]
        return self.UNK_TAG
 
    def __len__(self):
        return len(self.dict)
 
    def fit(self, sentence):
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1
 
    def build_vocab(self, min_count=None, max_count=None, max_feature=None):
        if min_count is not None:
            self.count = {word: count for word, count in self.count.items() if count >= min_count}
 
        if max_count is not None:
            self.count = {word: count for word, count in self.count.items() if count <= max_count}
 
        if max_feature is not None:
            self.count = dict(sorted(self.count.items(), lambda x: x[-1], reverse=True)[:max_feature])
 
        for word in self.count:
            self.dict[word] = len(self.dict)
 
        self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))
 
    def transform(self, sentence, max_len=None):
        if max_len is not None:
            r = [self.PAD] * max_len
        else:
            r = [self.PAD] * len(sentence)
        if max_len is not None and len(sentence) > max_len:
            sentence = sentence[:max_len]
        for index, word in enumerate(sentence):
            r[index] = self.to_index(word)
        return np.array(r, dtype=np.int64)
 
    def inverse_transform(self, indices):
        sentence = []
        for i in indices:
            word = self.to_word(i)
            sentence.append(word)
        return sentence
 
 
# 建立词表
def fit_save_word_sequence():
    word_to_sequence = Word2Sequence()
    train_path = [os.path.join(data_base_path, i) for i in ["train/neg", "train/pos"]]
    total_file_path_list = []
    for i in train_path:
        total_file_path_list.extend([os.path.join(i, j) for j in os.listdir(i)])
    for cur_path in tqdm(total_file_path_list, ascii=True, desc="fitting"):
        word_to_sequence.fit(tokenize(open(cur_path, encoding="utf-8").read().strip()))
    word_to_sequence.build_vocab()
    pickle.dump(word_to_sequence, open("model/ws.pkl", "wb"))
 
 
ws = pickle.load(open("./model/ws.pkl", "rb"))




In [18]:
fit_save_word_sequence()

fitting: 100%|#################################################################| 25000/25000 [00:11<00:00, 2157.61it/s]


In [19]:

class IMDBModel(nn.Module):
    def __init__(self):
        super(IMDBModel, self).__init__()
        self.hidden_size = 64
        self.embedding_dim = 200
        self.num_layer = 2
        self.bidirectional = True
        self.bi_num = 2 if self.bidirectional else 1
        self.dropout = 0.5
        # 以上部分为超参数，可以自行修改
        self.embedding = nn.Embedding(len(ws), self.embedding_dim, padding_idx=ws.PAD)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_size,
                            self.num_layer, bidirectional=True, dropout=self.dropout)
        self.fc = nn.Linear(self.hidden_size * self.bi_num, 20)
        self.fc2 = nn.Linear(20, 10)
 
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # 进行轴交换
        h_0, c_0 = self.init_hidden_state(x.size(1))
        h_0 = h_0.to(x.device)  # 确保初始隐藏状态与输入数据在同一设备上
        c_0 = c_0.to(x.device)  # 确保初始隐藏状态与输入数据在同一设备上
        _, (h_n, c_n) = self.lstm(x, (h_0, c_0))
        # 只要最后一个lstm单元处理的结果，取前向LSTM和后向LSTM的结果进行简单拼接
        out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1)
        out = self.fc(out)
        out = F.relu(out)
        out = self.fc2(out)
        return F.log_softmax(out, dim=-1)
 
    def init_hidden_state(self, batch_size):
        h_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device)
        c_0 = torch.rand(self.num_layer * self.bi_num, batch_size, self.hidden_size).to(device)
        return h_0, c_0

 
 
imdb_model = IMDBModel()
optimizer = optim.Adam(imdb_model.parameters())
criterion = nn.CrossEntropyLoss()


In [20]:

def train(epoch):
    mode = True
    train_dataloader = get_dataloader(mode)
    for idx, (target, input) in enumerate(train_dataloader):
        optimizer.zero_grad()
        output = imdb_model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, idx * len(input), len(train_dataloader.dataset),
                       100. * idx / len(train_dataloader), loss.item()))
            torch.save(imdb_model.state_dict(), "model/mnist_net_lstm.pkl")
            torch.save(optimizer.state_dict(), 'model/mnist_optimizer_lstm.pkl')

In [21]:

def test():
    test_loss = 0
    correct = 0
    mode = False
    imdb_model.eval()
    test_dataloader = get_dataloader(mode)
    with torch.no_grad():
        for target, input in test_dataloader:
            output = imdb_model(input)
            test_loss += F.nll_loss(output, target, reduction="sum")
            pred = torch.max(output, dim=-1, keepdim=False)[-1]
            correct += pred.eq(target.data).sum()
        test_loss = test_loss / len(test_dataloader.dataset)
        print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_dataloader.dataset),
            100. * correct / len(test_dataloader.dataset)))


In [22]:

if __name__ == '__main__':
    # # 测试数据集的功能

 
    # 训练和测试
    test()
    for i in range(3):
        train(i)
        print(
            "训练第{}轮的测试结果-----------------------------------------------------------------------------------------".format(
                i + 1))
        test()



Test set: Avg. loss: 2.3230, Accuracy: 2363/25000 (9.45%)

训练第1轮的测试结果-----------------------------------------------------------------------------------------

Test set: Avg. loss: 1.9725, Accuracy: 5825/25000 (23.30%)

训练第2轮的测试结果-----------------------------------------------------------------------------------------

Test set: Avg. loss: 1.8695, Accuracy: 7634/25000 (30.54%)

训练第3轮的测试结果-----------------------------------------------------------------------------------------

Test set: Avg. loss: 1.8339, Accuracy: 7846/25000 (31.38%)



In [1]:
import senteval
import numpy as np
import pickle
import os
from six.moves import cPickle
import torch
from sentence_transformers import SentenceTransformer, models

  from .autonotebook import tqdm as notebook_tqdm


In [6]:


# 检查模型是否能够成功加载
try:
    model = torch.load('model/mnist_net_lstm.pkl')
    print("模型加载成功！")
except Exception as e:
    print("模型加载失败:", e)


模型加载成功！


In [17]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

In [21]:
# 加载STSB数据集
train_df = pd.read_csv('C:/Users/78658/SentEval-main/data/downstream/STS/STSBenchmark/sts-train.csv', 
                       delimiter=',', names=['column1', 'column2', ...])  # 替换'column1', 'column2'为实际的列名
test_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-test.csv')
dev_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-dev.csv')


ParserError: Error tokenizing data. C error: Expected 3 fields in line 1235, saw 4


In [22]:
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader

# 加载dev、test和train数据集
dev_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-dev.csv')
test_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-test.csv')
train_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-train.csv')

# 使用BertTokenizer对文本进行编码
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_text(df, tokenizer):
    encodings = tokenizer(df['sentence1'].tolist(), df['sentence2'].tolist(), padding=True, truncation=True, return_tensors='pt')
    labels = torch.tensor(df['similarity_score'].tolist())
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

    return dataset

# 对dev、test和train数据集进行编码
dev_dataset = encode_text(dev_df, tokenizer)
test_dataset = encode_text(test_df, tokenizer)
train_dataset = encode_text(train_df, tokenizer)

# 定义批处理大小并创建数据加载器
batch_size = 32
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 现在可以使用dev_loader、test_loader和train_loader对模型进行训练和评估


ParserError: Error tokenizing data. C error: Expected 1 fields in line 104, saw 2


In [29]:
import pandas as pd
import re 

# 加载dev、test和train数据集
dev_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-dev.csv', delimiter=',')
test_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-test.csv', delimiter=',')
train_df = pd.read_csv(r'C:\Users\78658\SentEval-main\data\downstream\STS\STSBenchmark\sts-train.csv', delimiter=',')

# 示例：使用正则表达式删除换行符
# 修正为对加载的具体数据集进行操作
dev_df['column_name'] = dev_df['column_name'].apply(lambda x: re.sub(r'\n', '', str(x)))

# 数据清理的示例步骤
def tokenize(text):
    # 定义需要过滤的字符列表
    filters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
                '\?', '@', '\[', '\\', '\]', '^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96', '”',
                '“']

    # 使用正则表达式进行文本预处理
    text = re.sub("<.*?>", " ", str(text), flags=re.S)
    text = re.sub("|".join(filters), " ", str(text), flags=re.S)
    return [i.strip() for i in str(text).split()]

# 数据清理的示例步骤
def clean_dataset(df):
    # 处理缺失值
    df.dropna(inplace=True)
    
    # 处理重复值
    df.drop_duplicates(inplace=True)

    # 示例: 对特定列进行文本预处理
    df['text_column'] = df['text_column'].apply(tokenize)
  
    # 进行其他清理操作

    return df

# 对每个数据集文件进行数据清理
cleaned_dev_df = clean_dataset(dev_df)
cleaned_test_df = clean_dataset(test_df)
cleaned_train_df = clean_dataset(train_df)

# 保存处理后的数据集到新文件
cleaned_dev_df.to_csv('cleaned_dev_dataset.csv', index=False)
cleaned_test_df.to_csv('cleaned_test_dataset.csv', index=False)
cleaned_train_df.to_csv('cleaned_train_dataset.csv', index=False)


ParserError: Error tokenizing data. C error: Expected 1 fields in line 104, saw 2
