In [1]:
#导入MindSpore中的nn模块
import mindspore.nn as nn
#导入MindSpore中的ops模块
import mindspore.ops as ops
#导入MindSpore中ops模块的operations类
from mindspore.ops import operations as P
#导入MindSpore中的Model
from mindspore.train import Model
#配置当前执行环境
from mindspore import context

In [2]:
#使用EasyDict库创建LSTM的配置文件
from easydict import EasyDict as edict
# LSTM CONFIG LSTM的配置项
lstm_cfg = edict({
    'num_classes': 2,
    'learning_rate': 0.1,
    'momentum': 0.9,
    'num_epochs': 20,    #在训练时可以缩小该数字，以缩短训练时间
    'batch_size': 64,
    'embed_size': 500,
    'num_hiddens': 100,
    'num_layers': 2,
    'bidirectional': True,
    'save_checkpoint_steps': 390,
    'keep_checkpoint_max': 10,
    'vocab_size':6
})
#训练参数
args_train = edict({
    'preprocess': 'true',
    'aclimdb_path': "./aclImdb",
    'glove_path': "./glove",
    'preprocess_path': "C:/Users/Administrator/Desktop/28-基于MindSpore实现LSTM算法/preprocessed",
    'ckpt_path': "./",
    'pre_trained': None,
    'device_target': "CPU",
})
#测试参数
args_test = edict({
    'preprocess': 'false',
    'aclimdb_path': "./aclImdb",
    'glove_path': "./glove",
    #'preprocess_path': "./preprocess",
    'preprocess_path': "C:/Users/Administrator/Desktop/28-基于MindSpore实现LSTM算法/preprocessed",
    'ckpt_path': "./lstm-20_390.ckpt",
    'pre_trained': None,
    'device_target':  "CPU",
})

In [3]:
#imdb数据解析
import os
from itertools import chain
import numpy as np
import gensim
imdb_path = r"C:\Users\Administrator\Desktop\28-基于MindSpore实现LSTM算法\aclImdb"
glove_path = r"C:\Users\Administrator\Desktop\28-基于MindSpore实现LSTM算法\glove"
preprocess_path = r"C:\Users\Administrator\Desktop\28-基于MindSpore实现LSTM算法\preprocessed"
class ImdbParser():
    def __init__(self, imdb_path, glove_path, embed_size=300):
        self.__segs = ['train', 'test']
        self.__label_dic = {'pos': 1, 'neg': 0}
        self.__imdb_path = imdb_path
        self.__glove_dim = embed_size
        self.__glove_file = os.path.join(glove_path, 'glove.6B.' + str(self.__glove_dim) + 'd.txt')
# 定义属性值
        self.__imdb_datas = {}
        self.__features = {}
        self.__labels = {}
        self.__vacab = {}
        self.__word2idx = {}
        self.__weight_np = {}
        self.__wvmodel = None
    def parse(self):
#解析imdb数据到内存
        self.__wvmodel = gensim.models.KeyedVectors.load_word2vec_format(self.__glove_file)
        for seg in self.__segs:
            self.__parse_imdb_datas(seg)
            self.__parse_features_and_labels(seg)
            self.__gen_weight_np(seg)
    def __parse_imdb_datas(self, seg):
#从txt中加载数据
        data_lists = []
        for label_name, label_id in self.__label_dic.items():
            sentence_dir = os.path.join(self.__imdb_path, seg, label_name)
            for file in os.listdir(sentence_dir):
                with open(os.path.join(sentence_dir, file), mode='r', encoding='utf8') as f:
                    sentence = f.read().replace('\n', '')
                    data_lists.append([sentence, label_id])
        self.__imdb_datas[seg] = data_lists
    def __parse_features_and_labels(self, seg):
#解析特征与标签
        features = []
        labels = []
        for sentence, label in self.__imdb_datas[seg]:
            features.append(sentence)
            labels.append(label)
        self.__features[seg] = features
        self.__labels[seg] = labels
#更新特征到标记
        self.__updata_features_to_tokenized(seg)
# 解析vacab
        self.__parse_vacab(seg)
#编码特征
        self.__encode_features(seg)
#填充特征
        self.__padding_features(seg)
    def __updata_features_to_tokenized(self, seg):
# 创建一个空列表，用于存储分词后的句子
        tokenized_features = []
# 遍历指定部分的特征中的每个句子
        for sentence in self.__features[seg]:
# 将句子拆分为单词列表，并将每个单词转换为小写
            tokenized_sentence = [word.lower() for word in sentence.split(" ")]
# 将分词后的句子添加到 tokenized_features 列表中
            tokenized_features.append(tokenized_sentence)
# 将分词后的特征更新为原始特征
        self.__features[seg] = tokenized_features
    def __parse_vacab(self, seg):
# 获取分词后的特征
        tokenized_features = self.__features[seg]
# 将分词后的特征压平为单个词汇集
        vocab = set(chain(*tokenized_features))
# 将词汇集存储在 __vacab 属性中
        self.__vacab[seg] = vocab
# 创建一个词汇表字典，将每个单词映射到它在词汇集中的索引值加一
        word_to_idx = {word: i + 1 for i, word in enumerate(vocab)}
# 将特殊单词 <unk> 映射到索引值 0，以避免在解析过程中遇到未知的单词
        word_to_idx['<unk>'] = 0
# 将词汇表字典存储在 __word2idx 属性中
        self.__word2idx[seg] = word_to_idx
    def __encode_features(self, seg):
# 获取训练集上的词汇表字典，用于将单词编码为索引
        word_to_idx = self.__word2idx['train']
        encoded_features = []
# 遍历分词后的特征列表
        for tokenized_sentence in self.__features[seg]:
            encoded_sentence = []
# 遍历每个分词后的句子中的单词
            for word in tokenized_sentence:
# 将单词编码为索引，如果单词不在词汇表中，将其编码为索引值为 0，即 '<unk>' 的索引值
                encoded_sentence.append(word_to_idx.get(word, 0))
# 将编码后的句子添加到编码特征列表中
            encoded_features.append(encoded_sentence)
# 更新分词后的特征列表为编码特征列表
        self.__features[seg] = encoded_features
    def __padding_features(self, seg, maxlen=500, pad=0):
        padded_features = []
# 遍历特征列表中的每个特征
        for feature in self.__features[seg]:
# 如果当前特征长度大于等于 maxlen，则只取前 maxlen 个元素
            if len(feature) >= maxlen:
                padded_feature = feature[:maxlen]
# 否则，在当前特征后面添加 pad 直到特征长度达到 maxlen
            else:
                padded_feature = feature
                while len(padded_feature) < maxlen:
                    padded_feature.append(pad)
# 将填充后的特征添加到填充特征列表中
            padded_features.append(padded_feature)
# 更新分词后的特征列表为填充特征列表
        self.__features[seg] = padded_features
    def __gen_weight_np(self, seg):
        weight_np = np.zeros((len(self.__word2idx[seg]), self.__glove_dim), dtype=np.float32)
        for word, idx in self.__word2idx[seg].items():
            if word not in self.__wvmodel:
                continue
                word_vector = self.__wvmodel.get_vector(word)
                weight_np[idx, :] = word_vector
        weight_file = os.path.join(self.__imdb_path, seg, 'C:/Users/Administrator/Desktop/28-基于MindSpore实现LSTM算法/preprocessed/weight.npy')
        np.save(weight_file, weight_np)
        print("Saved weight file:", weight_file) 
        self.__weight_np[seg] = weight_np
    def get_datas(self, seg):
#返回特征、标签和权重值
        features = np.array(self.__features[seg]).astype(np.int32)
        labels = np.array(self.__labels[seg]).astype(np.int32)
        weight = np.array(self.__weight_np[seg])
        return features, labels, weight
if __name__ == "__main__":
    # 创建 ImdbParser 对象
    parser = ImdbParser(imdb_path, glove_path)

    # 解析数据
    parser.parse()
    # 获取训练集数据
    train_features, train_labels, train_weights = parser.get_datas('train')
    print("Train Features:", train_features)
    print("Train Labels:", train_labels)
    print("Train Weights:", train_weights)

    # 获取测试集数据
    test_features, test_labels, test_weights = parser.get_datas('test')
    print("Test Features:", test_features)
    print("Test Labels:", test_labels)
    print("Test Weights:", test_weights)

Saved weight file: C:/Users/Administrator/Desktop/28-基于MindSpore实现LSTM算法/preprocessed/weight.npy
Saved weight file: C:/Users/Administrator/Desktop/28-基于MindSpore实现LSTM算法/preprocessed/weight.npy
Train Features: [[   843 101308 190657 ...      0      0      0]
 [137928  18339  17857 ...      0      0      0]
 [186950  32579  74349 ...      0      0      0]
 ...
 [ 83743 147545  75199 ...      0      0      0]
 [121126 232050  57228 ...      0      0      0]
 [251249 190657 213461 ...      0      0      0]]
Train Labels: [1 1 1 ... 0 0 0]
Train Weights: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Test Features: [[ 83743 144724  23492 ...      0      0      0]
 [ 80919 179673 218301 ...      0      0      0]
 [ 48615 128921 103689 ...      0      0      0]
 ...
 [160358  36325 107934 ...      0      0      0]
 [ 33094 205948 224495 ...      0      0      0]
 [ 15771      0 124843

In [None]:
import mindspore
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore import context

# 设置运行模式
context.set_context(device_target=args_train.device_target)

# 定义 LSTM 模型
class LSTMModel(nn.Cell):
    def __init__(self):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=128, num_layers=2, has_bias=True, bidirectional=False)
        self.dense = nn.Dense(128, 1)

    def construct(self, x):
        output, _ = self.lstm(x)
        output = output[:, -1, :]
        output = self.dense(output)
        return output

# 创建 LSTM 模型实例
lstm_model = LSTMModel()

# 定义损失函数和优化器
loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
optimizer = nn.Adam(params=lstm_model.trainable_params(), learning_rate=0.01)

# 加载数据
train_features = Tensor(train_features, dtype=mindspore.float32)
train_labels = Tensor(train_labels, dtype=mindspore.int32)
train_weights = Tensor(train_weights, dtype=mindspore.float32)
#data=(train_features, train_labels, train_weights)
# 创建数据集
#train_dataset = ds.GeneratorDataset(source=data,column_names=["data", "label","weights"])
#train_dataset = ds.NumpySlicesDataset([train_features, train_labels, train_weights], column_names=['features', 'labels', 'weights'])
data = [(train_features, train_labels, train_weights)]
train_dataset = ds.NumpySlicesDataset(data)
# 设置保存检查点的配置
config_ck = CheckpointConfig(save_checkpoint_steps=1000, keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix='lstm', directory='./checkpoints', config=config_ck)

# 创建训练模型
model = Model(lstm_model, loss_fn, optimizer, metrics={'accuracy': Accuracy()})

# 训练模型
model.train(epoch=10, train_dataset=train_dataset, callbacks=[ckpoint_cb, LossMonitor()])

# 保存最终模型
model.save_checkpoint('./checkpoints/lstm_final.ckpt')


In [None]:
import mindspore.nn as nn
from mindspore import Tensor

# 加载测试数据
test_features = load_test_features()  # 加载测试特征数据
test_labels = load_test_labels()  # 加载测试标签数据

# 将数据转换为张量
test_features = Tensor(test_features, dtype=mindspore.float32)

# 加载模型
model = LSTMModel()  # 创建LSTM模型实例
model.load_checkpoint("model.ckpt")  # 加载模型参数

# 进行模型推理
output = model(test_features)
predicted_labels = np.argmax(output.asnumpy(), axis=1)  # 获取预测标签

# 计算准确率
accuracy = np.mean(predicted_labels == test_labels)

print("Test Accuracy: {:.2f}%".format(accuracy * 100))
