# 基于深度学习的文本分类

使用基于mindspore框架的自然语言处理库mindnlp

主仓库地址：https://github.com/mindspore-lab/mindnlp

# 构造数据集

根据所提供的news20数据集，

In [1]:
"""
news20 load function
"""
import os
import mindspore
from tqdm import tqdm
from mindspore.dataset import GeneratorDataset

mindspore.set_context(device_target="GPU") # set GPU

label_dict = {}
label_count = 0

class News20:
    """
    NEWS dataset source
    """

    def __init__(self, path)->None:
        self.path: str = path
        self._text, self._label = [], []
        self.label_dict = {}
        self.label_count = 0
        self._load()

    def _load(self):
        labels = os.listdir(self.path)
        for label in labels:
            self.label_dict[label] = self.label_count
            self.label_count += 1
            label_path = os.path.join(self.path, label)
            with tqdm(total = len(os.listdir(label_path)), desc = label) as pbar:
                for t in os.listdir(label_path):
                    pbar.update(1)
                    text_path = os.path.join(label_path, t)
                    file = open(text_path, "r" ,encoding="latin-1") # latin-1
                    text = file.read().strip()
                    begin = text.find('\n\n')
                    if 0 < begin: # skip head
                        text = text[begin:]
                    self._text.append(text)
                    self._label.append(self.label_dict[label])

    def __getitem__(self, index):
        return self._text[index], self._label[index]

    def __len__(self):
        return len(self._label)

def news20(source:News20):
    r"""
    Load the news20 dataset
    """
    column_names = ["text", "label"]
    return GeneratorDataset(source = source, column_names = column_names, shuffle = False) # 不shuffle，划分数据集时会进行打乱


In [2]:
# 载入数据
dataset_source = News20(r"/run/determined/workdir/qishunheng/lijiaming/nlp_ex/dataset/20_newsgroup")
print(dataset_source.label_dict)
dataset = news20(dataset_source)

sci.electronics: 100%|██████████| 1000/1000 [00:02<00:00, 480.48it/s]
misc.forsale: 100%|██████████| 1000/1000 [00:01<00:00, 553.81it/s]
rec.sport.hockey: 100%|██████████| 1000/1000 [00:01<00:00, 502.76it/s]
rec.autos: 100%|██████████| 1000/1000 [00:02<00:00, 492.54it/s]
alt.atheism: 100%|██████████| 1000/1000 [00:02<00:00, 465.95it/s]
comp.sys.mac.hardware: 100%|██████████| 1000/1000 [00:01<00:00, 507.54it/s]
talk.politics.mideast: 100%|██████████| 1000/1000 [00:01<00:00, 513.09it/s]
sci.med: 100%|██████████| 1000/1000 [00:01<00:00, 556.65it/s]
rec.motorcycles: 100%|██████████| 1000/1000 [00:02<00:00, 428.16it/s]
soc.religion.christian: 100%|██████████| 997/997 [00:01<00:00, 546.28it/s]
talk.politics.guns: 100%|██████████| 1000/1000 [00:02<00:00, 447.40it/s]
sci.crypt: 100%|██████████| 1000/1000 [00:02<00:00, 493.32it/s]
rec.sport.baseball: 100%|██████████| 1000/1000 [00:01<00:00, 516.73it/s]
comp.windows.x: 100%|██████████| 1000/1000 [00:01<00:00, 546.93it/s]
talk.religion.misc: 100%

{'sci.electronics': 0, 'misc.forsale': 1, 'rec.sport.hockey': 2, 'rec.autos': 3, 'alt.atheism': 4, 'comp.sys.mac.hardware': 5, 'talk.politics.mideast': 6, 'sci.med': 7, 'rec.motorcycles': 8, 'soc.religion.christian': 9, 'talk.politics.guns': 10, 'sci.crypt': 11, 'rec.sport.baseball': 12, 'comp.windows.x': 13, 'talk.religion.misc': 14, 'sci.space': 15, 'comp.os.ms-windows.misc': 16, 'comp.graphics': 17, 'talk.politics.misc': 18, 'comp.sys.ibm.pc.hardware': 19}





In [3]:
# define Models & Loss & Optimizer
hidden_size = 256
output_size = 20
num_layers = 2
bidirectional = False
drop = 0.5
lr = 0.001

In [4]:
# 数据预处理
import mindspore
from mindspore.dataset import text, transforms
from mindnlp.modules import Glove
from mindnlp.dataset.transforms import BasicTokenizer
from mindnlp.dataset import process

def data_process(dataset, tokenizer, vocab, batch_size = 64, max_len = 2000, drop_remainder = False):
    """
    预处理
    """
    # print("!!!!!!!!!!!!!!!!!!!!!!!")
    dataset = dataset.map([tokenizer], 'text')
    lookup_op = text.Lookup(vocab, unknown_token = '<unk>')
    dataset = dataset.map([lookup_op], 'text')
    pad_value = vocab.tokens_to_ids('<pad>')
    pad_op = transforms.PadEnd([max_len], pad_value)
    dataset = dataset.map([pad_op], 'text')
    onehot_op = transforms.OneHot(num_classes=20)
    dataset = dataset.map([onehot_op], 'label')
    type_cast_op = transforms.TypeCast(mindspore.float32)
    dataset = dataset.map([type_cast_op], 'label')

    dataset = dataset.batch(batch_size, drop_remainder = drop_remainder)
    return dataset

tokenizer = BasicTokenizer(True)
embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop, root = r"/run/determined/workdir/qishunheng/lijiaming")

dataset_process = data_process(dataset, tokenizer, vocab) # 其余使用默认参数

In [5]:
# 划分数据集
train_dataset, test_dataset = dataset_process.split([0.8, 0.2])

In [32]:
# 构建网络
import math

from mindspore import nn
from mindspore import ops
from mindspore.common.initializer import Uniform, HeUniform
from mindnlp.abc import Seq2vecModel

# class Head(nn.Cell):
#     """
#     Head for Classification model
#     """
#     def __init__(self, hidden_dim, output_dim, dropout):
#         super().__init__()
#         weight_init = HeUniform(math.sqrt(5))
#         bias_init = Uniform(10 / math.sqrt(hidden_dim * 2))
#         self.fc = nn.Dense(hidden_dim * 2, output_dim, weight_init=weight_init, bias_init=bias_init)
#         self.softmax = nn.Softmax()
#         self.dropout = nn.Dropout(1 - dropout)

#     def construct(self, context):
#         context = ops.concat((context[-2, :, :], context[-1, :, :]), axis=1)
#         context = self.dropout(context)
#         return self.softmax(self.fc(context))


# class Classification(Seq2vecModel):
#     """
#     Classification model
#     """
#     def __init__(self, encoder, head):
#         super().__init__(encoder, head)
#         self.encoder = encoder
#         self.head = head

#     def construct(self, text):
#         _, (hidden, _), _ = self.encoder(text)
#         output = self.head(hidden)
#         return output
from mindspore.ops import operations as P
class Classification(nn.Cell):
    def __init__(self) -> None:
        super(Classification, self).__init__()
        self.embedding = embedding
        self.lstm = nn.LSTM(100, hidden_size, num_layers=num_layers, batch_first=True, dropout=drop)
        self.dense1 = nn.Dense(hidden_size, 64)
        self.relu = nn.ReLU()
        self.dense2 = nn.Dense(64, 20)
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(0.2)
    def construct(self, text):
        text = self.embedding(text) # 64,2000,100
        print(text.shape)
        text, _ =self.lstm(text) # 64,256
        print(text.shape)
        text = self.dense1(text) # 64,64
        print(text.shape)
        text = self.relu(text) # 64,64
        print(text.shape)
        text = self.dropout(text) # 64,64
        print(text.shape)
        text = self.dense2(text) # 64,20
        print(text.shape)
        text = self.softmax(text) # 64,20
        print(text.shape)
        return text


In [33]:
t = Classification()
d = next(train_dataset.create_dict_iterator())
l = d['label']
d = d['text']
print(d.shape)
d = t.construct(d)
print(d.shape)

(64, 2000)
(64, 2000, 100)
(64, 2000, 256)
(64, 2000, 64)
(64, 2000, 64)
(64, 2000, 64)
(64, 2000, 20)
(64, 2000, 20)
(64, 2000, 20)


In [34]:
print(l.shape)
print(d.shape)

(64, 20)
(64, 2000, 20)


In [35]:
from mindnlp.modules import RNNEncoder
from mindnlp.engine.metrics import Accuracy
from mindnlp.engine.trainer import Trainer

# lstm_layer = nn.LSTM(100, hidden_size, num_layers=num_layers, batch_first=True,
#                      dropout=drop, bidirectional=bidirectional)
# encoder = RNNEncoder(embedding, lstm_layer)
# head = Head(hidden_size, output_size, drop)

# net = Classification(encoder, head)
net = Classification()
loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(net.trainable_params(), learning_rate=lr)

# define metrics
metric = Accuracy()

# define trainer
trainer = Trainer(network=net, train_dataset = train_dataset, eval_dataset = test_dataset, metrics=metric,
                  epochs=5, loss_fn=loss, optimizer=optimizer)
trainer.run(tgt_columns="label", jit=True)
print("end train")

[ERROR] ANALYZER(82727,7fb39c805740,python):2022-12-17-03:26:46.807.736 [mindspore/ccsrc/pipeline/jit/static_analysis/async_eval_result.cc:66] HandleException] Exception happened, check the information as below.

The function call stack (See file '/run/determined/workdir/qishunheng/lijiaming/nlp_ex/rank_0/om/analyze_fail.dat' for more details. Get instructions about `analyze_fail.dat` at https://www.mindspore.cn/search?inputValue=analyze_fail.dat):
# 0 In file /run/determined/workdir/qishunheng/lijiaming/nlp_ex/mindnlp/engine/trainer.py:188
            (loss, _), grads = grad_fn(inputs, labels)
                               ^
# 1 In file /opt/conda/envs/mindspore_gpu/lib/python3.7/site-packages/mindspore/ops/composite/base.py:557
                        return grad_(fn_, weights)(*args, **kwargs)
                               ^
# 2 In file /opt/conda/envs/mindspore_gpu/lib/python3.7/site-packages/mindspore/ops/composite/base.py:501
            outputs = fn(*args)
                    

TypeError: For 'CrossEntropyLoss', the 'labels' should be Int32, but got 'mindspore.float32'.

In [None]:
from mindspore import Tensor
import numpy as np
net = nn.LSTM(10, 16, 2, has_bias=True, batch_first=True, bidirectional=False)
x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
output, (hn, cn) = net(x)
print(output.shape)
print(hn.shape)
print(hn[0])
print(hn[1])


(3, 5, 16)
(2, 3, 16)
[[-0.14228317  0.30269402  0.12158372  0.04126829  0.41130123 -0.4313172
  -0.18862338 -0.09080917  0.35060525  0.23949553 -0.02401329 -0.3763776
   0.42173102 -0.41364804  0.09906068 -0.16490573]
 [-0.14228317  0.30269402  0.12158372  0.04126829  0.41130123 -0.4313172
  -0.18862338 -0.09080917  0.35060525  0.23949553 -0.02401329 -0.3763776
   0.42173102 -0.41364804  0.09906068 -0.16490573]
 [-0.14228317  0.30269402  0.12158372  0.04126829  0.41130123 -0.4313172
  -0.18862338 -0.09080917  0.35060525  0.23949553 -0.02401329 -0.3763776
   0.42173102 -0.41364804  0.09906068 -0.16490573]]
[[-0.02160999  0.03143664 -0.0843491   0.06857267 -0.07436486 -0.12577607
  -0.08058252  0.08969515  0.04423256  0.17154948  0.03809654  0.08853893
   0.09167541  0.10662405  0.05174439  0.11218565]
 [-0.02160999  0.03143664 -0.0843491   0.06857267 -0.07436486 -0.12577607
  -0.08058252  0.08969515  0.04423256  0.17154948  0.03809654  0.08853893
   0.09167541  0.10662405  0.05174439  