# 基于深度学习的文本分类

使用基于mindspore框架的自然语言处理库mindnlp

主仓库地址：https://github.com/mindspore-lab/mindnlp

# 构造数据集

根据所提供的news20数据集，

In [1]:
"""
news20 load function
"""
import os
from tqdm import tqdm
from mindspore.dataset import GeneratorDataset

label_dict = {}
label_count = 0

class News20:
    """
    NEWS dataset source
    """

    def __init__(self, path)->None:
        self.path: str = path
        self._text, self._label = [], []
        self.label_dict = {}
        self.label_count = 0
        self._load()

    def _load(self):
        labels = os.listdir(self.path)
        for label in labels:
            self.label_dict[label] = self.label_count
            self.label_count += 1
            label_path = os.path.join(self.path, label)
            with tqdm(total = len(os.listdir(label_path)), desc = label) as pbar:
                for t in os.listdir(label_path):
                    pbar.update(1)
                    text_path = os.path.join(label_path, t)
                    file = open(text_path, "r" ,encoding="latin-1") # latin-1
                    text = file.read().strip()
                    begin = text.find('\n\n')
                    if 0 < begin: # skip head
                        text = text[begin:]
                    self._text.append(text)
                    self._label.append(self.label_dict[label])

    def __getitem__(self, index):
        return self._text[index], self._label[index]

    def __len__(self):
        return len(self._label)

def news20(source:News20):
    r"""
    Load the news20 dataset
    """
    column_names = ["text", "label"]
    return GeneratorDataset(source = source, column_names = column_names, shuffle = False) # 不shuffle，划分数据集时会进行打乱


In [2]:
# 载入数据
dataset_source = News20(r"G:\_CQU\3-1\自然语言处理\实验\实验二\NLP\news20\20_newsgroup")
print(dataset_source.label_dict)
dataset = news20(dataset_source)

alt.atheism: 100%|██████████| 1000/1000 [00:00<00:00, 10898.66it/s]
comp.graphics: 100%|██████████| 1000/1000 [00:00<00:00, 11186.60it/s]
comp.os.ms-windows.misc: 100%|██████████| 1000/1000 [00:00<00:00, 11936.70it/s]
comp.sys.ibm.pc.hardware: 100%|██████████| 1000/1000 [00:00<00:00, 11796.33it/s]
comp.sys.mac.hardware: 100%|██████████| 1000/1000 [00:00<00:00, 11936.70it/s]
comp.windows.x: 100%|██████████| 1000/1000 [00:00<00:00, 11526.55it/s]
misc.forsale: 100%|██████████| 1000/1000 [00:00<00:00, 11140.84it/s]
rec.autos: 100%|██████████| 1000/1000 [00:00<00:00, 11936.66it/s]
rec.motorcycles: 100%|██████████| 1000/1000 [00:00<00:00, 11934.56it/s]
rec.sport.baseball: 100%|██████████| 1000/1000 [00:00<00:00, 11326.10it/s]
rec.sport.hockey: 100%|██████████| 1000/1000 [00:00<00:00, 11140.69it/s]
sci.crypt: 100%|██████████| 1000/1000 [00:00<00:00, 11394.13it/s]
sci.electronics: 100%|██████████| 1000/1000 [00:00<00:00, 12692.32it/s]
sci.med: 100%|██████████| 1000/1000 [00:00<00:00, 12674.94i

{'alt.atheism': 0, 'comp.graphics': 1, 'comp.os.ms-windows.misc': 2, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'comp.windows.x': 5, 'misc.forsale': 6, 'rec.autos': 7, 'rec.motorcycles': 8, 'rec.sport.baseball': 9, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.electronics': 12, 'sci.med': 13, 'sci.space': 14, 'soc.religion.christian': 15, 'talk.politics.guns': 16, 'talk.politics.mideast': 17, 'talk.politics.misc': 18, 'talk.religion.misc': 19}


In [3]:
# define Models & Loss & Optimizer
hidden_size = 256
output_size = 20
num_layers = 2
bidirectional = True
drop = 0.5
lr = 0.001

In [4]:
# 数据预处理
import mindspore
from mindspore.dataset import text, transforms
from mindnlp.modules import Glove
from mindnlp.dataset.transforms import BasicTokenizer
from mindnlp.dataset import process

def data_process(dataset, tokenizer, vocab, batch_size = 64, max_len = 1000, drop_remainder = False):
    """
    预处理
    """
    # print("!!!!!!!!!!!!!!!!!!!!!!!")
    dataset = dataset.map([tokenizer], 'text')
    lookup_op = text.Lookup(vocab, unknown_token = '<unk>')
    dataset = dataset.map([lookup_op], 'text')
    pad_value = vocab.tokens_to_ids('<pad>')
    pad_op = transforms.PadEnd([max_len], pad_value)
    dataset = dataset.map([pad_op], 'text')
    onehot_op = transforms.OneHot(num_classes=20)
    dataset = dataset.map([onehot_op], 'label')
    type_cast_op = transforms.TypeCast(mindspore.float32)
    dataset = dataset.map([type_cast_op], 'label')

    dataset = dataset.batch(batch_size, drop_remainder = drop_remainder)
    return dataset

tokenizer = BasicTokenizer(True)
embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)

dataset_process = data_process(dataset, tokenizer, vocab) # 其余使用默认参数
# dataset_process = process('imdb', dataset, tokenizer=tokenizer, vocab = vocab, max_len = 1000, drop_remainder = True)

In [5]:
# 划分数据集
train_dataset, test_dataset = dataset_process.split([0.7, 0.3])
print(type(train_dataset))
train_iter = train_dataset.create_dict_iterator()
print(type(train_iter))
print(next(train_iter))

<class 'mindspore.dataset.engine.datasets.TakeDataset'>
<class 'mindspore.dataset.engine.iterators.DictIterator'>
{'text': Tensor(shape=[64, 1000], dtype=Int32, value=
[[  277,    47,    43 ...     1,     1,     1],
 [    8,  1761, 19797 ...     1,     1,     1],
 [ 2751,  2751,    43 ...     1,     1,     1],
 ...
 [ 2751,  2751,   262 ...     1,     1,     1],
 [ 8609,     0,     3 ...     1,     1,     1],
 [   88,  1546,   277 ...     1,     1,     1]]), 'label': Tensor(shape=[64, 20], dtype=Float32, value=
[[0.00000000e+000, 0.00000000e+000, 0.00000000e+000 ... 0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
 [0.00000000e+000, 0.00000000e+000, 0.00000000e+000 ... 0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
 [0.00000000e+000, 0.00000000e+000, 0.00000000e+000 ... 0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
 ...
 [0.00000000e+000, 0.00000000e+000, 0.00000000e+000 ... 0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
 [0.00000000e+000, 0.00000000e+000, 0.00000000

In [6]:
# 构建网络
import math

from mindspore import nn
from mindspore import ops
from mindspore.common.initializer import Uniform, HeUniform
from mindnlp.abc import Seq2vecModel

class Head(nn.Cell):
    """
    Head for Classification model
    """
    def __init__(self, hidden_dim, output_dim, dropout):
        super().__init__()
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(hidden_dim * 2))
        self.fc = nn.Dense(hidden_dim * 2, output_dim, weight_init=weight_init, bias_init=bias_init)
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1 - dropout)

    def construct(self, context):
        context = ops.concat((context[-2, :, :], context[-1, :, :]), axis=1)
        context = self.dropout(context)
        return self.softmax(self.fc(context))


class Classification(Seq2vecModel):
    """
    Classification model
    """
    def __init__(self, encoder, head):
        super().__init__(encoder, head)
        self.encoder = encoder
        self.head = head

    def construct(self, text):
        _, (hidden, _), _ = self.encoder(text)
        output = self.head(hidden)
        return output

In [7]:
from mindnlp.modules import RNNEncoder
from mindnlp.engine.metrics import Accuracy
from mindnlp.engine.trainer import Trainer

lstm_layer = nn.LSTM(100, hidden_size, num_layers=num_layers, batch_first=True,
                     dropout=drop, bidirectional=bidirectional)
encoder = RNNEncoder(embedding, lstm_layer)
head = Head(hidden_size, output_size, drop)

net = Classification(encoder, head)
loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(net.trainable_params(), learning_rate=lr)

# define metrics
metric = Accuracy()

# define trainer
trainer = Trainer(network=net, train_dataset = dataset_process, eval_dataset = dataset_process, metrics=metric,
                  epochs=5, loss_fn=loss, optimizer=optimizer)
trainer.run(tgt_columns="label", jit=False)
print("end train")

Epoch 0:  11%|█         | 34/313 [20:27<2:53:27, 37.30s/it, loss=2.7716334]




Epoch 0:  11%|█         | 34/313 [21:04<2:52:53, 37.18s/it, loss=2.7716334]


KeyboardInterrupt: 

In [None]:
train_dataset.create_dict_iterator()
next(train_dataset.create_dict_iterator())

RuntimeError: Unexpected error. map operation: [Lookup] failed. Lookup: input is not string datatype.
Line of code : 28
File         : mindspore\ccsrc\minddata\dataset\text\kernels\lookup_op.cc


In [None]:
PATH = r"G:\_CQU\3-1\自然语言处理\实验\实验二\news20\20_newsgroup"
LABELS = os.listdir(PATH)
ALT = os.path.join(PATH,LABELS[0])
TEXT_PATH = os.path.join(ALT,os.listdir(ALT)[0])
TEXT_PATH

'G:\\_CQU\\3-1\\自然语言处理\\实验\\实验二\\news20\\20_newsgroup\\alt.atheism\\49960'

In [None]:
file = open(TEXT_PATH, "r",encoding="latin-1")
text = file.read()
begin = text.find('\n\n') # skip header
if 0 < begin:
    text = text[begin:]
text



'\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n                              Atheist Resources\n\n                      Addresses of Atheist Organizations\n\n                                     USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to:  FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\nEvolution Designs sell the "Darwin fish".  It\'s a fish symbol, like the ones\nChristians stick on their cars, but with feet and the word "Darwin" written\ninside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.\n\nWrite to:  Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,\n           CA 91605.\n\nPeople in the San Francisco Bay area can get Darwin Fish from Lynn Gold --\ntry mailing <figmo@netcom.com>.  For net 