# 基于深度学习的文本分类

本实验使用基于 MindSpore 框架的自然语言处理库 MindNLP。

主仓库地址：https://github.com/mindspore-lab/mindnlp

# 构造数据集

根据所提供的 news20 数据集，构造可供 MindSpore 使用的 `GeneratorDataset` 数据集：数据集根目录下的每个子文件夹对应一个类别标签，子文件夹中每个以数字命名的文件是一条样本。

In [10]:
"""
news20 load function
"""
import os

import mindspore
import numpy as np
from mindspore.dataset import GeneratorDataset
from tqdm import tqdm

# Select the GPU backend. This has to run after `import mindspore`; calling it
# before the import raises NameError on a fresh interpreter/kernel.
mindspore.set_context(device_target="GPU")

# NOTE(review): these module-level names are not referenced by any visible
# code (News20 keeps its own label_dict / label_count on the instance); they
# are kept in case other cells rely on them.
label_dict = {}
label_count = 0

class News20:
    """
    Random-access source for the news20 corpus.

    The directory at ``path`` is scanned once in the constructor. Every
    sub-directory is treated as one label, and every file inside it whose
    name consists only of digits is read as one sample. Label ids are
    assigned in sorted sub-directory-name order so that the mapping is
    reproducible across runs and platforms.

    Attributes:
        path: root directory that was scanned.
        label_dict: maps sub-directory name -> integer label id.
        label_count: number of label ids assigned.

    Args:
        path: root directory of the corpus.
    """

    def __init__(self, path) -> None:
        self.path: str = path
        self._text, self._label = [], []
        self.label_dict = {}
        self.label_count = 0
        self._load()

    @staticmethod
    def _strip_header(raw):
        """
        Strip surrounding whitespace and drop everything before the first blank line.

        The returned body still starts with the ``'\\n\\n'`` separator. When no
        blank line is found, the stripped text is returned unchanged.
        """
        body = raw.strip()
        cut = body.find('\n\n')
        if cut > 0:
            body = body[cut:]
        return body

    def _load(self):
        """Scan ``self.path`` and fill ``_text``, ``_label`` and ``label_dict``."""
        # os.listdir returns entries in arbitrary order; sort so that the
        # label -> id mapping does not depend on the file system.
        for label in sorted(os.listdir(self.path)):
            label_path = os.path.join(self.path, label)
            if not os.path.isdir(label_path):  # only directories count as labels
                continue
            label_id = self.label_count
            self.label_dict[label] = label_id
            self.label_count += 1

            # The bar advances for every directory entry, including skipped ones.
            for name in tqdm(sorted(os.listdir(label_path)), desc=label):
                if not name.isdigit():
                    continue
                text_path = os.path.join(label_path, name)
                # latin-1 maps every byte to a character, so decoding cannot fail.
                with open(text_path, encoding='latin-1') as file:
                    text = self._strip_header(file.read())
                if text:  # skip files that are empty after stripping
                    self._text.append(text)
                    self._label.append(label_id)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair at ``index`` as two numpy arrays."""
        return np.array(self._text[index], dtype=str), np.array(self._label[index], dtype=int)

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self._label)

def news20(source:News20):
    r"""
    Wrap a News20 source in a GeneratorDataset.

    The resulting dataset has two columns, "text" (string) and "label"
    (int32), is not shuffled and does not use Python multiprocessing.
    """
    # No shuffling here: the data gets shuffled later, when the dataset is split.
    dataset_kwargs = dict(
        source=source,
        column_names=["text", "label"],
        column_types=[mindspore.string, mindspore.int32],
        shuffle=False,
        python_multiprocessing=False,
    )
    return GeneratorDataset(**dataset_kwargs)


In [11]:
# Load the data: scan the corpus directory, show the label -> id mapping,
# then wrap the source in a GeneratorDataset.
dataset_source = News20(r"G:\_CQU\3-1\自然语言处理\实验\实验二\NLP\news20\20_newsgroup")
print(dataset_source.label_dict)
dataset = news20(dataset_source)

alt.atheism: 100%|██████████| 1000/1000 [00:00<00:00, 13368.90it/s]
comp.graphics: 100%|██████████| 1000/1000 [00:00<00:00, 11936.46it/s]
comp.os.ms-windows.misc: 100%|██████████| 1000/1000 [00:00<00:00, 13735.29it/s]
comp.sys.ibm.pc.hardware: 100%|██████████| 1000/1000 [00:00<00:00, 13818.46it/s]
comp.sys.mac.hardware: 100%|██████████| 1000/1000 [00:00<00:00, 14122.05it/s]
comp.windows.x: 100%|██████████| 1000/1000 [00:00<00:00, 12532.66it/s]
misc.forsale: 100%|██████████| 1000/1000 [00:00<00:00, 14122.14it/s]
rec.autos: 100%|██████████| 1000/1000 [00:00<00:00, 13191.79it/s]
rec.motorcycles: 100%|██████████| 1000/1000 [00:00<00:00, 13549.94it/s]
rec.sport.baseball: 100%|██████████| 1000/1000 [00:00<00:00, 14122.24it/s]
rec.sport.hockey: 100%|██████████| 1000/1000 [00:00<00:00, 13926.01it/s]
sci.crypt: 100%|██████████| 1000/1000 [00:00<00:00, 14324.03it/s]
sci.electronics: 100%|██████████| 1000/1000 [00:00<00:00, 14203.58it/s]
sci.med: 100%|██████████| 1000/1000 [00:00<00:00, 14122.33i

{'alt.atheism': 0, 'comp.graphics': 1, 'comp.os.ms-windows.misc': 2, 'comp.sys.ibm.pc.hardware': 3, 'comp.sys.mac.hardware': 4, 'comp.windows.x': 5, 'misc.forsale': 6, 'rec.autos': 7, 'rec.motorcycles': 8, 'rec.sport.baseball': 9, 'rec.sport.hockey': 10, 'sci.crypt': 11, 'sci.electronics': 12, 'sci.med': 13, 'sci.space': 14, 'soc.religion.christian': 15, 'talk.politics.guns': 16, 'talk.politics.mideast': 17, 'talk.politics.misc': 18, 'talk.religion.misc': 19}





In [12]:
# Number of samples loaded (News20.__len__ returns the number of stored labels).
print(len(dataset_source))

19997


In [13]:
# define Models & Loss & Optimizer
hidden_size = 256  # hidden size passed to nn.LSTM and to Head
output_size = 20  # output dimension of Head's Dense layer; label_dict holds 20 labels
num_layers = 2  # num_layers passed to nn.LSTM
bidirectional = True  # passed to nn.LSTM
drop = 0.5  # shared dropout setting passed to Glove.from_pretrained, nn.LSTM and Head
lr = 0.001  # learning rate passed to nn.Adam

In [14]:
# Data preprocessing
import mindspore
from mindspore.dataset import text, transforms
from mindnlp.modules import Glove
from mindnlp.dataset.transforms import BasicTokenizer
from mindnlp.dataset import process

# NOTE(review): the positional True is presumably the lower-casing flag — confirm against BasicTokenizer's signature.
tokenizer = BasicTokenizer(True)
# Pretrained GloVe embedding layer plus its vocabulary, with "<unk>" and "<pad>" added as special tokens.
# The 100 here is presumably the embedding dimension (nn.LSTM is later built with input size 100) — confirm.
embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)

In [15]:
def data_process(dataset, tokenizer, vocab, batch_size = 64, max_len = 1000, drop_remainder = False, num_classes = 20):
    """
    Turn a raw ("text", "label") dataset into batched model inputs.

    The "text" column is tokenized, mapped to vocabulary ids (tokens missing
    from the vocabulary map to '<unk>') and passed through PadEnd with shape
    ``[max_len]`` and the '<pad>' id as fill value. The "label" column is
    one-hot encoded and cast to float32. Finally the dataset is batched.

    Args:
        dataset: dataset with a "text" and a "label" column.
        tokenizer: operation applied to the "text" column first.
        vocab: vocabulary used for the id lookup and to resolve '<pad>'.
        batch_size: number of samples per batch.
        max_len: target length handed to PadEnd.
        drop_remainder: forwarded to ``dataset.batch``.
        num_classes: width of the one-hot label vector; defaults to 20,
            the value that used to be hard-coded.

    Returns:
        The processed, batched dataset.
    """
    # "text": tokens -> ids -> fixed length
    dataset = dataset.map([tokenizer], 'text')
    dataset = dataset.map([text.Lookup(vocab, unknown_token = '<unk>')], 'text')
    pad_value = vocab.tokens_to_ids('<pad>')
    dataset = dataset.map([transforms.PadEnd([max_len], pad_value)], 'text')

    # "label": class id -> one-hot float32 vector
    dataset = dataset.map([transforms.OneHot(num_classes=num_classes)], 'label')
    dataset = dataset.map([transforms.TypeCast(mindspore.float32)], 'label')

    return dataset.batch(batch_size, drop_remainder = drop_remainder)



# Preprocess the whole dataset; batch_size=16 overrides data_process's default of 64.
dataset_process = data_process(dataset, tokenizer=tokenizer, vocab = vocab, batch_size=16)
# print("hi")

# for x in dataset_p.create_dict_iterator():
#     print("Done!")

# dataset_process = data_process(dataset, tokenizer, vocab) # use the defaults for the remaining parameters
# dataset_process = process('imdb', dataset, tokenizer=tokenizer, vocab = vocab, max_len = 1000, drop_remainder = True)

In [16]:
# iter_t = dataset_process.create_dict_iterator()
# print(type(iter_t))
# print(next(iter_t))

In [17]:
# Split the dataset into train / test with sizes [0.7, 0.3].
# dataset_process has already been batched by data_process, so the split is over batches.
train_dataset, test_dataset = dataset_process.split([0.7, 0.3])
# print(train_dataset.shape)
# print(type(dataset_process))
# train_iter = train_dataset.create_dict_iterator()
# train_iter = iter(train_dataset)
# train_dataset_p = data_process(train_dataset, tokenizer=tokenizer, vocab = vocab)
# test_dataset_p = data_process(test_dataset, tokenizer=tokenizer, vocab = vocab)

In [18]:
# train_iter = train_dataset.create_dict_iterator()
# print(type(train_iter))
# for x in dataset.create_dict_iterator():
#     print(x)

In [19]:
# Notebook display of the train split object.
train_dataset

<mindspore.dataset.engine.datasets.TakeDataset at 0x2547f159f48>

In [20]:
# 构建网络
import math

from mindspore import nn
from mindspore import ops
from mindspore.common.initializer import Uniform, HeUniform
from mindnlp.abc import Seq2vecModel

class Head(nn.Cell):
    """
    Classification head that maps the encoder's hidden state to class logits.

    The head returns raw (unnormalized) scores. It is trained with
    nn.CrossEntropyLoss, which applies softmax to its input itself; applying
    nn.Softmax here as well squashes the scores into [0, 1] before the loss
    and keeps the loss close to ln(num_classes).

    Args:
        hidden_dim: hidden size of one direction; the Dense layer takes
            ``hidden_dim * 2`` input features.
        output_dim: number of output classes.
        dropout: dropout setting; ``1 - dropout`` is forwarded to nn.Dropout.
    """
    def __init__(self, hidden_dim, output_dim, dropout):
        super().__init__()
        weight_init = HeUniform(math.sqrt(5))
        bias_init = Uniform(1 / math.sqrt(hidden_dim * 2))
        self.fc = nn.Dense(hidden_dim * 2, output_dim, weight_init=weight_init, bias_init=bias_init)
        # NOTE(review): assumes nn.Dropout's first positional argument is the
        # keep probability — confirm for the installed MindSpore version.
        self.dropout = nn.Dropout(1 - dropout)

    def construct(self, context):
        """
        Return class logits for ``context``.

        ``context`` is indexed as a 3-D tensor; its last two slices along
        axis 0 are concatenated along axis 1 (presumably the final forward
        and backward hidden states of the bidirectional LSTM — TODO confirm).
        """
        context = ops.concat((context[-2, :, :], context[-1, :, :]), axis=1)
        context = self.dropout(context)
        # No softmax here: the loss function normalizes the logits.
        return self.fc(context)


class Classification(Seq2vecModel):
    """
    Text classifier: runs the encoder and feeds its hidden state to the head.
    """
    def __init__(self, encoder, head):
        super().__init__(encoder, head)
        self.encoder = encoder
        self.head = head

    def construct(self, text):
        # The encoder result is unpacked as a 3-tuple whose middle element is
        # a pair; only the first item of that pair is passed on to the head.
        _first, (hidden, _second), _third = self.encoder(text)
        return self.head(hidden)

In [21]:
from mindnlp.modules import RNNEncoder
from mindnlp.engine.metrics import Accuracy
from mindnlp.engine.trainer import Trainer


# Build the network: GloVe embedding + LSTM encoder, followed by the classification head.
# The LSTM input size 100 matches the 100 passed to Glove.from_pretrained.
lstm_layer = nn.LSTM(100, hidden_size, num_layers=num_layers, batch_first=True,
                     dropout=drop, bidirectional=bidirectional)
encoder = RNNEncoder(embedding, lstm_layer)
head = Head(hidden_size, output_size, drop)

net = Classification(encoder, head)
loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(net.trainable_params(), learning_rate=lr)

# define metrics
metric = Accuracy()

# define trainer
trainer = Trainer(network=net, train_dataset = train_dataset, eval_dataset = test_dataset, metrics=metric,
                  epochs=5, loss_fn=loss, optimizer=optimizer)
# Train for 5 epochs using the "label" column as the target, with jit turned off.
trainer.run(tgt_columns="label", jit=False)
print("end train")

Epoch 0:   7%|▋         | 57/875 [08:12<1:57:44,  8.64s/it, loss=2.997643] 







KeyboardInterrupt: 

In [None]:
# Scratch cell: fetch the first batch of the train split.
# NOTE(review): the iterator created on the first line is discarded; only the second line reads a batch.
train_dataset.create_dict_iterator()
next(train_dataset.create_dict_iterator())

In [None]:
# Scratch cell: build the path of one sample file — the first entry that
# os.listdir returns inside the first entry that os.listdir returns under PATH.
# NOTE(review): this path has no "NLP" component, unlike the path passed to
# News20 earlier in this file — confirm which one is correct.
PATH = r"G:\_CQU\3-1\自然语言处理\实验\实验二\news20\20_newsgroup"
LABELS = os.listdir(PATH)
ALT = os.path.join(PATH,LABELS[0])
TEXT_PATH = os.path.join(ALT,os.listdir(ALT)[0])
TEXT_PATH

In [None]:
# Scratch cell: read one raw article and drop its header (everything before
# the first blank line). The file is opened in a `with` block so the handle
# is closed as soon as the content has been read.
# NOTE(review): this rebinds the global name `text`, which earlier in this
# file refers to the `mindspore.dataset.text` module used by data_process;
# presumably data_process cannot be re-run after this cell — confirm.
with open(TEXT_PATH, "r", encoding="latin-1") as file:
    text = file.read()
begin = text.find('\n\n') # skip header
if 0 < begin:
    text = text[begin:]
text

