In [2]:
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange, tqdm

In [3]:
# 数据预处理
vocab = {}
vocab['[PAD]'] = 0
# 参数设置
device = 'cuda' if torch.cuda.is_available() else 'cpu'
seq_len = 25  # 每句话的长度
embedding_size = 150  # word2vec的维度
kernel_num = 75  # 每一种卷积核的个数
Kernel_list = [2, 3, 4, 5,6]  # N-gram
class_num = 9  # 分类的个数
epoches = 100  #训练次数
lr = 0.001  # 学习率

In [4]:
#词向量分配（每一个词都对应一个数字）
def pro_vocab(path):
    with open(path, 'r', encoding='utf-8') as f:
        L = [e.strip() for e in f.readlines()]
        for news in L:
            news = news[2:]
            for word in news:
                if word not in vocab:
                    vocab[word] = len(vocab)

In [5]:
pro_vocab('train.txt')
pro_vocab('test.txt')

vocab_size = len(vocab)

In [6]:
def word2ids(text):  # 将数据转化成数字id 并多退少补
    ids = [vocab[word] for word in text]
    if len(ids) < seq_len:
        ids += [0] * (seq_len - len(ids))
        return ids
    else:
        return ids[:seq_len]

In [7]:
def load_data(path):  # 读取训练集和测试集
    with open(path, 'r', encoding='utf-8') as f:
        turples = [(int(sentence[0]), word2ids(sentence[2:].strip()))
                   for sentence in f.readlines()]
    labels = []
    texts = []
    for turple in turples:
        labels.append(turple[0])
        texts.append(turple[1])
    # 返回标签和经过预处理的文本
    return texts, labels

In [8]:
train_x, train_y = load_data('train.txt')
test_x, test_y = load_data('test.txt')
train_x = torch.LongTensor(train_x).to(device)
train_y = torch.LongTensor(train_y).to(device)
test_x = torch.LongTensor(test_x).to(device)
test_y = torch.LongTensor(test_y).to(device)

In [9]:
class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        V = vocab_size
        E = embedding_size
        Ci = 1  # 输入数据的通道数
        Co = kernel_num  # 每一种卷积核的数目
        Kl = Kernel_list  # N-gram
        C = class_num  #输出的维度

        self.embed = nn.Embedding(V, E)
        self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, E)) for K in Kl])
        self.fc = nn.Linear(len(Kl) * Co, C)

    def forward(self, x):
        x = self.embed(x)  # (N, seq_len, E)

        x = x.unsqueeze(1)  # (N, Ci = 1, seq_len, E)

        x = [F.relu(conv(x)).squeeze(3)
             for conv in self.convs]  # [(N, Co, seq_len-ki+1), ...]*len(Ks)

        x = [F.max_pool1d(i, i.size(2)).squeeze(2)
             for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        out = F.softmax(self.fc(x), dim=1)
        return out

In [None]:
model = TextCNN().to(device)  #实例化模型
criterion = nn.CrossEntropyLoss().to(device)  #定义损失函数，为交叉熵损失
optimizer = torch.optim.Adam(model.parameters(), lr=lr)  #使用随机梯度下降算法，学习率为lr

for epoch in trange(epoches): #训练过程
    pred = model(train_x)
    loss = criterion(pred, train_y)
    if (epoch + 1) % 100 == 0:
        print(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [10]:
pred_y = model(test_x)
L = pred_y.tolist()
total = len(L)
acc = 0
for i, pred in enumerate(L):
    max_index = pred.index(max(pred))
    if max_index == test_y[i]:
        acc += 1

print(acc / total * 100, '%')

  7%|█████▏                                                                        | 99/1500 [10:04<2:37:03,  6.73s/it]

1.6990107297897339


 13%|██████████▏                                                                  | 199/1500 [20:10<2:07:34,  5.88s/it]

1.5174521207809448


 20%|███████████████▎                                                             | 299/1500 [31:02<2:14:07,  6.70s/it]

1.5160112380981445


 27%|████████████████████▍                                                        | 399/1500 [40:52<1:43:23,  5.63s/it]

1.5155375003814697


 33%|█████████████████████████▌                                                   | 499/1500 [51:45<1:44:02,  6.24s/it]

1.5148073434829712


 40%|█████████████████████████████▉                                             | 599/1500 [1:01:25<1:21:40,  5.44s/it]

1.5146516561508179


 47%|██████████████████████████████████▉                                        | 699/1500 [1:10:47<1:08:17,  5.11s/it]

1.5145577192306519


 50%|█████████████████████████████████████▎                                     | 747/1500 [1:15:24<1:16:00,  6.06s/it]


KeyboardInterrupt: 

In [None]:
with open('text_4.txt', 'r', encoding='utf-8') as f:
        turples = [(int(sentence[0]), word2ids(sentence[2:].strip()))
                   for sentence in f.readlines()]
    labels = []
    texts = []
    for turple in turples:
        labels.append(turple[0])
        texts.append(turple[1])

In [17]:
import pandas as pd
import csv

In [20]:
f = pd.read_csv('test_4.txt',sep='\t',names=["编号", "内容"])
f

Unnamed: 0,编号,内容
0,3875,陈凯：致力盔甲工艺的文化传承
1,2999,预付卡资金“打水漂”？ 消费者可以这样维权
2,356,一键生成100个商标名称 阿里向社会免费开放AI商标注册机器人
3,483,比利时新增1580例新冠肺炎病例 累计确诊24983例
4,130,马来西亚星洲网：马来西亚中医药抗疫小组发布安全指南
...,...,...
595,2091,北京做好准备确保初高三如期开学 完善“一校一案”
596,1083,龚琳娜在云南深山采风 感慨民间声乐一辈子也学不完
597,2014,北约陷入“走钢丝”窘境
598,2445,云南省陇川县章凤镇曼农村突发山火
