# B站：神奇的布欧
# 微信：l1243278923

In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import numpy as np
from sklearn.model_selection import train_test_split
import re
import jieba
from collections import Counter

# 0. 判断GPU是否可用

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 1. 数据预处理

In [3]:
# 数据路径
dataPath = "./chineseComment/"
# 路径拼接
goodFile = dataPath + "good.txt"
badFile = dataPath + "bad.txt"

In [4]:
# 过滤标点符号 函数
def filter_punc(sentence):
    sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\'“”《》?“]+|[+——！，。？、~@#￥%……&*（）：]+", "", sentence)
    return (sentence)

In [5]:
def prepareData(good_file, bad_file, is_filter=True):
    all_words = list()
    pos_sentences = list()
    neg_sentences = list()
    with open(good_file, 'r', encoding='utf-8') as f_goog:
        for index, line in enumerate(f_goog):
            if is_filter:
                line = filter_punc(line)
            words = jieba.lcut(line)
            if len(words) > 0:
                all_words += words
                pos_sentences.append(words)
    print(f"{good_file}包函{index + 1}条数据, {len(all_words)}个词语。")
    count = len(all_words)
    with open(bad_file, 'r', encoding='utf-8') as f_bad:
        for index, line in enumerate(f_bad):
            if is_filter:
                line = filter_punc(line)
            words = jieba.lcut(line)
            if len(words) > 0:
                all_words += words
                neg_sentences.append(words)
    print(f"{bad_file}包函{index + 1}条数据, {len(all_words) - count}个词语。")
    diction = dict()
    cnt = Counter(all_words)
    for word, freq in cnt.items():
        diction[word] = [len(diction), freq]
    print("字典大小：", len(diction))
    return pos_sentences, neg_sentences, diction

In [6]:
# word2index
def word2index(word, diction):
    if word in diction:
        value = diction[word][0]
    else:
        value = -1
    return value


# index2word
def index2word(index, diction):
    for w, v in diction.items():
        if v[0] == index:
            return w
    return None

In [7]:
pos_sentences, neg_sentences, diction = prepareData(goodFile, badFile, True)
st = sorted([(v[1], w) for w, v in diction.items()])
st

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\刘玉昆\AppData\Local\Temp\jieba.cache
Loading model cost 0.310 seconds.
Prefix dict has been built successfully.


./chineseComment/good.txt包函8089条数据, 100839个词语。
./chineseComment/bad.txt包函5076条数据, 56070个词语。
字典大小： 7135


[(1, '---'),
 (1, '------'),
 (1, '000000000'),
 (1, '0000000000000000'),
 (1, '00000000000000000'),
 (1, '0000000000000000000000'),
 (1, '000000000000000000000000000'),
 (1, '00000000000000000000000000000000000000000000000'),
 (1, '1006'),
 (1, '111'),
 (1, '1111111'),
 (1, '11111111111'),
 (1, '11111111111111'),
 (1, '111111111111111111111111'),
 (1, '11111111111111111111111111111'),
 (1, '111111111111111111111111111111111'),
 (1, '1111111111111111111111111111111111111111111'),
 (1, '115'),
 (1, '122'),
 (1, '12315'),
 (1, '123456'),
 (1, '1255888'),
 (1, '128'),
 (1, '130'),
 (1, '136'),
 (1, '138'),
 (1, '15548'),
 (1, '158105'),
 (1, '160'),
 (1, '16067cm'),
 (1, '165140'),
 (1, '165cm'),
 (1, '166cm68kgxl'),
 (1, '16860'),
 (1, '170CM55'),
 (1, '170M'),
 (1, '170cm75kg'),
 (1, '171819202122'),
 (1, '172'),
 (1, '172153'),
 (1, '175cm'),
 (1, '1805'),
 (1, '180xxl'),
 (1, '183'),
 (1, '185'),
 (1, '1852XL'),
 (1, '185XXL'),
 (1, '185mm'),
 (1, '200000000'),
 (1, '20161218'),
 (1, 

In [8]:
def sentence2vec(sentence, diction):
    vector = np.zeros(len(diction))
    for l in sentence:
        vector[l] += 1
    return (1.0 * vector / len(sentence))

In [9]:
dataset = list()
labels = list()
sentences = list()
for sentence in pos_sentences:
    new_sentence = list()
    for pos_word in sentence:
        if pos_word in diction:
            new_sentence.append(word2index(pos_word, diction))
    dataset.append(sentence2vec(new_sentence, diction))
    labels.append(0)
    sentences.append(sentence)
for sentence in neg_sentences:
    new_sentence = list()
    for neg_word in sentence:
        if neg_word in diction:
            new_sentence.append(word2index(neg_word, diction))
    dataset.append(sentence2vec(new_sentence, diction))
    labels.append(1)
    sentences.append(sentence)

In [10]:
# 数据集划分
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.1, random_state=56)

In [12]:
# 转换为张量
train_xt = torch.Tensor(X_train).to(dtype=torch.float32)
train_yt = torch.Tensor(y_train).to(dtype=torch.long)
test_xt = torch.Tensor(X_test).to(dtype=torch.float32)
test_yt = torch.Tensor(y_test).to(dtype=torch.long)

  train_xt = torch.Tensor(X_train).to(dtype=torch.float32)


In [15]:
# 数据集加载器
train_data = Data.TensorDataset(train_xt, train_yt)

In [16]:
train_loader = Data.DataLoader(dataset=train_data,
                               batch_size=32,
                               shuffle=True,
                               drop_last=True)

# 2. 搭建网络模型

In [17]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(len(diction), 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
            nn.Softmax(dim=1)
        )
    def forward(self, x):
        return self.fc(x)

In [18]:
model = Model().to(device)

In [19]:
model

Model(
  (fc): Sequential(
    (0): Linear(in_features=7135, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=2, bias=True)
    (5): Softmax(dim=1)
  )
)

# 3. 训练

In [20]:
def train():
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fun = nn.CrossEntropyLoss()
    model.train()
    for epoch in range(10):
        for step, (x, y) in enumerate(train_data):
            out = model(torch.unsqueeze(x,dim=0).to(device))
            loss = loss_fun(torch.squeeze(out.to("cpu")), y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        if epoch % 1 == 0:
            print(f"epoch:{epoch}, 损失：{loss.item()}")
    torch.save(model, "./7.中文情绪分类.model")

In [21]:
train()

epoch:0, 损失：0.34375816583633423
epoch:1, 损失：0.314322829246521
epoch:2, 损失：0.31333956122398376
epoch:3, 损失：0.3132733404636383
epoch:4, 损失：0.31326383352279663
epoch:5, 损失：0.3132621645927429
epoch:6, 损失：0.31326183676719666
epoch:7, 损失：0.3132617473602295
epoch:8, 损失：0.31326165795326233
epoch:9, 损失：0.31326165795326233


In [22]:
@torch.no_grad()
def test(test_xt):
    model = torch.load("./7.中文情绪分类.model").to("cpu")
    model.eval()
    out = model(test_xt)
    return out

In [23]:
predit = test(test_xt)

In [24]:
countTrue = 0
for i in range(len(test_yt)):
    if torch.argmax(predit[i]) == test_yt[i]:
        countTrue += 1
print("准确率：" + str(countTrue / len(test_yt)))

准确率：0.9118098159509203
