In [1]:
import pandas as pd
import jieba
import re
import json

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm.auto import tqdm


from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
# Path of the labelled corpus to load.
# Alternative corpus: "./ChnSentiCorp_htl_all.csv"
data = "./weibo_senti_100k.csv"

pd_all = pd.read_csv(data, encoding='utf-8')

# Partition the rows by their `label` value (1 -> positive frame, 0 -> negative frame).
pd_positive = pd_all.loc[pd_all["label"] == 1]
pd_negative = pd_all.loc[pd_all["label"] == 0]

def get_balance_corpus(corpus_size, corpus_pos, corpus_neg, random_state=None):
    """Build a class-balanced corpus by drawing the same number of rows from each class.

    Args:
        corpus_size: Target total number of rows. Each class contributes
            ``corpus_size // 2`` rows, so an odd value yields one row fewer.
        corpus_pos: DataFrame holding the positive rows (must have a ``label`` column).
        corpus_neg: DataFrame holding the negative rows (must have a ``label`` column).
        random_state: Optional seed (or numpy RandomState/Generator) that makes the
            sampling reproducible. ``None`` keeps the original unseeded behaviour.

    Returns:
        A DataFrame with the positive sample followed by the negative sample.
        Original index values are kept, so the index may contain duplicates.
    """
    sample_size = corpus_size // 2

    rng = random_state
    if isinstance(random_state, int):
        # Share one random stream between both draws so the two classes do not
        # receive identical row positions from the same integer seed.
        import numpy as np
        rng = np.random.RandomState(random_state)

    def _draw(frame):
        # Sample with replacement only when the class has fewer rows than requested.
        return frame.sample(sample_size, replace=frame.shape[0] < sample_size, random_state=rng)

    pd_corpus_balance = pd.concat([_draw(corpus_pos), _draw(corpus_neg)])

    # Report the total, positive and negative row counts.
    print('评论数目（总体）：%d' % pd_corpus_balance.shape[0])
    print('评论数目（正向）：%d' % pd_corpus_balance[pd_corpus_balance.label==1].shape[0])
    print('评论数目（负向）：%d' % pd_corpus_balance[pd_corpus_balance.label==0].shape[0])

    return pd_corpus_balance

# Draw a 100000-row balanced corpus from the two label-specific frames.
pd_all_balance = get_balance_corpus(
    corpus_size=100000,
    corpus_pos=pd_positive,
    corpus_neg=pd_negative,
)

# Show ten random rows as a quick sanity check.
print(pd_all_balance.sample(n=10))

评论数目（总体）：100000
评论数目（正向）：50000
评论数目（负向）：50000
       label                                             review
31245      1  羡慕、嫉妒不恨！[嘻嘻] //@红星闪啊闪啊闪:哇塞，帅 //@旅伴箩箩-罗军: @奥神刘来...
36894      1               来呀来呀[可爱][可爱] //@Beryl-冰sunshine:转发微博
11404      1  阿布吉目前倒是定期照，可是十几岁以后粑粑就要抱不动了吧[哈哈] //@蜜思李珊珊珊:有女儿的...
17791      1                 这个可以有//@-罗大宁宁的小学妹er--: 是不是傻哈哈哈[哈哈]
87170      0  回复@柳树家的:我发现是瓶盖没拧好，斜着拧上的，包是黑的，倒是看不出来，主要是包里有条围巾，...
23777      1  [哈哈]开心！@我是王大磊大力相助解决了票务问题！孙氏剪纸捧回家来！特别隆重值得纪录：前日收...
3383       1  好主意[鼓掌]看来等我女儿长大了也送套厨房算了，房价实在太高了@xin_may[偷笑][偷笑...
39365      1                              有缘下次还会再见的[握手][鼓掌][鼓掌]
12006      1                        留个备用！[嘻嘻] //@红星闪闪大黑豆来了:转发微博
23536      1                                           [嘻嘻]好高大上


In [3]:

def clean_text(text):
    """Normalise one raw review into plain word characters and spaces.

    Steps: drop HTML-like tags, drop carriage returns / newlines / backslashes /
    ASCII digits, then drop every remaining character that is neither a word
    character nor whitespace (i.e. punctuation and symbols).

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned string ('' for missing input).
    """
    # A missing cell would otherwise be stringified into the literal words
    # 'None' / 'nan' and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Make sure we operate on a string.
    text = str(text)
    # Remove HTML tags.
    text = re.sub(r'<.*?>', '', text)
    # Remove line breaks, backslashes and digits. Inside a character class '|'
    # is a literal, not alternation, so it is not needed here; pipes are still
    # removed by the punctuation pass below.
    text = re.sub(r'[\r\n\\0-9]', '', text)
    # Remove punctuation / symbols.
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Chinese word segmentation
def chinese_tokenization(text):
    """Segment `text` with jieba and return the tokens joined by single spaces."""
    tokens = jieba.cut(text)
    return " ".join(tokens)

# Stop-word removal
def remove_stopwords(text, stopword_set=None):
    """Drop stop words from a whitespace-separated token string.

    Args:
        text: Tokens separated by whitespace.
        stopword_set: Optional collection of words to remove. When omitted, the
            module-level ``stopwords`` is looked up at call time, so that global
            must exist before the first call that relies on the default.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        # Backward-compatible default: fall back to the notebook-level global.
        stopword_set = stopwords
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Step 1: strip markup, digits and punctuation from every review.
pd_all_balance['review'] = pd_all_balance['review'].apply(clean_text)

# Step 2: segment each review into space-separated tokens.
pd_all_balance['review'] = pd_all_balance['review'].apply(chinese_tokenization)

# Load the stop-word list. The context manager closes the file handle, which the
# previous bare open(...).read() left open.
with open('cn_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopwords = set(stopword_file.read().split())

# Step 3: drop stop words (remove_stopwords reads the `stopwords` set defined above).
pd_all_balance['review'] = pd_all_balance['review'].apply(remove_stopwords)

print(pd_all_balance.sample(10))





Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\YetGirt\AppData\Local\Temp\jieba.cache
Loading model cost 0.469 seconds.
Prefix dict has been built successfully.


       label                                             review
45634      1  嘻嘻 食神 咖喱 牛腩 鲍鱼 金汤 牛腩 面 冰火 岛 深海 龙虾 手 擀 凉面 威武 三款...
40931      1                                           接下去 鼓掌 力
20069      1  诬蔑 中 吴 晨光 中大 谈乱 爱 何江涛 中大 女生 富裕 一个 嘻嘻 光头 三爷 难 清...
57548      1                               可爱 柳絮 满天飞 应该 看看 耶 哈哈
91149      0                                   拜拜 北京 泪 首都 模式 开启
36513      1  芒果 免费送 澳洲 太 开心 阳光 海岸边 想 慕容 小妖精 mmno 许磊 人注 MS 贝...
53481      1  帅 QPlus 产品 团队 Q 壁纸 库 精美壁纸 心情 更换 w 网址 导航 WEB Q ...
14825      1  哈哈 谢谢 支持 爱 Nicole Gu 世界 金陵 豪包 高大 嘻嘻 嘻嘻 黄浩俊 How...
89452      0               衰 北京 知道 北京 事儿 跑 水 地儿 燕莎 桥下 目前 对主路 影响
61447      0              一天 屎尿 屁 没干 为啥 一个 辣 生物 会 蕴藏 辣 丰富 泪泪 泪泪


In [25]:

# Vocabulary size, padded sequence length and minimum word frequency settings.
# NOTE(review): `min_word_frequency` is only referenced by the commented-out
# filter further down; it has no effect on the active code.
vocab_size = 5000
max_sequence_length = 16
min_word_frequency = 10


# Fit a Keras tokenizer on the preprocessed (space-separated) reviews.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(pd_all_balance['review'])

# # Compute each word's frequency and filter out low-frequency words
# filtered_words = {word: index for word, index in tokenizer.word_index.items() if tokenizer.word_counts[word] >= min_word_frequency}
# # Update the tokenizer's vocabulary
# tokenizer.word_index = filtered_words
# tokenizer.num_words = len(filtered_words)

# Fetch the word -> index mapping.
# NOTE(review): this is the tokenizer's full mapping (the recorded run printed
# 189005 entries), not just `vocab_size` words. Presumably texts_to_sequences
# only emits indices below `num_words` - confirm against the Keras docs before
# reusing word_index.json for inference.
word_index = tokenizer.word_index

# Save the mapping to a JSON file.
with open('word_index.json', 'w', encoding='utf-8') as f:
    json.dump(word_index, f, ensure_ascii=False, indent=4)

print("Word index saved to word_index.json")

# Convert every review into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(pd_all_balance['review'])

print('词汇量：%d' % len(tokenizer.word_index))

# Pad (with trailing zeros) or cut every sequence to `max_sequence_length`.
# NOTE: this rebinds `data`, which held the CSV path in the loading cell.
data = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Split into training and validation sets (80/20, fixed seed).
labels = pd_all_balance['label'].values
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

print('训练集大小：%d' % len(X_train))
print(X_val[:3])
print(y_val[:15])



Word index saved to word_index.json
词汇量：189005
训练集大小：80000
[[  56  496 1982    3   64    0    0    0    0    0    0    0    0    0
     0    0]
 [  15  593   69 2688  232 1319  117  206 1496   36  136 1316 1433   16
    29   16]
 [  71    6    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
[0 0 1 0 0 0 0 0 0 0 1 1 1 1 1]


In [26]:

class TextDataset(Dataset):
    """Map-style dataset pairing integer token sequences with float labels."""

    def __init__(self, X, y):
        """Convert the inputs to tensors once: `X` as int64, `y` as float32."""
        features = torch.tensor(X, dtype=torch.long)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X, self.y = features, targets

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return the `(sequence, label)` pair stored at position `idx`."""
        sample = (self.X[idx], self.y[idx])
        return sample

# Wrap both splits as datasets.
train_dataset, val_dataset = TextDataset(X_train, y_train), TextDataset(X_val, y_val)

# Batch them: only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)


In [27]:

class TransformerModel(nn.Module):
    """Transformer-encoder text classifier over integer token sequences.

    Pipeline: token embedding + fixed sinusoidal positional encoding ->
    stacked ``nn.TransformerEncoderLayer`` -> mean over the sequence axis ->
    linear head producing ``output_dim`` raw logits per sample.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding / model width (must be divisible by ``num_heads``).
        num_heads: Attention heads per encoder layer.
        hidden_dim: Width of each layer's feed-forward sub-network.
        num_layers: Number of stacked encoder layers.
        output_dim: Size of the output logit vector.
        dropout: Dropout rate inside the encoder layers.
        max_len: Longest sequence the positional table supports (default 16,
            the value that used to be hard-coded).

    Note:
        The encoder is built with ``batch_first=True`` because ``forward``
        feeds it ``(batch, seq, embed)`` tensors. With the PyTorch default
        (``batch_first=False``) the batch axis was interpreted as the sequence
        axis, so attention mixed tokens of *different samples*. Parameter
        shapes are unchanged, so older state dicts still load, but they were
        optimised under the old layout and should be retrained.
        Padding positions are not masked; they take part in attention and in
        the mean pooling.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, num_layers, output_dim, dropout=0.2, max_len=16):
        super(TransformerModel, self).__init__()
        self.embed_size = embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = self.create_positional_encoding(max_len=max_len)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_size,
                nhead=num_heads,
                dim_feedforward=hidden_dim,
                dropout=dropout,
                batch_first=True,  # inputs are (batch, seq, embed)
            ),
            num_layers=num_layers
        )
        self.fc = nn.Linear(embed_size, output_dim)

    def create_positional_encoding(self, max_len):
        """Build the fixed sinusoidal table of shape ``(1, max_len, embed_size)``.

        Even columns hold sines and odd columns hold cosines of
        ``position / 10000 ** (i / embed_size)``. The table is returned as a
        non-trainable ``nn.Parameter`` so it moves with the module and is
        stored in its state dict.
        """
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        i = torch.arange(0, self.embed_size, 2).float()
        angle_rates = 1 / torch.pow(10000, (i / self.embed_size))
        pos_encoding = torch.zeros(max_len, self.embed_size)
        pos_encoding[:, 0::2] = torch.sin(pos * angle_rates)
        # For an odd embed_size there is one fewer odd column than even columns,
        # so trim the cosine block to fit (a no-op for even sizes).
        pos_encoding[:, 1::2] = torch.cos(pos * angle_rates)[:, : self.embed_size // 2]
        pos_encoding = pos_encoding.unsqueeze(0)
        return nn.Parameter(pos_encoding, requires_grad=False)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for token ids ``x``.

        ``x`` is an integer tensor of shape ``(batch, seq_len)`` with
        ``seq_len`` no larger than the ``max_len`` used at construction.
        """
        x = self.embed(x) + self.positional_encoding[:, :x.size(1)]
        x = self.transformer(x)
        # Average over the sequence axis to get one vector per sample.
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [7]:

def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, device):
    """Train a single-logit binary classifier and validate it after every epoch.

    For each epoch the function runs one optimisation pass over
    ``train_loader``, evaluates on ``valid_loader`` without gradients, prints
    the mean batch loss and accuracy of both phases, writes the weights to
    ``model_{epoch}.pth`` and additionally to ``best_model.pth`` whenever the
    validation loss improves.

    Args:
        model: Module mapping a batch of inputs to logits of shape ``(batch, 1)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        valid_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss taking ``(logits, labels)`` of identical shape
            (the notebook uses ``nn.BCEWithLogitsLoss``).
        optimizer: Optimiser bound to ``model``'s parameters.
        num_epochs: Number of passes over the training data.
        device: Device the batches are moved to; ``model`` must already be on it.

    Returns:
        None. Progress is printed and checkpoints are written to the working directory.
    """
    best_valid_loss = float('inf')

    for epoch in range(num_epochs):

        model.train()
        train_loss = 0
        train_correct = 0
        total_train = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

        for inputs, labels in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            # Squeeze only the trailing logit axis: a bare squeeze() would also
            # drop the batch axis of a size-1 batch and break the loss shape check.
            logits = outputs.squeeze(-1)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Threshold the sigmoid probability at 0.5 to get the predicted class.
            predicted = torch.sigmoid(logits) > 0.5
            train_correct += (predicted == labels).sum().item()
            total_train += labels.size(0)

            progress_bar.set_postfix({'train_loss': loss.item()})

        # Validation pass: no gradient tracking, dropout disabled.
        model.eval()
        valid_loss = 0
        valid_correct = 0
        total_valid = 0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                logits = outputs.squeeze(-1)
                loss = criterion(logits, labels)
                valid_loss += loss.item()
                predicted = torch.sigmoid(logits) > 0.5
                valid_correct += (predicted == labels).sum().item()
                total_valid += labels.size(0)

        # Losses are reported as the mean of the per-batch losses.
        valid_loss /= len(valid_loader)
        train_loss /= len(train_loader)

        train_acc = (train_correct / total_train) * 100
        valid_acc = (valid_correct / total_valid) * 100

        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f} %,\
              Valid Loss: {valid_loss:.3f}, Valid Acc: {valid_acc:.2f} %')

        # Keep a checkpoint of every epoch.
        torch.save(model.state_dict(), f'model_{epoch+1}.pth')

        # Keep the weights with the lowest validation loss seen so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print('Best model saved')


# Architecture hyper-parameters.
embed_size = 256
num_heads = 4
hidden_dim = 512
num_layers = 4
output_dim = 1

# Optimisation hyper-parameters.
num_epochs = 10
learning_rate = 1e-4
weight_decay = 1e-4


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = TransformerModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=output_dim,
)
model.to(device)

# Loss function and optimiser.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Start training.
train_model(
    model=model,
    train_loader=train_loader,
    valid_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device,
)


cpu


Epoch 1/10:   0%|          | 0/1250 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [40]:

embed_size = 256
num_heads = 4
hidden_dim = 512
num_layers = 4
output_dim = 1
num_epochs = 10

# text = ""

# path = "./model_2.pth"
# path = "./best_model.pth"
path = "./model_3.pt"

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
checkpoint = torch.load(path, map_location='cpu')
if isinstance(checkpoint, nn.Module):
    # The file holds a whole pickled model object.
    model = checkpoint
else:
    # The file holds a state dict (what train_model writes): rebuild the
    # architecture first, then load the weights into it.
    model = TransformerModel(vocab_size, embed_size, num_heads, hidden_dim, num_layers, output_dim)
    model.load_state_dict(checkpoint)
model.eval()

# Read one comment per line; blank lines are skipped so they are not scored.
with open('comments.txt', 'r', encoding='utf-8') as file:
    comments = [line.strip() for line in file if line.strip()]

# Collected (comment, probability, sentiment label) tuples.
results = []

# Preprocess and score every comment. Gradients are not needed for inference.
with torch.no_grad():
    for comment in comments:
        clean_comment = clean_text(comment)
        tokenized_comment = chinese_tokenization(clean_comment)
        final_comment = remove_stopwords(tokenized_comment)
        # print("final_comment:",final_comment)

        # Dedicated names keep the training cell's `sequences` / `data` intact.
        comment_sequences = tokenizer.texts_to_sequences([final_comment])
        # print("sequences:",comment_sequences)

        # Reuse the training-time sequence length instead of a hard-coded 16.
        comment_batch = pad_sequences(comment_sequences, maxlen=max_sequence_length, padding='post')
        comment_tensor = torch.tensor(comment_batch, dtype=torch.long)

        print("data:", comment_tensor)

        # Run the model and turn the logit into a probability / class.
        result = model(comment_tensor)
        probabilities = torch.sigmoid(result)
        predicted_classes = (probabilities >= 0.5).int()

        # Store the result.
        results.append((comment, probabilities.item(), '积极' if predicted_classes.item() == 1 else '消极'))

# Print the results.
for comment, prob, sentiment in results:
    print(f"评论: {comment}\n情感概率: {prob:.3f}\n情感分类: {sentiment}\n")

# Save the results to a file.
with open('comments_with_sentiments.txt', 'w', encoding='utf-8') as file:
    for comment, prob, sentiment in results:
        file.write(f"评论: {comment}\n情感概率: {prob:.3f}\n情感分类: {sentiment}\n\n")


# clean = clean_text(text)

# print("清理后：" ,clean)

# tokenized = chinese_tokenization(clean)

# print("分词后：" ,tokenized)

# final_text = remove_stopwords(tokenized)

# print("去除停用词：" ,final_text)

# sequences = tokenizer.texts_to_sequences([final_text])
# data = pad_sequences(sequences, maxlen=32, padding='post')

# print("数字序列：",data)

# result = model(torch.tensor(data))
# probabilities = torch.sigmoid(result)
# predicted_classes = (probabilities >= 0.5).int()
# # print("probabilities:",0.973)
# # print("predicted_classes:","积极")


data: tensor([[2300,  865,   80,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
data: tensor([[1118,  247,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]])
data: tensor([[178,  36, 585,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]])
data: tensor([[585,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0]])
评论: 优秀的女孩[赞]
情感概率: 0.512
情感分类: 积极

评论: 中专女生，那她得多努力才行[泪]
情感概率: 0.536
情感分类: 积极

评论: 我的天哪，这个是真的很厉害
情感概率: 0.525
情感分类: 积极

评论: 好厉害
情感概率: 0.518
情感分类: 积极



In [25]:
# import gradio as gr

# embed_size = 256
# num_heads = 4
# hidden_dim = 512
# num_layers = 4
# output_dim = 1
# num_epochs = 10
# learning_rate = 1e-4
# weight_decay = 1e-4


# path = 'model_3.pth'
# model = TransformerModel(vocab_size, embed_size, num_heads, hidden_dim, num_layers, output_dim)
# # model.load_state_dict(torch.load(path, map_location='cpu'))
# model.eval()


# def predict(text):
#     clean = clean_text(text)
#     tokenized = chinese_tokenization(clean)
#     final_text = remove_stopwords(tokenized)
#     sequences = tokenizer.texts_to_sequences([final_text])
#     data = pad_sequences(sequences, maxlen=16, padding='post')
#     result = model(torch.tensor(data))
#     probabilities = torch.sigmoid(result)
#     sentiment = "积极" if probabilities >= 0.5 else "消极"
#     return {"Sentiment": sentiment, "Probability": float(probabilities)}

# iface = gr.Interface(
#     fn=predict,
#     inputs=gr.inputs.Textbox(lines=4, placeholder="Enter Text Here..."),
#     outputs=[
#         gr.outputs.Label(num_top_classes=2)
#     ],
#     title="情感分析",
#     description="输入你的句子，我们将预测它的情感。"
# )

# iface.launch()

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",
  optional=optional,
  optional=optional,
  "Usage of gradio.outputs is deprecated, and will not be supported in the future, please import your components from gradio.components",
  super().__init__(num_top_classes=num_top_classes, type=type, label=label)


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




In [14]:
# import tkinter as tk
# from tkinter import filedialog,font

# embed_size = 256
# num_heads = 8
# hidden_dim = 1024
# num_layers = 4
# output_dim = 1
# num_epochs = 10
# learning_rate = 5e-5
# weight_decay = 1e-5


# def load_model(path):
#     model = TransformerModel(vocab_size, embed_size, num_heads, hidden_dim, num_layers, output_dim)
#     model.load_state_dict(torch.load(path, map_location='cpu'))
#     model.eval()
#     return model

# def infer():
#     # 获取输入并进行预处理
#     raw_text = text_entry.get()
#     clean = clean_text(raw_text)

#     print(clean)

#     tokenized = chinese_tokenization(clean)
#     final_text = remove_stopwords(tokenized)

#     print(final_text)

#     sequences = tokenizer.texts_to_sequences([final_text])
#     data = pad_sequences(sequences, maxlen=16, padding='post')

#     print(data)

#     if model:
#         result = model(torch.tensor(data))
#         probabilities = torch.sigmoid(result)
#         predicted_classes = (probabilities >= 0.5).int()
#         result_label.config(text=f"Probability: {probabilities.item():.3f}, Predicted Class: {predicted_classes.item()}")

#         print("result: ", result)

#         print(f"Probability: {probabilities.item():.3f}, Predicted Class: {predicted_classes.item()}")
        
#     else:
#         result_label.config(text="Please load a model first.")


# # 创建主窗口
# root = tk.Tk()
# root.title("Sentiment Analysis")
# root.configure(bg='#f0f0f0')  # 设置背景颜色

# # 定义一些美化的样式
# large_font = font.Font(family="Helvetica", size=14, weight="bold")
# button_font = font.Font(family="Helvetica", size=12)
# label_font = font.Font(family="Helvetica", size=12)

# # 创建一个文本输入框
# text_entry = tk.Entry(root, font=large_font, width=50)
# text_entry.pack(pady=20)

# # 选择模型文件
# def load_model_dialog():
#     filepath = filedialog.askopenfilename()
#     model_label.config(text=f"模型: {filepath.split('/')[-1]}")
#     global model
#     model = load_model(filepath)

# model_button = tk.Button(root, text="加载模型", command=load_model_dialog, font=button_font, bg='#dfe3ee', fg='black')
# model_button.pack(pady=10)

# model_label = tk.Label(root, text="No Model", font=label_font, bg='#f0f0f0', fg='red')
# model_label.pack(pady=5)

# infer_button = tk.Button(root, text="分析情感", command=infer, font=button_font, bg='#dfe3ee', fg='black')
# infer_button.pack(pady=10)

# # 显示结果的标签
# result_label = tk.Label(root, text="", font=label_font, bg='#f0f0f0', fg='blue')
# result_label.pack(pady=5)

# # 运行主循环
# root.mainloop()

Exception in Tkinter callback
Traceback (most recent call last):
  File "d:\Programs\miniconda3\envs\d2l_stu\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "C:\Users\YetGirt\AppData\Local\Temp\ipykernel_21728\3875919683.py", line 70, in load_model_dialog
    model = load_model(filepath)
  File "C:\Users\YetGirt\AppData\Local\Temp\ipykernel_21728\3875919683.py", line 16, in load_model
    model.load_state_dict(torch.load(path, map_location='cpu'))
  File "d:\Programs\miniconda3\envs\d2l_stu\lib\site-packages\torch\nn\modules\module.py", line 1672, in load_state_dict
    self.__class__.__name__, "\n\t".join(error_msgs)))
RuntimeError: Error(s) in loading state_dict for TransformerModel:
	size mismatch for transformer.layers.0.linear1.weight: copying a param with shape torch.Size([512, 256]) from checkpoint, the shape in current model is torch.Size([1024, 256]).
	size mismatch for transformer.layers.0.linear1.bias: copying a param with shape torch.Size

摊上大事了 曹增辉 衰 衰 这个太悲剧了
摊上 大事 曹 增辉 衰 衰 太 悲剧
[[4307 2222   16   16    8 1449    0    0    0    0    0    0    0    0
     0    0]]
result:  tensor([[0.0291]], grad_fn=<AddmmBackward0>)
Probability: 0.507, Predicted Class: 1
微博电影送福利 已点赞可爱那些岁月我们一起爱着的中国好声音可爱记得中国好声音的节目热播期间选手的表现和特质成了周围很多同事特别关心的话题而我自己看完第一时间播出的节目后还不满足还看了很多重播那不仅仅是中国好声音更是中国很多被埋没的好声音凤凰涅的中国梦
微博 电影 送 福利 已点 赞 可爱 岁月 一起 爱着 中国 声音 可爱 记得 中国 声音 节目 热播 期间 选手 表现 特质 成 周围 很多 同事 特别 关心 话题 完 第一 时间 播出 节目 满足 很多 重播 不仅仅 中国 声音 更是 中国 很多 埋没 声音 凤凰 涅 中国 梦
[[1306  239 1798  130 4155 5880   27  501 1761   27  130  501 1371 5593
    27  472]]
result:  tensor([[0.1195]], grad_fn=<AddmmBackward0>)
Probability: 0.530, Predicted Class: 1
