In [1]:
import pandas as pd
import jieba
import re
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [2]:
data = "./weibo_senti_100k.csv"

writer = SummaryWriter(log_dir='logs')

pd_all = pd.read_csv(data, encoding='utf-8')

pd_positive = pd_all[pd_all.label==1]
pd_negative = pd_all[pd_all.label==0]

def get_balance_corpus(corpus_size, corpus_pos, corpus_neg):
    sample_size = corpus_size // 2
    pd_corpus_balance = pd.concat([corpus_pos.sample(sample_size, replace=corpus_pos.shape[0]<sample_size), \
                                   corpus_neg.sample(sample_size, replace=corpus_neg.shape[0]<sample_size)])
    
    print('评论数目（总体）：%d' % pd_corpus_balance.shape[0])
    print('评论数目（正向）：%d' % pd_corpus_balance[pd_corpus_balance.label==1].shape[0])
    print('评论数目（负向）：%d' % pd_corpus_balance[pd_corpus_balance.label==0].shape[0])    
    
    return pd_corpus_balance

pd_all_balance = get_balance_corpus(100000, pd_positive, pd_negative)

print(pd_all_balance.sample(10))

评论数目（总体）：100000
评论数目（正向）：50000
评论数目（负向）：50000
        label                                             review
17431       1                           [哈哈] 生日快乐！@偏心公公 @lilyli鑫
37773       1  [哈哈]争画家的指点到位，平常没怎么注意，这小子给了惊喜。 //@油画家Artistlee李...
27675       1  //@张新民: 回复@timesdance:不是不可以，凤凰炸豆干有好几种蘸料可选择，其中一...
42674       1  [微风]去春游？“趣”春游！#长峰童鞋#们，现在大家尽炫自我的机会到了！4月16日前，炫出你...
109202      0  我擦 //@风间小卒: @徐宝瑟  小心啦，担心的事情要发生了。 //@名犬杂志社:为什么五...
113735      0        //@张?://@小乖乖麻麻: 昨天去金山市场就看见这位老爷爷举牌再找，好可怜！[泪]
114564      0                           暹罗猫就是永远训不出来的节奏！！脾气忒大！[泪]
79963       0  回复@Seven快点实现梦想吧:?是才?。//@Seven快点实现梦想吧:蔡生我觉得赚钱好慢...
90862       0                           唉！这年头啊！现在滴小孩真滴是伤不起啊![抓狂]
25568       1  路边社最新消息，@光辉Ken 最#适合开的车#居然是【人力三轮车】，被我发现了[哈哈] @小...


In [3]:

def clean_text(text):
    # 确保文本是字符串类型
    text = str(text)
    # 去除HTML标签
    text = re.sub(r'<.*?>', '', text)
    # 替换特殊字符和数字
    text = re.sub(r'[\r|\n|\\|0-9]', '', text)
    # 去除标点
    text = re.sub(r'[^\w\s]', '', text)
    return text

# 中文分词
def chinese_tokenization(text):
    return " ".join(jieba.cut(text))

# 去除停用词
def remove_stopwords(text):
    words = text.split()
    words_filtered = [word for word in words if word not in stopwords]
    return ' '.join(words_filtered)

pd_all_balance['review'] = pd_all_balance['review'].apply(clean_text)

pd_all_balance['review'] = pd_all_balance['review'].apply(chinese_tokenization)

stopwords = set(open('cn_stopwords.txt', 'r', encoding='utf-8').read().split())

pd_all_balance['review'] = pd_all_balance['review'].apply(remove_stopwords)

print(pd_all_balance.sample(10))



Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\YetGirt\AppData\Local\Temp\jieba.cache
Loading model cost 0.451 seconds.
Prefix dict has been built successfully.


        label                                             review
33823       1                            哈哈 电影 大鱼 小说 版 没 看过 这点 转
30401       1                                祝福 祝福 百年好合 鼓掌 鼓掌 鼓掌
31567       1  一大早 收到 哥哥 生日 祝福 开心 今天 农历 生日 下周一 阳历 生日 二十五 咯 不能...
72388       0  泪 陈倩冰 泪毛 大庆 看了又看 潸然泪下 雅昌 艺术网 官方 微博 摄影 世界 这是 一个...
113031      0         死 老外 滚 出 中国 胡爷 励志 成为 女人 怒 文章 同人肉 丫 见 一次 一次
86442       0                                   转发 知道 全球 创意 搜罗 衰
55064       1  我刚 完 往复 评论 站 准备 扯 站 老 嬉皮 Pop 哈哈 捶 黄初 三年 仔细 完 往...
90088       0  这点 出门 走 分钟 没有 走出 我家 大门 我家 不住 故宫 点 之前 没到 真心 不能 ...
106555      0  大概 名 没能 出镜 负责 棚内 具体 生产 工作 大姐 包装 车间 大姐 负责 配送 司机...
2399        1                                           颜色 稍暗 嘻嘻


In [4]:
tokenizer_path = './cache/'

max_sequence_length = 32
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# 对文本进行分词和转换
tokenized_texts = [tokenizer.encode(text, max_length=max_sequence_length, truncation=True) for text in pd_all_balance['review']]

print(pd_all_balance['review'][:5])
# print(tokenized_texts[:5])

# 获取标签
labels = pd_all_balance['label'].values

# 手动实现数据的padding
# 找出最长的序列长度
max_len = max_sequence_length

# 将所有序列padding到最长的序列长度
padded_sequences = []
for seq in tokenized_texts:
    if len(seq) < max_len:
        padded_seq = seq + [tokenizer.pad_token_id] * (max_len - len(seq))
    else:
        padded_seq = seq[:max_len]
    padded_sequences.append(padded_seq)

# 转换为PyTorch tensor
X_data = torch.tensor(padded_sequences, dtype=torch.long)
y_data = torch.tensor(labels, dtype=torch.float32)

print(X_data[:5])


16594    最近 带 两种 茶 铁观音 金骏眉 铁观音 属木 颜色 绿 兰花 香 金骏眉 感觉 属土 味...
57597                                        最后 一张 亮 嘻嘻 嘻嘻
44497        明星 李晨 今晚 住 朗廷 扬子 太 开心 激动 有木有 太 开心 兴奋 有木有 太 开心
58905      康 家村 村长 哈哈 早 知道 签名 哈哈 康 家村 村长 真 幸运 出门 碰见 高晓松 表哥
44440                                        名不虚传 国片 经典 哈哈
Name: review, dtype: object
tensor([[ 101, 3297, 6818, 2372,  697, 4905, 5763, 7188, 6225, 7509, 7032, 7742,
         4691, 7188, 6225, 7509, 2247, 3312, 7582, 5682, 5344, 1065, 5709, 7676,
         7032, 7742, 4691, 2697, 6230, 2247, 1759,  102],
        [ 101, 3297, 1400,  671, 2476,  778, 1677, 1677, 1677, 1677,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 3209, 3215, 3330, 3247,  791, 3241,  857, 3306, 2455, 2813, 2094,
         1922, 2458, 2552, 4080, 1220, 3300, 3312, 3300, 1922, 2458, 2552, 1069,
         1939, 3300, 3312, 3300, 1922, 2458, 2552,  102],
        [ 101, 24

In [5]:

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        input_ids = self.X[idx]
        attention_mask = (input_ids != 0).float()  # 生成 attention_mask
        labels = self.y[idx]
        return input_ids, attention_mask, labels

# 划分数据集
train_size = int(0.9 * len(X_data))
val_size = len(X_data) - train_size

train_dataset = TextDataset(X_data[:train_size], y_data[:train_size])
val_dataset = TextDataset(X_data[train_size:], y_data[train_size:])

# 定义DataLoader
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [6]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, device):
    best_valid_loss = float('inf')
    
    
    for epoch in range(num_epochs):
        
        model.train()
        train_loss = 0
        train_correct = 0
        total_train = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
        
        for inputs, attention_mask, labels in progress_bar:
            inputs = inputs.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            
            optimizer.zero_grad()
            outputs = model(inputs,attention_mask)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() 

            predicted = torch.sigmoid(outputs).squeeze() > 0.5
            train_correct += (predicted == labels).sum().item()
            total_train += labels.size(0)

            progress_bar.set_postfix({'train_loss': loss.item()})

            writer.add_scalar('Loss/train', loss.item(), epoch+1)
        
        # 验证模型
        model.eval()
        valid_loss = 0
        valid_correct = 0
        total_valid = 0
        
        with torch.no_grad():
            for inputs, attention_mask, labels in valid_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                attention_mask = attention_mask.to(device)
                outputs = model(inputs, attention_mask)
                loss = criterion(outputs.squeeze(), labels)
                valid_loss += loss.item()
                predicted = torch.sigmoid(outputs).squeeze() > 0.5
                valid_correct += (predicted == labels).sum().item()
                total_valid += labels.size(0)

                writer.add_scalar('Loss/valid', loss.item(), epoch+1)

        valid_loss /= len(valid_loader)
        train_loss /= len(train_loader)

        train_acc = (train_correct / total_train) * 100
        valid_acc = (valid_correct / total_valid) * 100
        
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f} %,\
              Valid Loss: {valid_loss:.3f}, Valid Acc: {valid_acc:.2f} %')
        

        torch.save(model.state_dict(), f'./model/model_{epoch+1}.pth')
        
        # 保存最好的模型
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), './model/best_model.pth')
            print('Best model saved')

In [7]:
# 修改模型结构
class SentimentClassifier(nn.Module):
    def __init__(self, pretrained_model_name, output_dim):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.25)
        self.fc = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # 取每个句子的第一个token的隐藏状态作为池化输出
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits


output_dim = 1
num_epochs = 5
learning_rate = 5e-6
weight_decay = 1e-4



# 加载预训练的BERT模型和分词器
model_path = "./cache"
# pretrained_model = BertModel.from_pretrained(model_path)

# 实例化模型
model = SentimentClassifier(model_path, output_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# 定义损失函数和优化器
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
criterion = nn.BCEWithLogitsLoss()

# 开始训练
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)


Some weights of the model checkpoint at ./cache were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cpu


Epoch 1/5:   0%|          | 0/1407 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
import gradio as gr
import torch
from transformers import BertTokenizer

# 定义情感分析模型路径和其他参数
model_path = "./cache/"
path = "./model/model_4.pth"
# tokenizer = BertTokenizer.from_pretrained('')
max_sequence_length = 32 

# 加载情感分析模型
device = torch.device('cpu')  # 如果使用CPU进行推理
model = SentimentClassifier(model_path, output_dim=1)
model.load_state_dict(torch.load(path, map_location=device))
model.eval()


# 推理函数
def infer_sentiment(comments):

    comments = comments.splitlines()

    positive_count = 0
    negative_count = 0
    total_comments = len(comments)
    results = []

    for comment in comments:
        clean_comment = clean_text(comment.strip())  # 清理评论文本
        tokenized_comment = chinese_tokenization(clean_comment)
        final_comment = remove_stopwords(tokenized_comment)
        tokenized_comment = tokenizer(final_comment, padding='max_length', truncation=True, max_length=max_sequence_length)
        input_ids = tokenized_comment['input_ids']
        attention_mask = tokenized_comment['attention_mask']

        with torch.no_grad():
            input_ids = torch.tensor([input_ids]).to(device)
            attention_mask = torch.tensor([attention_mask]).to(device)

            result = model(input_ids, attention_mask)
            probabilities = torch.sigmoid(result)
            predicted_class = (probabilities >= 0.5).item()

            sentiment = 'Positive' if predicted_class == 1 else 'Negative'

            if sentiment == 'Positive':
                positive_count += 1
            else:
                negative_count += 1

            results.append({
                "评论": comment,
                "情感概率": f"{probabilities.item():.3f}",
                "情感分类": sentiment
            })

    positive_percentage = (positive_count / total_comments) * 100
    negative_percentage = (negative_count / total_comments) * 100

    return total_comments, positive_count, f"{positive_percentage:.2f}%", negative_count, f"{negative_percentage:.2f}%"

# Gradio 界面设置
iface = gr.Interface(
    fn=infer_sentiment,
    inputs=gr.Textbox(label="输入评论（每行一个）", type="text", lines=15),
    outputs=[
        gr.Textbox(label="总评论数", type="text"),
        gr.Textbox(label="积极情绪句子数量", type="text"),
        gr.Textbox(label="积极情绪百分比", type="text"),
        gr.Textbox(label="消极情绪句子数量", type="text"),
        gr.Textbox(label="消极情绪百分比", type="text")
    ],
    title="评论情感分析",
    description="输入多条评论，进行情感分析，统计积极和消极情绪数量及百分比。"
)

# 启动 Gradio 界面
iface.launch()



Some weights of the model checkpoint at ./cache/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: Error(s) in loading state_dict for SentimentClassifier:
	Missing key(s) in state_dict: "bert.embeddings.position_ids". 