In [4]:
import sentencepiece as spm
import csv

In [2]:
# Train the SentencePiece tokenizer model from an external text corpus.
spm.SentencePieceTrainer.train(
    input='XYJ.txt',
    model_prefix="XYJ_mod",
    vocab_size=10000,
    model_type='unigram',      # or 'bpe'
    # SentencePiece disables the padding piece by default (pad_id=-1), so
    # sp.pad_id() would return -1, which is not a valid embedding index.
    # Later cells use sp.pad_id() for padding, so reserve a real id for <pad>
    # (ids 0/1/2 stay <unk>/<s>/</s>).
    pad_id=3,
    user_defined_symbols=['PAD','UNK']  # custom symbols (optional)
)


In [None]:
# Load the tokenizer trained in the previous cell.
sp = spm.SentencePieceProcessor()
sp.Load('XYJ_mod.model')

# Collect (token_ids, label) pairs from the review CSV.
ds_comments = []
# newline='' is required by the csv module so that quoted fields containing
# line breaks (possible in free-text comments) are parsed as a single field.
# NOTE(review): the path 'C' is kept as written -- confirm it points at the
# intended CSV file. No encoding is given, so the platform default is used;
# confirm that it matches the file.
with open('C', 'r', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        star = int(row['Star'])
        # Only 1/2/4/5-star rows are kept; every other rating is skipped.
        if star in (1, 2, 4, 5):
            comment = row["Comment"]
            # EncodeAsIds yields the token ids directly.
            token_ids = sp.EncodeAsIds(comment)
            # Label: 1 when star > 2, otherwise 0.
            ds_comments.append((token_ids, 1 if star > 2 else 0))

print(len(ds_comments))

In [None]:
# Keep only the samples whose token count is at least 10 and below 150.
ds_comments = [
    sample
    for sample in ds_comments
    if 10 <= len(sample[0]) < 150
]
print(len(ds_comments))

In [None]:
import os
import pickle

# 'ds_comments.pkl' is read here and by the training cell, but no visible cell
# writes it. Persist the in-memory ds_comments when the cache file is missing
# so the notebook can run top to bottom on a fresh checkout.
if not os.path.exists('ds_comments.pkl'):
    with open('ds_comments.pkl', 'wb') as f:
        pickle.dump(ds_comments, f)

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('ds_comments.pkl','rb') as f:
    comments_data = pickle.load(f)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader


In [None]:

class Comments_Classifier(nn.Module):
    """Embedding -> LSTM -> linear classifier over sequences of token ids.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden state size.
        num_classes: number of output logits.
        pad_id: token id used for padding; forwarded to nn.Embedding as
            ``padding_idx`` and used to locate the last real token.
    """

    def __init__(self,vocab_size,embedding_dim,hidden_size,num_classes,pad_id=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_id)
        self.rnn = nn.LSTM(embedding_dim,hidden_size,batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            input_ids: integer tensor of shape (batch_size, seq_len),
                right-padded with the padding id.
        """
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)
        # output: (batch_size, seq_len, hidden_size)
        output, _ = self.rnn(embedded)

        pad_id = self.embedding.padding_idx
        if pad_id is None:
            # No padding id configured: fall back to the final time step.
            last_step = output[:, -1, :]
        else:
            # For right-padded batches the final time step is a padding
            # position, so pick the LSTM output at the last non-pad token of
            # each row instead. Rows made only of padding use position 0.
            positions = torch.arange(input_ids.size(1), device=input_ids.device)
            last_real = ((input_ids != pad_id).long() * positions).max(dim=1).values
            rows = torch.arange(input_ids.size(0), device=input_ids.device)
            last_step = output[rows, last_real]

        return self.fc(last_step)
    

if __name__ == '__main__':
    # Device configuration.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the training corpus; each sample unpacks as (token ids, label).
    # NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    with open('ds_comments.pkl','rb') as f:
        comments_data = pickle.load(f)

    # Load the tokenizer and read the vocabulary size from it.
    sp = spm.SentencePieceProcessor()
    sp.Load('XYJ_mod.model')
    vocab_size = sp.GetPieceSize()

    # Resolve one padding id for both the collate function and the model.
    # sp.pad_id() is -1 when the tokenizer was trained without a pad piece;
    # -1 is not a valid embedding index, so fall back to the user-defined
    # 'PAD' symbol (PieceToId returns the unknown id if that is missing too).
    pad_id = sp.pad_id()
    if pad_id < 0:
        pad_id = sp.PieceToId('PAD')

    def convert_data(batch_data):
        """Collate (token ids, label) samples into a padded batch.

        Called by the DataLoader for every batch. Returns a
        (batch, max_len) id tensor right-padded with ``pad_id`` and a
        1-D label tensor.
        """
        comments, votes = [], []
        # Split each sample into its id sequence and its label.
        for comment, vote in batch_data:
            comments.append(torch.tensor(comment, dtype=torch.long))
            votes.append(vote)

        padded = pad_sequence(comments, batch_first=True, padding_value=pad_id)
        labels = torch.tensor(votes, dtype=torch.long)
        return padded, labels

    # Build the DataLoader over the sample list.
    dataloader = DataLoader(comments_data, batch_size=32, shuffle=True, collate_fn=convert_data)

    # Model hyper-parameters.
    embedding_dim = 100
    hidden_size = 128
    num_classes = 2

    # Build the model with the same pad id used for batching.
    model = Comments_Classifier(vocab_size, embedding_dim, hidden_size, num_classes, pad_id).to(device)

    # Loss function.
    criterion = nn.CrossEntropyLoss()
    # Optimizer.
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop.
    num_epochs = 1
    model.train()
    for epoch in range(num_epochs):
        for i, (batch_ids, batch_labels) in enumerate(dataloader):
            # Move the batch to the target device.
            batch_ids = batch_ids.to(device)
            batch_labels = batch_labels.to(device)

            # Forward pass.
            outputs = model(batch_ids)
            # Compute the loss.
            loss = criterion(outputs, batch_labels)
            # Backward pass.
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            # Parameter update.
            optimizer.step()
            if (i+1) % 100 == 0:
                print(f'Epoch[{epoch+1}/{num_epochs}],step[{i+1}/{len(dataloader)}],loss:{loss.item():.4f}')

    # Save the trained weights once all epochs have finished.
    torch.save(model.state_dict(),'dmsc_comments_classifier_spm.pth')
            

In [None]:
# Model inference: configuration, tokenizer and sample inputs.

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyper-parameters used to rebuild the network.
embedding_dim, hidden_size, num_classes = 100, 128, 2

# Load the tokenizer; the vocabulary size is read straight from it.
sp = spm.SentencePieceProcessor()
sp.Load('XYJ_mod.model')
vocab_size = sp.GetPieceSize()

# Sample comments to classify.
comment1 = '这部电影真好看！我很喜欢'
comment2 = '看到一半就不想看了，太无聊了，演员演技也很差'

# Text preprocessing helper (encodes with SentencePiece).
def text_to_tensor(text, sp_model, device):
    """Encode ``text`` with ``sp_model`` and return a (1, seq_len) long tensor on ``device``."""
    token_ids = sp_model.EncodeAsIds(text)
    # Add the leading batch dimension after building the 1-D id tensor.
    single_batch = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0)
    return single_batch.to(device)

# Encode both comments as (1, seq_len) id tensors.
comment1_idx = text_to_tensor(comment1, sp, device)
comment2_idx = text_to_tensor(comment2, sp, device)

# Resolve a valid padding id. sp.pad_id() is -1 when the tokenizer has no pad
# piece, so fall back to the user-defined 'PAD' symbol in that case.
pad_id = sp.pad_id()
if pad_id < 0:
    pad_id = sp.PieceToId('PAD')

# Rebuild the network and load the trained weights.
model = Comments_Classifier(vocab_size, embedding_dim, hidden_size, num_classes, pad_id)
model.load_state_dict(torch.load('dmsc_comments_classifier_spm.pth', map_location=device))
model.to(device)
model.eval()

# Inference without gradient tracking (the original called the non-existent
# torch.no_gard); the predicted class is the index of the largest logit.
with torch.no_grad():
    pred1 = torch.argmax(model(comment1_idx), dim=1).item()
    pred2 = torch.argmax(model(comment2_idx), dim=1).item()

print(f'评论1的预测结果: {pred1}')
print(f'评论2的预测结果: {pred2}')