In [None]:
pip install jieba

In [None]:
!pip install torch torchvision torchaudio --ignore-installed TBB

In [None]:
from collections import defaultdict
import jieba
import re
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
class SentimentClassifier:
    """Lexicon-based binary sentiment classifier for Chinese text.

    Reviews are tokenized with jieba, scored against a sentiment lexicon and
    labelled 1 when the total score is strictly positive, otherwise 0.

    Two lexicon modes are supported:

    * ``'Boson'`` - every sentiment word carries its own numeric score, read
      from ``sentiment_score.txt``.
    * ``'NTUSD'`` - words from ``positive_simplified.txt`` count as +1 and
      words from ``negative_simplified.txt`` count as -1.

    Attributes:
        dictionary_choice: Name of the active lexicon mode.
        degree_words: Mapping of degree adverb -> multiplier.
        negative_words: Set of words that flip the sign of later sentiment
            words. In NTUSD mode the same set is also the negative lexicon.
        stop_words: Set of tokens dropped during preprocessing.
        sentiment_words: Mapping word -> score (Boson mode only).
        positive_words: Set of positive words (NTUSD mode only).
    """

    SUPPORTED_DICTIONARIES = ('Boson', 'NTUSD')

    def __init__(self, dictionary_choice='Boson'):
        """Load the dictionaries required by ``dictionary_choice``.

        Args:
            dictionary_choice: ``'Boson'`` (default) or ``'NTUSD'``.

        Raises:
            ValueError: If ``dictionary_choice`` is not a supported mode.
            OSError: If a dictionary file cannot be opened.
        """
        # Fail fast: an unknown mode would otherwise surface much later as a
        # confusing TypeError when a ``None`` score is compared with 0.
        if dictionary_choice not in self.SUPPORTED_DICTIONARIES:
            raise ValueError(
                f"Unsupported dictionary_choice {dictionary_choice!r}; "
                f"expected one of {self.SUPPORTED_DICTIONARIES}"
            )
        self.dictionary_choice = dictionary_choice
        self.degree_words = self.load_dictionary('degree')
        # Word lists are only used for membership tests, so keep them as sets
        # (O(1) lookup instead of a linear scan for every token).
        self.negative_words = set(self.load_dictionary('negative'))
        self.stop_words = set(self.load_dictionary('stopwords'))
        if dictionary_choice == 'Boson':
            self.sentiment_words = self.load_dictionary('sentiment')
        else:
            # NOTE(review): in NTUSD mode ``negative_words`` serves both as
            # the negation list and as the negative sentiment lexicon because
            # both come from negative_simplified.txt - confirm this is
            # intended.
            self.positive_words = set(self.load_dictionary('positive'))

    def load_dictionary(self, dict_type):
        """Read one dictionary file from ``./dictionary/``.

        Args:
            dict_type: One of ``'degree'``, ``'sentiment'``, ``'positive'``,
                ``'negative'`` or ``'stopwords'``.

        Returns:
            * ``'degree'``: dict word -> float, parsed from ``word,score``
              lines of ``degree.txt``.
            * ``'sentiment'``: dict word -> float, parsed from
              whitespace-separated ``word score`` lines of
              ``sentiment_score.txt``.
            * ``'positive'`` / ``'negative'`` / ``'stopwords'``: list of the
              stripped lines of ``<dict_type>_simplified.txt``.
            * anything else: ``None``.

        Malformed lines are reported on stdout and skipped; blank lines are
        skipped silently.
        """
        path = './dictionary/'
        if dict_type == 'degree':
            degree_dict = {}
            with open(path + 'degree.txt', 'r', encoding='utf-8') as file:
                for line in file:
                    entry = line.strip()
                    if not entry:
                        continue
                    try:
                        word, score = entry.split(',')
                        degree_dict[word] = float(score)
                    except ValueError as e:
                        print(f"错误解析行: {entry} - 错误信息: {e}")
            return degree_dict
        if dict_type == 'sentiment':
            sentiment_dict = {}
            with open(path + 'sentiment_score.txt', 'r', encoding='utf-8') as file:
                for line in file:
                    entry = line.strip()
                    if not entry:
                        continue
                    parts = entry.split()
                    if len(parts) < 2:
                        print(f"无效的行格式: {entry}")
                        continue
                    word, score = parts[0], parts[1]
                    try:
                        sentiment_dict[word] = float(score)
                    except ValueError as e:
                        print(f"无法转换 '{score}' 为浮点数 - 行: {entry} - 错误信息: {e}")
            return sentiment_dict
        if dict_type in ("positive", "negative", 'stopwords'):
            with open(path + f'{dict_type}_simplified.txt', 'r', encoding='utf-8') as file:
                return [line.strip() for line in file]
        return None

    @staticmethod
    def _contains_chinese(token):
        """Return True if ``token`` has a character in U+4E00..U+9FA5."""
        return any('\u4e00' <= char <= '\u9fa5' for char in token)

    def preprocess_reviews(self, reviews):
        """Clean and tokenize raw reviews.

        For each review: strip ``@user:`` mentions (ASCII or full-width
        colon), replace runs of non-word characters and underscores with a
        space, segment with jieba, then drop stop words and tokens without
        any Chinese character.

        Args:
            reviews: Iterable of review strings.

        Returns:
            List of strings, one per review, with the kept tokens joined by
            single spaces.
        """
        # Load the jieba dictionary once up front rather than on every review.
        jieba.initialize()
        preprocessed_reviews = []
        for review in reviews:
            review = re.sub('@.*?:', '', review)
            review = re.sub('@.*?：', '', review)
            review = re.sub(r'\W+', ' ', review).replace('_', ' ')
            tokens = jieba.lcut(review)
            kept = [
                token for token in tokens
                if token not in self.stop_words and self._contains_chinese(token)
            ]
            preprocessed_reviews.append(' '.join(kept))
        return preprocessed_reviews

    def calculate_sentiment_score(self, word_list):
        """Score a token list with the lexicon chosen at construction.

        Args:
            word_list: List of tokens for one review.

        Returns:
            Numeric sentiment score (0 when no sentiment word is present).

        Raises:
            ValueError: If ``self.dictionary_choice`` is not supported.
        """
        if self.dictionary_choice == 'Boson':
            return self.compute_boson_score(word_list)
        if self.dictionary_choice == 'NTUSD':
            return self.compute_ntusd_score(word_list)
        raise ValueError(
            f"Unsupported dictionary_choice {self.dictionary_choice!r}; "
            f"expected one of {self.SUPPORTED_DICTIONARIES}"
        )

    def _score_with_modifiers(self, words, base_score):
        """Sum the base scores of ``words`` after applying modifiers.

        ``base_score(word)`` returns the raw score of a sentiment word or
        ``None`` for any other word. Each sentiment word's score is
        multiplied by -1 for every negation word and by the degree value for
        every degree word that occurs earlier in ``words``.

        NOTE(review): modifiers are never reset, so a negation or degree word
        affects every later sentiment word in the review, not just the next
        one - confirm this is the intended scoring rule.
        """
        total = 0
        # Multipliers contributed by the words seen so far, in encounter
        # order (negation before degree for the same word) so that products
        # are evaluated in a fixed, reproducible order.
        factors = []
        for word in words:
            value = base_score(word)
            if value is not None:
                for factor in factors:
                    value *= factor
                total += value
            if word in self.negative_words:
                factors.append(-1)
            if word in self.degree_words:
                factors.append(self.degree_words[word])
        return total

    def compute_boson_score(self, words):
        """Return the modifier-adjusted sum of Boson scores for ``words``."""
        return self._score_with_modifiers(words, self.sentiment_words.get)

    def compute_ntusd_score(self, words):
        """Return the modifier-adjusted sum of +1/-1 polarities for ``words``."""
        def polarity(word):
            # The positive list takes precedence when a word is in both lists.
            if word in self.positive_words:
                return 1
            if word in self.negative_words:
                return -1
            return None

        return self._score_with_modifiers(words, polarity)

    def analyze_sentiments(self, preprocessed_texts):
        """Score and classify preprocessed reviews.

        Args:
            preprocessed_texts: Iterable of space-separated token strings, as
                produced by ``preprocess_reviews``.

        Returns:
            Tuple ``(scores, predictions)`` of two parallel lists; a
            prediction is 1 when the score is strictly positive, else 0.
        """
        scores = [
            self.calculate_sentiment_score(text.split())
            for text in preprocessed_texts
        ]
        predictions = [1 if score > 0 else 0 for score in scores]
        return scores, predictions

    def evaluate_model(self, true_labels, predicted_labels):
        """Print accuracy, precision, recall and F1 for the predictions.

        Args:
            true_labels: Ground-truth labels.
            predicted_labels: Predicted labels, aligned with ``true_labels``.
        """
        accuracy = accuracy_score(true_labels, predicted_labels)
        # zero_division=1 makes an undefined metric report 1 instead of
        # emitting a warning.
        precision = precision_score(true_labels, predicted_labels, zero_division=1)
        recall = recall_score(true_labels, predicted_labels, zero_division=1)
        f1 = f1_score(true_labels, predicted_labels, zero_division=1)
        print(f'准确率: {accuracy}\n精确率: {precision}\n召回率: {recall}\nF1分数: {f1}')

if __name__ == "__main__":
    # Evaluate the lexicon classifier on the labelled Tieba review set.
    data_path = './data/tieba_senti_10k.csv'
    # The reviews are Chinese text. Decoding them as latin1 never raises but
    # turns every byte into a U+0000..U+00FF character, so no token would
    # survive the Chinese-character filter in preprocess_reviews and every
    # prediction would be 0. Try UTF-8 (with or without BOM) first and fall
    # back to GB18030, a superset of GBK/GB2312.
    try:
        data = pd.read_csv(data_path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        data = pd.read_csv(data_path, encoding='gb18030')
    reviews, labels = data['review'].values, data['label'].values

    classifier = SentimentClassifier()
    processed_reviews = classifier.preprocess_reviews(reviews)
    scores, predictions = classifier.analyze_sentiments(processed_reviews)
    classifier.evaluate_model(labels, predictions)