In [1]:
import jieba
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def clean_symbols(text):
    text = re.sub('[!！]+', "!", text)
    text = re.sub('[？?]+', "?", text)
    text = re.sub("[a-zA-Z#$%&\'()*+,-./:;：<=>@，。*、…【】《》\"\'[\\]^_`{|}~]+", " 00V ", text)
    return re.sub("\s+", " ", text).strip()

# 读取训练集评论
with open('../DB/train_positive.txt', 'r', encoding='utf-8') as f:
    train_positive = f.read().splitlines()
with open('../DB/train_negative.txt', 'r', encoding='utf-8') as f:
    train_negative = f.read().splitlines()

# 读取测试集评论
with open('../DB/test_combined.txt', 'r', encoding='utf-8') as f:
    test_comments = f.read().splitlines()

# 读取停用词
stopword_pth = "../DB/stopwords.txt"
with open(stopword_pth, 'r', encoding='utf-8') as f:
    stopword = f.read().splitlines()


In [2]:
train_comments = train_positive + train_negative
train_comment_cleaned = []

for train_comment in train_comments:
    # 1. 调用 clean_symbols 处理 train_comment 的特殊符号
    train_comment = clean_symbols(train_comment)
    # 2. 调用 re.sub 将 train_comment 数字换成特殊符号 ' NUM '
    train_comment = re.sub('[1234567890]+', ' NUM ', train_comment)
    # 3. jieba.cut 对 train_comment 进行分词
    cut_train = jieba.cut(train_comment)
    # 4. 去除停用词
    seg_train = [word for word in cut_train if word not in stopword]
    # 5. 重新组合成字符串
    train_comment_cleaned.append(" ".join(seg_train))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Kaidi\AppData\Local\Temp\jieba.cache
Loading model cost 0.683 seconds.
Prefix dict has been built successfully.


In [3]:
test_comment_cleaned = []
y_test = []
for line in test_comments:
    line = line.strip()
    # 检查是否是包含 label 的行
    if line.startswith('<review'):
        # print("Get it: "+line)
        # 提取标签信息
        label_match = re.search(r'label="(\d)"', line)
        if label_match:
            label = int(label_match.group(1))
            y_test.append(label)
            # print("Get label: "+str(label))
    elif line and 'review' not in line:
        # 处理评论内容
        test_comment = line
        test_comment = clean_symbols(test_comment)
        test_comment = re.sub('[1234567890]+', ' NUM ', test_comment)
        cut_test = jieba.cut(test_comment)
        seg_test = [word for word in cut_test if word not in stopword]
        # 将清洗后的评论文本加入到 test_comment_cleaned 列表中
        test_comment_cleaned.append(" ".join(seg_test))

In [4]:
# 打印前5条处理后的评论
for i in range(0, 5):
    print(train_comment_cleaned[i])
print("-----------------------------")
for i in range(0, 5):
    print(test_comment_cleaned[i])

  NUM   V     NUM   V     NUM       NUM   V

请问 这机 不是 有个 遥控器

  NUM   V
-----------------------------
终于 找到 同道中人 初中     NUM   V   已经 喜欢 上     NUM   V   同学 都 鄙夷 眼光 看     NUM   V   人为     NUM   V   样子 古怪 说 ＂ 丑 ＂ 当场 气晕 现在 同道中人     NUM   V   好开心 !     NUM   V   !     NUM   V
看 完 已 深夜 两点     NUM   V   却 坐在 电脑前 情难 自禁     NUM   V   这是 最好 结局     NUM   V   惟有     NUM   V   就让 前世 今生 纠结 停留 此刻     NUM   V   再 相逢 时     NUM   V   愿 人生 不再 人 唏嘘     NUM   V   身心 会 只 居 一处     NUM   V   痛心 人     NUM   V   爱     NUM   V
袁阔成 先生 当今 评书 界 泰斗     NUM   V   十二 金钱 镖 代表作
确实 非常 不错     NUM   V   物有所值
基本上 说 诈骗


In [5]:
# 准备标签
y_train = [1] * len(train_positive) + [0] * len(train_negative)

# 特征提取
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_comment_cleaned)
X_test = vectorizer.transform(test_comment_cleaned)


# 打印特征矩阵和标签的形状
print("X_train shape:", X_train.shape)
print("y_train shape:", len(y_train))
print("X_test shape:", X_test.shape)
print("y_test shape:", len(y_test))
# print(X_test)
# print(y_test)


X_train shape: (54093, 23100)
y_train shape: 54093
X_test shape: (2500, 23100)
y_test shape: 2500


In [6]:
# 1
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

para_c = np.logspace(-5, 2, 15)
model = GridSearchCV(estimator= LogisticRegression(penalty='l2'), param_grid={'C':para_c}, scoring='f1', cv=10)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
print("Report of LogisticRegression")
print(classification_report(y_test, prediction))

Report of LogisticRegression
              precision    recall  f1-score   support

           0       0.86      0.52      0.65      1250
           1       0.66      0.92      0.77      1250

    accuracy                           0.72      2500
   macro avg       0.76      0.72      0.71      2500
weighted avg       0.76      0.72      0.71      2500


In [7]:
# 2
from sklearn.naive_bayes import MultinomialNB
params_alpha = [1]
model = GridSearchCV(estimator=MultinomialNB(fit_prior=True), param_grid = {'alpha':params_alpha}, scoring='f1', cv=10)
model.fit(X_train, y_train)
predic = model.predict(X_test)
print("Report of NaiveBayes")
print(classification_report(y_test, predic))

Report of NaiveBayes
              precision    recall  f1-score   support

           0       0.92      0.39      0.54      1250
           1       0.61      0.96      0.75      1250

    accuracy                           0.68      2500
   macro avg       0.76      0.68      0.65      2500
weighted avg       0.76      0.68      0.65      2500


In [8]:
# 3
from sklearn import svm
params = [{'kernel':['linear'], 'C':[1, 10]},{'kernel':['poly'], 'C':[1]}, {'kernel':['rbf'], 'C':[10], 'gamma':[1, 0.1]}]
model = GridSearchCV(estimator=svm.SVC(), param_grid=params, scoring='f1', cv=5)
model.fit(X_train,y_train)
prediction = model.predict(X_test)
print("Report of SVM")
print(classification_report(y_test, prediction))

KeyboardInterrupt: 