In [6]:
from scipy import sparse
import re
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression


In [2]:
# Regex-based text cleaning
def clean_text(doc):
    """Strip markup, punctuation, digits and extra whitespace from one document.

    Parameters
    ----------
    doc : bytes
        A raw document as a byte string.

    Returns
    -------
    bytes
        The cleaned document: single spaces between tokens, no leading or
        trailing whitespace.

    Notes
    -----
    All patterns are *bytes* patterns, so ``\\w``, ``\\s`` and ``\\d`` match
    ASCII characters only; any non-ASCII byte (e.g. the UTF-8 encoding of an
    accented letter) is treated as punctuation and replaced by a space.
    The patterns are raw literals (``rb"..."``) so the backslash classes are
    not parsed as (invalid) string escape sequences.
    """
    doc = doc.replace(b"<br />", b" ")  # HTML line breaks become a space
    doc = re.sub(rb"<.*?>", b" ", doc)  # drop any remaining HTML tags (non-greedy)
    doc = re.sub(rb"[^\w\s]", b" ", doc)  # drop punctuation / non-word bytes
    doc = re.sub(rb"\d+", b" ", doc)  # drop runs of digits
    doc = re.sub(rb"\s+", b" ", doc).strip()  # collapse whitespace, trim the ends
    return doc

In [3]:
# Read the training split from disk. load_files returns a Bunch; its .data
# attribute holds the training texts and .target the training labels.
# NOTE(review): the stock aclImdb archive also ships an `unsup` folder under
# train/ — if it is still present it would presumably be loaded as a third
# class; confirm it was removed.
reviews_train = load_files("data/aclImdb/train/")
y_train = reviews_train.target
# Run every raw training document through clean_text.
text_train = list(map(clean_text, reviews_train.data))

In [4]:
# Read the test split from disk; .data holds the texts, .target the labels.
reviews_test = load_files("data/aclImdb/test/")
y_test = reviews_test.target
# Run every raw test document through clean_text.
text_test = list(map(clean_text, reviews_test.data))

In [5]:
# Load the sparse feature matrices from their .npz files (train first, then
# test). Judging by the names these are TF-IDF features — how they were
# produced is not shown here.
X_train_tfidf, X_test_tfidf = (
    sparse.load_npz(npz_path)
    for npz_path in ("X_train_tfidf.npz", "X_test_tfidf.npz")
)

In [None]:
def _fit_and_report(label, model):
    """Fit ``model`` on the training matrix, predict the test matrix, print accuracy.

    Reads the notebook-level ``X_train_tfidf``, ``y_train``, ``X_test_tfidf``
    and ``y_test`` at call time.

    Parameters
    ----------
    label : str
        Classifier name used as the prefix of the printed accuracy line.
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(model, y_pred)``: the fitted estimator and its test-set predictions.
    """
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print("{} Test accuracy: {:.3f}".format(label, accuracy_score(y_test, y_pred)))
    return model, y_pred


# Classify with Naive Bayes
Lmodel_nb, y_pred_nb = _fit_and_report("Naive Bayes", MultinomialNB())

# Classify with a random forest
Lmodel_rf, y_pred_rf = _fit_and_report(
    "Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)
)

# Classify with gradient boosting; seeded like the other stochastic models
# so that repeated runs give the same result
Lmodel_gb, y_pred_gb = _fit_and_report(
    "Gradient Boosting", GradientBoostingClassifier(random_state=42)
)

# Classify with a decision tree
Lmodel_dt, y_pred_dt = _fit_and_report(
    "Decision Tree", DecisionTreeClassifier(random_state=42)
)

# Classify with logistic regression
Lmodel_lr, y_pred_lr = _fit_and_report(
    "Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)
)