In [1]:
!pip install -q scikit-learn pandas

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import os
import re

In [6]:
# 载入数据集
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
!wget $url -O aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

--2024-05-30 13:05:01--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-05-30 13:05:08 (11.7 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [7]:
def load_data(directory):
    data = []
    for label in ['pos', 'neg']:
        path = os.path.join(directory, label)
        for filename in os.listdir(path):
            if filename.endswith(".txt"):
                with open(os.path.join(path, filename), encoding='utf-8') as f:
                    review = f.read()
                    data.append((review, 1 if label == 'pos' else 0))
    return pd.DataFrame(data, columns=['review', 'sentiment'])

# 加载训练数据和测试数据
train_data = load_data("aclImdb/train")
test_data = load_data("aclImdb/test")

# 查看数据集
train_data.head()

Unnamed: 0,review,sentiment
0,"Found this flick in a videostore, it cost $2 t...",1
1,"This movie surprised me. Some things were ""cli...",1
2,I saw the movie with two grown children. Altho...,1
3,This is one of the funniest and most excellent...,1
4,"I used to watch this show as a child, and I lo...",1


In [8]:
# 数据预处理
X_train = train_data['review']
y_train = train_data['sentiment']
X_test = test_data['review']
y_test = test_data['sentiment']

In [9]:
# 使用CountVectorizer进行特征向量化
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# 使用TfidfVectorizer进行特征向量化
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [10]:
# 使用RandomForestClassifier进行分类并评估
rf_clf_count = RandomForestClassifier(random_state=42)
rf_clf_count.fit(X_train_counts, y_train)
y_pred_rf_count = rf_clf_count.predict(X_test_counts)
print("RandomForest with CountVectorizer Accuracy:", accuracy_score(y_test, y_pred_rf_count))
print(classification_report(y_test, y_pred_rf_count))

rf_clf_tfidf = RandomForestClassifier(random_state=42)
rf_clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_rf_tfidf = rf_clf_tfidf.predict(X_test_tfidf)
print("RandomForest with TfidfVectorizer Accuracy:", accuracy_score(y_test, y_pred_rf_tfidf))
print(classification_report(y_test, y_pred_rf_tfidf))

# 使用LogisticRegression进行分类并评估
lr_clf_count = LogisticRegression(max_iter=1000, random_state=42)
lr_clf_count.fit(X_train_counts, y_train)
y_pred_lr_count = lr_clf_count.predict(X_test_counts)
print("LogisticRegression with CountVectorizer Accuracy:", accuracy_score(y_test, y_pred_lr_count))
print(classification_report(y_test, y_pred_lr_count))

lr_clf_tfidf = LogisticRegression(max_iter=1000, random_state=42)
lr_clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_clf_tfidf.predict(X_test_tfidf)
print("LogisticRegression with TfidfVectorizer Accuracy:", accuracy_score(y_test, y_pred_lr_tfidf))
print(classification_report(y_test, y_pred_lr_tfidf))

RandomForest with CountVectorizer Accuracy: 0.84536
              precision    recall  f1-score   support

           0       0.84      0.85      0.85     12500
           1       0.85      0.84      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

RandomForest with TfidfVectorizer Accuracy: 0.83812
              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12500
           1       0.85      0.83      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

LogisticRegression with CountVectorizer Accuracy: 0.86668
              precision    recall  f1-score   support

           0       0.86      0.87      0.87     12500
           1       0.87      0.86      0.87     12500

    accuracy        

In [None]:
# 比较结果
results = {
    "RandomForest + CountVectorizer": accuracy_score(y_test, y_pred_rf_count),
    "RandomForest + TfidfVectorizer": accuracy_score(y_test, y_pred_rf_tfidf),
    "LogisticRegression + CountVectorizer": accuracy_score(y_test, y_pred_lr_count),
    "LogisticRegression + TfidfVectorizer": accuracy_score(y_test, y_pred_lr_tfidf)
}

best_combination = max(results, key=results.get)
print("最佳组合是：", best_combination)