In [1]:
from src.dataset import EmailDataset
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from torch.utils.data import random_split, DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split


In [2]:
# Corpus root directory, resolved relative to the notebook's working directory.
data_root = "./data/trec06c-utf8"

# Build the dataset once; every later cell works on this object.
# NOTE(review): using_cut=True presumably enables word segmentation inside
# EmailDataset -- confirm against src.dataset.
emailSet = EmailDataset(data_root, using_cut=True)
print(f"Dataset size: {len(emailSet)}")

# An item unpacks into exactly two values; the second one is the label.
_, label = emailSet[0]
print(f"Sample label: {label}")

  label        file_path
0  spam  ../data/000/000
1   ham  ../data/000/001
2  spam  ../data/000/002
3  spam  ../data/000/003
4  spam  ../data/000/004


Loading data: 3149it [00:00, 31480.42it/s]

Loading data: 64620it [00:02, 28752.40it/s]

Dataset size: 64620
Sample label: spam





In [None]:
# Sanity check: look for rows of the label table whose label is missing.
df = emailSet.label_df
missing_label = df["label"].isna()
print(missing_label.sum())       # how many labels are NaN
print(df[missing_label].head())  # preview of the affected rows, if any

0
Empty DataFrame
Columns: [label, file_path]
Index: []


划分训练集和测试集

In [5]:
# Labels drive the stratification so both splits keep the overall class ratio.
labels = emailSet.labels

# Split dataset positions (not the samples themselves) 80/20.
idx = list(range(len(emailSet)))
train_idx, test_idx = train_test_split(
    idx,
    test_size=0.2,
    stratify=labels,
    random_state=114514,  # fixed seed keeps the split reproducible
)

# Wrap each index list as a Subset over the same underlying dataset.
train_ds, test_ds = Subset(emailSet, train_idx), Subset(emailSet, test_idx)
print(len(test_ds))

12924


朴素贝叶斯和词袋做文本分类

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1) Materialise both splits.
# Fetch every (text, label) pair exactly once per split. Indexing a Subset
# forwards to the underlying dataset's __getitem__, so building the text and
# label lists with two separate passes would fetch each sample twice.
train_pairs = [train_ds[i] for i in range(len(train_ds))]
X_train = [text for text, _label in train_pairs]
y_train = [label for _text, label in train_pairs]

test_pairs = [test_ds[i] for i in range(len(test_ds))]
X_test = [text for text, _label in test_pairs]
y_test = [label for _text, label in test_pairs]

# 2) Bag-of-words counts followed by multinomial Naive Bayes.
# NOTE(review): splitting on whitespace assumes the dataset returns text whose
# tokens are already separated by spaces -- confirm against EmailDataset.
model = Pipeline([
    ("vec", CountVectorizer(
        tokenizer=str.split,  # tokens = whitespace-separated pieces
        token_pattern=None,   # unused when a custom tokenizer is supplied
        lowercase=False       # keep the original casing of every token
    )),
    ("nb", MultinomialNB(alpha=1.0))  # alpha=1.0 -> add-one (Laplace) smoothing
])

# 3) Train + predict
model.fit(X_train, y_train)
pred = model.predict(X_test)

# 4) Report metrics on the held-out test split.
print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))


Accuracy: 0.982513153822346
[[4244  109]
 [ 117 8454]]
              precision    recall  f1-score   support

         ham       0.97      0.97      0.97      4353
        spam       0.99      0.99      0.99      8571

    accuracy                           0.98     12924
   macro avg       0.98      0.98      0.98     12924
weighted avg       0.98      0.98      0.98     12924



保存csv

In [7]:
# Export one row per test sample saying whether the prediction was correct.
#
# `pred` and `y_test` are reused from the Naive Bayes cell: this cell already
# requires `pred` from there, and `y_test` was built in that same cell from
# the same `test_ds`, so rebuilding it would only repeat a full pass over the
# test split.
# zip() silently truncates to the shorter input, so make a stale or
# mismatched `pred` fail loudly instead of producing a short or misaligned file.
assert len(pred) == len(y_test) == len(test_ds), (
    "pred / y_test / test_ds are out of sync; re-run the training cell first"
)

# Correctness flag per sample: 1 = prediction equals the true label, 0 = not.
# NOTE(review): "y_ture" looks like a misspelling of "y_true", and the column
# actually stores a correctness flag rather than the true label. The name is
# kept because it is the header written to result.csv -- confirm the schema
# the consumer of this file expects before renaming.
y_ture = [1 if p == y else 0 for p, y in zip(pred, y_test)]

# "id" is the 1-based position inside the test split, not the index of the
# sample in the full dataset.
df = pd.DataFrame({
    "id": list(range(1, len(test_ds) + 1)),
    "y_ture": y_ture,
})

df.to_csv("result.csv", index=False, encoding="utf-8")
print(df.head())
print("saved:", len(df), "rows -> result.csv")

   id  y_ture
0   1       1
1   2       1
2   3       1
3   4       1
4   5       1
saved: 12924 rows -> result.csv
