In [1]:
from src.dataset import EmailDataset
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from torch.utils.data import random_split, DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split


In [20]:
# Location of the corpus, relative to the notebook's working directory.
data_root = "./data/trec06c-utf8"

# Build the dataset object.
# NOTE(review): `using_cut=True` presumably enables word segmentation so that
# texts come back as whitespace-separated tokens (later cells tokenize with
# `str.split`) — confirm in src.dataset.
emailSet = EmailDataset(data_root, using_cut=True)
print("Dataset size:", len(emailSet))

# Indexing the dataset yields a 2-tuple; only the second element (the label)
# is inspected here. Note that binding `_` shadows IPython's "last output"
# variable for the rest of the session.
_, label = emailSet[0]
print("Sample label:", label)

  label        file_path
0  spam  ../data/000/000
1   ham  ../data/000/001
2  spam  ../data/000/002
3  spam  ../data/000/003
4  spam  ../data/000/004


Loading data: 0it [00:00, ?it/s]

Loading data: 64620it [00:02, 29266.86it/s]

Dataset size: 64620
Sample label: spam





In [21]:
# Sanity check: make sure no sample is missing its label.
df = emailSet.label_df
label_is_missing = df["label"].isna()  # boolean mask, True where the label is NaN
print(label_is_missing.sum())          # number of NaN labels
print(df[label_is_missing].head())     # preview of offending rows (empty if none)

0
Empty DataFrame
Columns: [label, file_path]
Index: []


划分训练集和测试集

In [22]:
# Stratified 80/20 hold-out split, performed over sample positions so the
# underlying dataset object is shared by both subsets.
labels = emailSet.labels
idx = [*range(len(emailSet))]

train_idx, test_idx = train_test_split(
    idx,
    stratify=labels,   # keep the class ratio identical in both splits
    random_state=42,   # fixed seed so the split is reproducible
    test_size=0.2,
)

train_ds, test_ds = Subset(emailSet, train_idx), Subset(emailSet, test_idx)
print(len(test_ds))

12924


朴素贝叶斯和词袋做文本分类
baseline

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Baseline: bag-of-words counts fed to a multinomial naive Bayes classifier.

# Materialise each split with ONE dataset access per sample. Indexing the
# subset separately for the text and for the label calls
# EmailDataset.__getitem__ twice per sample, doubling whatever work it does.
# (Assumes __getitem__ is deterministic, which the original code also relied on.)
train_samples = [train_ds[i] for i in range(len(train_ds))]
test_samples = [test_ds[i] for i in range(len(test_ds))]

X_train = [sample[0] for sample in train_samples]
y_train = [sample[1] for sample in train_samples]

X_test = [sample[0] for sample in test_samples]
y_test = [sample[1] for sample in test_samples]

model = Pipeline([
    ("vec", CountVectorizer(
        tokenizer=str.split,   # split on whitespace only; texts are presumably pre-segmented — confirm
        token_pattern=None,    # unused with a custom tokenizer; None silences sklearn's warning
        lowercase=False        # keep tokens exactly as produced upstream
    )),
    ("nb", MultinomialNB(alpha=1.0))  # alpha=1.0 -> Laplace smoothing
])

# Train + predict
model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=6))


Accuracy: 0.9835964097802538
[[4256   97]
 [ 115 8456]]
              precision    recall  f1-score   support

         ham   0.973690  0.977717  0.975699      4353
        spam   0.988659  0.986583  0.987620      8571

    accuracy                       0.983596     12924
   macro avg   0.981175  0.982150  0.981659     12924
weighted avg   0.983617  0.983596  0.983605     12924



保存csv

In [34]:
# Export a per-sample correctness table for the test split.
# NOTE: `pred` is whatever the most recently executed model cell produced, so
# run this cell right after the model whose results should be saved.
y_test = [test_ds[i][1] for i in range(len(test_ds))]

# 1 = prediction matches the true label, 0 = mismatch.
# (The name/column "y_ture" is kept verbatim so the CSV schema stays unchanged.)
y_ture = [1 if p == y else 0 for p, y in zip(pred, y_test)]

df = pd.DataFrame({
    "id": list(range(1, len(test_ds) + 1)),  # 1-based position within the test split
    "y_ture": y_ture,
})

acc = accuracy_score(y_test, pred)

# Build the file name once so the log line below reports the file that was
# actually written (previously it always claimed "result.csv").
out_path = "result" + "_" + str(acc) + ".csv"
df.to_csv(out_path, index=False, encoding="utf-8")
print(df.head())
print("saved:", len(df), "rows ->", out_path)

   id  y_ture
0   1       1
1   2       1
2   3       1
3   4       1
4   5       1
saved: 12924 rows -> result.csv


加入 n-gram，并用 TF-IDF 代替纯词频

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Variant 1: raw counts over unigrams + bigrams with document-frequency pruning.

# Materialise each split with ONE dataset access per sample. Indexing the
# subset separately for the text and for the label calls
# EmailDataset.__getitem__ twice per sample, doubling whatever work it does.
# (Assumes __getitem__ is deterministic, which the original code also relied on.)
train_samples = [train_ds[i] for i in range(len(train_ds))]
test_samples = [test_ds[i] for i in range(len(test_ds))]

X_train = [sample[0] for sample in train_samples]
y_train = [sample[1] for sample in train_samples]

X_test = [sample[0] for sample in test_samples]
y_test = [sample[1] for sample in test_samples]

model = Pipeline([
    ("vec", CountVectorizer(
        tokenizer=str.split,   # split on whitespace only; texts are presumably pre-segmented — confirm
        token_pattern=None,    # unused with a custom tokenizer; None silences sklearn's warning
        lowercase=False,       # keep tokens exactly as produced upstream
        ngram_range=(1, 2),    # unigrams and bigrams
        min_df = 2,            # drop terms that occur in fewer than 2 documents
        max_df = 0.95,         # drop terms that occur in more than 95% of documents
    )),
    ("nb", MultinomialNB(alpha=0.1))  # lighter additive smoothing than the baseline's 1.0
])

# Train + predict
model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=6))


Accuracy: 0.9955122253172393
[[4329   24]
 [  34 8537]]
              precision    recall  f1-score   support

         ham   0.992207  0.994487  0.993346      4353
        spam   0.997197  0.996033  0.996615      8571

    accuracy                       0.995512     12924
   macro avg   0.994702  0.995260  0.994980     12924
weighted avg   0.995516  0.995512  0.995513     12924



In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Variant 2: TF-IDF weights (instead of raw counts) over 1- to 4-grams.

# Materialise each split with ONE dataset access per sample. Indexing the
# subset separately for the text and for the label calls
# EmailDataset.__getitem__ twice per sample, doubling whatever work it does.
# (Assumes __getitem__ is deterministic, which the original code also relied on.)
train_samples = [train_ds[i] for i in range(len(train_ds))]
test_samples = [test_ds[i] for i in range(len(test_ds))]

X_train = [sample[0] for sample in train_samples]
y_train = [sample[1] for sample in train_samples]

X_test = [sample[0] for sample in test_samples]
y_test = [sample[1] for sample in test_samples]

model = Pipeline([
    ("vec", TfidfVectorizer(
        tokenizer=str.split,   # split on whitespace only; texts are presumably pre-segmented — confirm
        token_pattern=None,    # unused with a custom tokenizer; None silences sklearn's warning
        lowercase=False,       # keep tokens exactly as produced upstream
        ngram_range=(1, 4),    # unigrams up to 4-grams
        min_df=2,              # drop terms that occur in fewer than 2 documents
        max_df=0.95,           # drop terms that occur in more than 95% of documents
        sublinear_tf=True,     # replace tf with 1 + log(tf)
        norm="l2"              # L2-normalise each document vector
    )),
    ("nb", MultinomialNB(alpha=0.1))  # same smoothing as the n-gram count variant
])

# Train + predict
model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=6))


Accuracy: 0.9963633549984525
[[4353    0]
 [  47 8524]]
              precision    recall  f1-score   support

         ham   0.989318  1.000000  0.994630      4353
        spam   1.000000  0.994516  0.997251      8571

    accuracy                       0.996363     12924
   macro avg   0.994659  0.997258  0.995941     12924
weighted avg   0.996402  0.996363  0.996368     12924

