# In [ ]:
import os
import jieba

def load_emails(folder_path):
    """Read and tokenize every ``.txt`` file in a folder.

    ``os.listdir`` returns entries in arbitrary order, so the files are
    sorted before reading to make the position of each email in the result
    reproducible: names whose stem is purely numeric (``0.txt``, ``2.txt``,
    ``10.txt``) come first in numeric order, followed by all other names in
    lexicographic order.

    Args:
        folder_path: Directory containing UTF-8 encoded ``.txt`` files.

    Returns:
        A list with one entry per ``.txt`` file; each entry is the token
        list produced by ``jieba.lcut`` for that file's content.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    def _order_key(name):
        # Numeric stems sort by value so that "10.txt" follows "9.txt".
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    txt_names = sorted(
        (name for name in os.listdir(folder_path) if name.endswith('.txt')),
        key=_order_key,
    )

    emails = []
    for filename in txt_names:
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        # Tokenize after the file is closed; the handle is no longer needed.
        emails.append(jieba.lcut(content))
    return emails

# Folder (relative to the working directory) that holds the email .txt files.
folder_path = 'email-text'
# One token list per .txt file found in that folder.
emails = load_emails(folder_path)


# In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_words(emails, top_n=600):
    """Fit a bag-of-words vocabulary limited to ``top_n`` features.

    Each tokenized email is re-joined with single spaces so that
    ``CountVectorizer`` can split it again with the ``\\b\\w+\\b`` pattern.

    Args:
        emails: Iterable of token lists, one per email.
        top_n: Value passed to ``CountVectorizer`` as ``max_features``.

    Returns:
        A ``(top_words, vectorizer)`` tuple: the fitted feature names and
        the fitted ``CountVectorizer`` itself.
    """
    documents = [' '.join(tokens) for tokens in emails]
    counter = CountVectorizer(token_pattern=r'(?u)\b\w+\b', max_features=top_n)
    counter.fit(documents)
    return counter.get_feature_names_out(), counter

# Fit the vocabulary on all loaded emails (top_n defaults to 600 features).
top_words, vectorizer = get_top_words(emails)

def get_feature_vectors(emails, vectorizer):
    """Turn tokenized emails into a dense feature matrix.

    Args:
        emails: Iterable of token lists, one per email.
        vectorizer: Object exposing ``transform(documents)`` whose result
            has a ``toarray()`` method (e.g. a fitted ``CountVectorizer``).

    Returns:
        Whatever ``vectorizer.transform(...).toarray()`` yields for the
        space-joined emails.
    """
    # Re-join each token list into one space-separated document string.
    documents = list(map(' '.join, emails))
    sparse_counts = vectorizer.transform(documents)
    return sparse_counts.toarray()

# Dense array of the vectorizer's output for every loaded email.
feature_vectors = get_feature_vectors(emails, vectorizer)


# In [ ]:
from numpy import array

# Positional class labels: the first 127 entries are 1, the remaining 24 are 0.
# NOTE(review): presumably 1 = spam and 0 = ham, and this relies on the emails
# being loaded in that same order — confirm against the dataset.
labels = array([1 if position < 127 else 0 for position in range(127 + 24)])


# In [ ]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import joblib

# Split into training and test sets. The labels are imbalanced (127 vs 24),
# so stratify on them to keep the same class ratio in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors, labels, test_size=0.2, random_state=42, stratify=labels
)

# Create and train the model.
model = MultinomialNB()
model.fit(X_train, y_train)

# Save the model and the top-600 word vocabulary for later reuse.
joblib.dump(model, 'email_classifier_model.pkl')
joblib.dump(top_words, 'top_600_words.pkl')


# In [ ]:
# Load the saved model and the top-600 word vocabulary.
model = joblib.load('email_classifier_model.pkl')
top_words = joblib.load('top_600_words.pkl')
# Rebuild a vectorizer fixed to the saved vocabulary, with the same token
# pattern that get_top_words uses when fitting.
vectorizer = CountVectorizer(vocabulary=top_words, token_pattern=r'(?u)\b\w+\b')

def load_test_emails(folder_path):
    """Load and tokenize the test emails stored in ``folder_path``.

    Thin wrapper that delegates directly to ``load_emails``.

    Args:
        folder_path: Directory containing the test ``.txt`` files.

    Returns:
        The list returned by ``load_emails(folder_path)``.
    """
    return load_emails(folder_path)

# NOTE(review): placeholder path — point this at the real test-email folder
# before running.
test_folder_path = 'path/to/test/email/folder'
test_emails = load_test_emails(test_folder_path)
# Vectorize the test emails with the vocabulary-restricted vectorizer.
test_feature_vectors = get_feature_vectors(test_emails, vectorizer)

# Run predictions.
predictions = model.predict(test_feature_vectors)


# In [ ]:
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score

def evaluate_model(y_true, y_pred, y_score=None):
    """Compute accuracy, precision and ROC AUC for a binary classifier.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard class labels; used for accuracy and
            precision.
        y_score: Optional continuous scores for the positive class (for
            example the positive-class column of ``predict_proba``). When
            given, the AUC is computed from these scores. When omitted,
            the AUC falls back to ``y_pred``, which evaluates the ROC
            curve at a single threshold only.

    Returns:
        A ``(accuracy, precision, auc)`` tuple.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    # For binary classification: prefer ranking scores over hard labels
    # so the AUC reflects the whole ROC curve.
    auc_input = y_pred if y_score is None else y_score
    auc = roc_auc_score(y_true, auc_input)
    return accuracy, precision, auc

# NOTE(review): placeholder path — point this at the real label file before
# running.
test_labels_file = 'path/to/test/label/file.txt'
# NOTE(review): load_labels is not defined in the visible part of this file —
# confirm it is defined or imported elsewhere, otherwise this raises NameError.
test_labels = load_labels(test_labels_file)

# Score the predictions against the loaded labels and print each metric
# rounded to two decimals.
accuracy, precision, auc = evaluate_model(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'AUC: {auc:.2f}')
