In [1]:
import os
import re
from collections import Counter
from email import parser, policy
from html import unescape

import nltk
import pandas
import urlextract
from nltk.corpus import stopwords
from sklearn import metrics, preprocessing, naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from email.parser import BytesParser

import joblib
# import sys
# print(sys.executable)

In [2]:
# 读取数据集
INDEX_PATH = os.path.join('trec07p', 'test', 'index')  # 先使用较小的数据集进行训练
DATA_PATH = os.path.join('trec07p', 'data')  # 数据文件夹路径
labels = []
filenames = []

In [3]:
# 将delay中的标签和对应的文件名保存
def create_dataset(index_path):
    with open(index_path) as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.split(' ')
            labels.append(line[0])
            filenames.append(line[1].strip('\n').split('/')[-1])
            
create_dataset(INDEX_PATH)

In [4]:
# 下载delay中索引的文件内容
def load_email(filename, file_path):
    with open(os.path.join(file_path, filename), 'rb') as f:
        return parser.BytesParser(policy=policy.default).parse(f)

raw_emails = [load_email(name, DATA_PATH) for name in filenames]

In [5]:
print(raw_emails[3].get_content().strip())  # 打印邮件文本内容，注意这里输出内容不一定是正确的，只有当邮件为文本类型，才能输出；不能正确输出说明邮件是多部份的

Hey Billy, 

it was really fun going out the other night 
and talking, while we were out you said that you felt
insecure about your manhood, I noticed in the toilets
you were quite small in that area : ) , but not to 
worry.. that website that I was telling you about is 
my secret weapon to an extra 3 inches, trust me.. girls
love bigger ones, I've had 5 times as many chicks 
since I used these pills a year ago. The package I used
was the 6 month supply one,  and its worth every 
cent and more.. the website is http://ctmay.com 
Ring me on the weekend and we will go out and drink 
again and let you know some more secrets : ).
Later dude, Brad


In [6]:
# 数据预处理
# 构造函数获取邮件的结构类型及其计数

# 确定每封电子邮件的结构类型
def get_email_structure(email):
    if isinstance(email, str): # 字符串直接返回
        return email
    payload = email.get_payload() # 提取email的主体部分
    if isinstance(payload, list): # 如果是列表，说明该邮件为多部份
        return 'multipart({})'.format(', '.join([get_email_structure(sub_email) for sub_email in payload]))
        # 如果有两个子部分，一个是 text/plain，另一个是 text/html，那么最终的结果将是 'multipart(text/plain, text/html)'
    else: # 否则一般是text/plain或text/html
        return email.get_content_type()

In [7]:
# 统计一组电子邮件中各种结构类型的出现次数
def structures_counter(emails):
    structures = Counter() # 字典类型
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [8]:
print(structures_counter(raw_emails).most_common())  # 显示邮件包含的类型
"""
这对于分析电子邮件数据集的结构分布非常有用，可以帮助我们理解数据集中最常见的电子邮件类型，从而为进一步的数据处理和特征工程提供信息。
例如，如果多数邮件都是纯文本类型，那么我们可能会专注于文本内容的分析；如果有大量的多部分邮件，我们可能需要考虑如何处理嵌入的图片或附件。
"""

[('text/plain', 411), ('multipart(text/plain, text/html)', 206), ('text/html', 139), ('multipart(multipart(text/plain, text/html), image/gif)', 88), ('multipart(multipart(text/plain, text/html), image/jpeg)', 65), ('multipart(text/html)', 32), ('multipart(text/html, image/gif)', 20), ('multipart(text/plain)', 17), ('multipart(text/plain, application/x-msdownload)', 9), ('multipart(multipart(text/plain, text/html), image/png)', 3), ('multipart(text/plain, application/pgp-signature)', 3), ('multipart(text/plain, multipart(text/plain), text/plain)', 1), ('multipart(text/plain, text/plain)', 1), ('multipart(text/plain, application/octet-stream)', 1), ('multipart(text/html, image/jpeg)', 1), ('multipart(text/plain, text/x-patch)', 1), ('multipart(multipart(text/plain, text/html), image/gif, image/gif, image/jpeg, application/octet-stream)', 1), ('multipart(multipart(text/plain, text/html), image/gif, image/gif, image/gif, image/gif, image/jpeg, image/gif, application/octet-stream, image/gif

'\n这对于分析电子邮件数据集的结构分布非常有用，可以帮助我们理解数据集中最常见的电子邮件类型，从而为进一步的数据处理和特征工程提供信息。\n例如，如果多数邮件都是纯文本类型，那么我们可能会专注于文本内容的分析；如果有大量的多部分邮件，我们可能需要考虑如何处理嵌入的图片或附件。\n'

In [9]:
# 将原始的电子邮件内容转换为更适合文本分析和机器学习模型训练的格式

# 将HTML内容转换为纯文本，同时替换为相应的关键词
def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<[aA]\s.*?>', 'HYPERLINK', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<img\s.*?>', 'IMAGE', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [10]:
# 用于从电子邮件中提取文本内容
def email_to_text(email):
    html = None
    # walk()打印出一封具有多部分结构之信息的每个部分的MIME类型
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ('text/plain', 'text/html'):
            continue
        try:
            content = part.get_content()
        except LookupError:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [11]:
# 下载 stopwords 资源
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\86180\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# 分词
stopwords_list = stopwords.words('english')  # 英文停用词列表
token = nltk.stem.SnowballStemmer('english')  # 提取词干，词干提取器对象，用于将单词还原为基本形式或词干。例如，将“running”还原为“run”。

# 将所有小写英文字母添加到停用词列表
for single in range(97, 123):
    stopwords_list.append(chr(single))
    
extractor = urlextract.URLExtract() # 创建了一个URL提取器对象，用于从文本中找出URL

In [13]:
# 将电子邮件文本转换为一个清洗和标准化的单词列表
def word_split(email):
    text = email_to_text(email) or ' '
    text = text.lower()
    text = re.sub(r'\W+', ' ', text, flags=re.M) # 使用正则表达式替换文本中的所有非字母数字字符为单个空格
    urls = list(set(extractor.find_urls(text))) # 一个去重的URL列表
    urls.sort(key=lambda item: len(item), reverse=True) # 将找到的URL按长度降序排序
    for url in urls:
        text = text.replace(url, "URL") # 将文本中的所有URL替换为特征词“URL”
    text = re.sub(r'\d+(?:\.\d*[eE]\d+)?', 'NUMBER', text) # 使用正则表达式将文本中的所有数字替换为字符串“NUMBER”
    content = list(nltk.word_tokenize(text)) # 使用NLTK的 word_tokenize 函数将文本分割成单词列表
    all_words = []
    for word in content:
        if word not in stopwords_list:
            word = token.stem(word)
            all_words.append(word)
    return all_words

In [14]:
# 下载 punkt 资源
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86180\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
all_emails = [word_split(data) for data in raw_emails]

In [16]:
print(all_emails[1])  # 查看分词结果

['hi', 'updat', 'gulus', 'check', 'mirror', 'seem', 'littl', 'typo', 'debian', 'readm', 'file', 'exampl', 'http', 'gulus', 'usherbrook', 'ca', 'debian', 'readm', 'ftp', 'ftp', 'fr', 'debian', 'org', 'debian', 'readm', 'test', 'lenni', 'access', 'releas', 'dist', 'test', 'current', 'test', 'develop', 'snapshot', 'name', 'etch', 'packag', 'test', 'unstabl', 'pass', 'autom', 'test', 'propog', 'releas', 'etch', 'replac', 'lenni', 'like', 'readm', 'html', 'yan', 'morin', 'consult', 'en', 'logiciel', 'libr', 'yan', 'morin', 'savoirfairelinux', 'com', 'number', 'number', 'number', 'unsubscrib', 'email', 'debian', 'mirror', 'request', 'list', 'debian', 'org', 'subject', 'unsubscrib', 'troubl', 'contact', 'listmast', 'list', 'debian', 'org']


In [17]:
# 特征提取
# 创建一个dataframe，列名为text和label
trainDF = pandas.DataFrame()
trainDF['text'] = all_emails
trainDF['label'] = labels

In [18]:
# 将数据集分为训练集和测试集，以便模型能在训练集上学习并在测试集上验证其性能
# sklearn.model_selection.train_test_split
train_data, test_data, train_label, test_label = train_test_split(trainDF['text'],trainDF['label'], random_state=0)

In [19]:
# label编码为目标变量,即从字符串转为一个数字
# sklearn.preprocessing
encoder = preprocessing.LabelEncoder()
train_label = encoder.fit_transform(train_label)
test_label = encoder.fit_transform(test_label)

In [20]:
trainDF['text'] = [' '.join(email) for email in all_emails]
train_data = [' '.join(doc) for doc in train_data]
test_data = [' '.join(doc) for doc in test_data]

In [21]:
# 4.1 计数特征向量
# sklearn.feature_extraction.text.CountVectorizer
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
count_vect.fit(trainDF['text'])
xtrain_count = count_vect.transform(train_data)  # 训练集特征向量
xtest_count = count_vect.transform(test_data)  # 测试集特征向量

In [23]:
model = naive_bayes.MultinomialNB()
model.fit(xtrain_count, train_label)
# 使用测试集的特征向量进行预测
predictions = model.predict(xtest_count)
# 使用测试集的标签来计算评估指标
accuracy = metrics.accuracy_score(test_label, predictions)
precision = metrics.precision_score(test_label, predictions, average='weighted')
recall = metrics.recall_score(test_label, predictions, average='weighted')
f1_score = metrics.f1_score(test_label, predictions, average='weighted')
print("NB, Count Vectors: ", accuracy, precision, recall, f1_score)
# 保存模型
joblib.dump(model, 'NB_model.pkl')
# 保存向量化器
joblib.dump(count_vect, 'NB_vectorizer.pkl')

NB, Count Vectors:  0.98 0.9818181818181817 0.98 0.9803496081977094


### 模型测试

In [31]:
import os
import joblib
# 加载模型和向量化器
model = joblib.load('BernliNB_model.pkl')
vectorizer = joblib.load('BernliNB_vectorizer.pkl')

In [32]:
# 遍历文件夹中的邮件
emails, predictions = [], []
for root, dirs, files in os.walk('trec07p/data'):
    for file in files:
        with open(os.path.join(root, file), 'r', encoding='utf-8', errors='ignore') as f:
            email_content = f.read()
        emails.append(email_content)

In [56]:
# 特征提取
email_features = vectorizer.transform(emails)
# 模型预测
# predictions = model.predict(email_features)
import numpy as np

# 获取模型预测的概率
probabilities = model.predict_proba(email_features)

# 设置新的阈值
threshold = 0.58
predictions = (probabilities[:, 1] >= threshold).astype(int)

spam_count = sum(predictions)
ham_count = len(predictions) - spam_count

spam_ratio = spam_count / len(predictions)
ham_ratio = ham_count / len(predictions)

print(f'垃圾邮件占比: {spam_ratio:.2%}')
print(f'非垃圾邮件占比: {ham_ratio:.2%}')

垃圾邮件占比: 53.02%
非垃圾邮件占比: 46.98%


In [33]:
# 特征提取
email_features = vectorizer.transform(emails)
# 模型预测
predictions = model.predict(email_features)
# 计算垃圾邮件和非垃圾邮件的数量
spam_count = sum(predictions)
ham_count = len(predictions) - spam_count

# 计算占比
spam_ratio = spam_count / len(predictions)
ham_ratio = ham_count / len(predictions)

print(f'垃圾邮件占比: {spam_ratio:.2%}')
print(f'非垃圾邮件占比: {ham_ratio:.2%}')

NotFittedError: This BernoulliNB instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [5]:
index_path = 'trec07p/full/index'
# 初始化计数器
spam_count = 0
ham_count = 0

with open(index_path, 'r') as file:
    for line in file:
        if 'spam' in line:
            spam_count += 1
        elif 'ham' in line:
            ham_count += 1

# 计算总邮件数量
total_emails = spam_count + ham_count

# 计算占比
spam_percentage = (spam_count / total_emails) * 100
ham_percentage = (ham_count / total_emails) * 100

# 输出结果
print(f"垃圾邮件占比: {spam_percentage:.2f}%")
print(f"非垃圾邮件占比: {ham_percentage:.2f}%")

垃圾邮件占比: 66.56%
非垃圾邮件占比: 33.44%
