In [3]:
import math
import os
import re

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# 定义一个函数，对文件中的内容进行预处理，比如删除一些值
def clear_content(content):
    # 只保留英文字符
    filtered_content = re.sub(r'[^a-zA-Z\s]', '', content)
    filtered_content = filtered_content.lower()
    # 根据换行，将其分成一个一个列表  或者把其中换行 制表符 改为空格
    filtered_content = filtered_content.replace("\n"," ")
    filtered_content = filtered_content.replace("\t"," ")
    # 切分成单词
    filtered_content_list = filtered_content.split(" ")
    filtered_content_without_stopwords = [word for word in filtered_content_list if word not in list(ENGLISH_STOP_WORDS)]
    filtered_content_without_stopwords = [word for word in filtered_content_without_stopwords if word.strip() != ""]
    return filtered_content_without_stopwords


# 定义一个函数，对输入的文件夹的文件进行遍历
def preprocess(folderpath):
    folderpath = folderpath

    email_list = []
    for filename in os.listdir(folderpath):
        content = ""
        file_path = os.path.join(folderpath,filename)
        with open(file_path,mode="r",encoding="gbk") as f:
            content = f.read()
        content = clear_content(content)
        
        email_list.append(content)
    return email_list



In [4]:
# Tokenise both training corpora; each result is a list of per-e-mail word lists.
ham_email_list = preprocess("data/ham")
spam_email_list = preprocess("data/spam")
# Dump the tokenised spam corpus for a visual sanity check.
print(spam_email_list)

[['codeine', 'mg', 'visa', 'codeine', 'methylmorphine', 'narcotic', 'opioid', 'pain', 'reliever', 'mg', 'mg', 'pills', 'mg', 'mg', 'mg', 'visa'], ['ordercializviagra', 'online', 'save', 'nline', 'pharmacy', 'noprescription', 'required', 'buy', 'canadian', 'drugs', 'wholesale', 'prices', 'save', 'fdaapproved', 'drugs', 'superb', 'quality', 'drugs', 'accept', 'major', 'credit', 'cards'], ['gain', 'incredibe', 'gains', 'length', 'inches', 'yourpenis', 'permanantly', 'amazing', 'increase', 'thickness', 'yourpenis', 'betterejacuation', 'control', 'experience', 'rockharderecetions', 'explosive', 'intenseorgasns', 'increase', 'volume', 'ofejacuate', 'doctor', 'designed', 'endorsed', 'herbal', 'natural', 'safe', 'proven', 'naturalpenisenhancement', 'works', 'moneyback', 'guaranteeed'], ['buy', 'ambiem', 'zolpidem', 'mgmg', 'pill', 'pills', 'x', 'mg', 'pills', 'x', 'mg', 'pills', 'x', 'mg', 'pills', 'x', 'mg', 'pills', 'x', 'mg'], ['ordercializviagra', 'online', 'save', 'nline', 'pharmacy', 'no

In [5]:
def get_ham_dic(ham_email_list, spam_email_list):
    """Count, per vocabulary word, how many ham e-mails contain it.

    Args:
        ham_email_list: List of ham e-mails, each a list of words.
        spam_email_list: List of spam e-mails, each a list of words.

    Returns:
        A dict keyed by every distinct word appearing in either corpus. The
        value is the number of ham e-mails that contain the word at least
        once (repeats inside one e-mail count once). Words that occur only in
        spam map to 0.
    """
    # Seed the vocabulary from both corpora so spam-only words are present
    # with a count of zero.
    word_dict = {}
    for email in ham_email_list:
        for word in email:
            word_dict[word] = 0
    for email in spam_email_list:
        for word in email:
            word_dict[word] = 0

    # One pass over the ham corpus; set(email) ensures each e-mail
    # contributes at most 1 to a word's count.
    for email in ham_email_list:
        for word in set(email):
            word_dict[word] += 1
    return word_dict

# Count, for every vocabulary word, how many ham e-mails contain it.
ham_w_dict = get_ham_dic(ham_email_list,spam_email_list)
print(ham_w_dict)



{'pharmacy': 0, 'announcement': 1, 'thing': 1, 'tickets': 1, 'reply': 2, 'stepp': 1, 'trip': 1, 'experts': 0, 'arolexbvlgari': 0, 'save': 0, 'expertise': 1, 'pills': 0, 'ferguson': 1, 'gpu': 1, 'design': 1, 'warranty': 0, 'thousand': 0, 'financial': 0, 'john': 1, 'microsoft': 0, 'httpwwwborderscomonlinestorestoredetailview': 1, 'mba': 1, 'approach': 1, 'china': 1, 'died': 1, 'analgesic': 0, 'famous': 0, 'httpdocsgooglecomsupportbinanswerpyhlenanswer': 1, 'regards': 1, 'station': 1, 'ups': 0, 'possible': 1, 'u': 1, 'fbi': 1, 'withoutprescription': 0, 'endorsed': 0, 'mathematician': 1, 'bad': 1, 'serial': 1, 'supporting': 1, 'programming': 1, 'm': 1, 'just': 2, 'assigning': 1, 'vuitton': 0, 'jqplot': 1, 'httpwwwgooglecomsupportsitesbinanswerpyhlenanswer': 1, 'jpgs': 1, 'ap': 0, 'mgmg': 0, 'blue': 1, 'strategy': 1, 'inspired': 1, 'art': 1, 'works': 0, 'x': 1, 'functionalities': 1, 'code': 1, 'guy': 1, 'extended': 1, 'uses': 1, 'supplement': 0, 'pain': 0, 'jewerly': 0, 'derivatives': 1, 't

In [6]:
def get_spam_dic(ham_email_list, spam_email_list):
    """Count, per vocabulary word, how many spam e-mails contain it.

    Args:
        ham_email_list: List of ham e-mails, each a list of words.
        spam_email_list: List of spam e-mails, each a list of words.

    Returns:
        A dict keyed by every distinct word appearing in either corpus. The
        value is the number of spam e-mails that contain the word at least
        once (repeats inside one e-mail count once). Words that occur only in
        ham map to 0.
    """
    # Seed the vocabulary from both corpora so ham-only words are present
    # with a count of zero.
    word_dict = {}
    for email in ham_email_list:
        for word in email:
            word_dict[word] = 0
    for email in spam_email_list:
        for word in email:
            word_dict[word] = 0

    # One pass over the spam corpus; set(email) ensures each e-mail
    # contributes at most 1 to a word's count.
    for email in spam_email_list:
        for word in set(email):
            word_dict[word] += 1
    return word_dict

# Count, for every vocabulary word, how many spam e-mails contain it.
spam_w_dict = get_spam_dic(ham_email_list,spam_email_list)
print(spam_w_dict)



{'pharmacy': 3, 'announcement': 0, 'thing': 0, 'tickets': 0, 'reply': 0, 'stepp': 0, 'trip': 0, 'experts': 1, 'arolexbvlgari': 3, 'save': 4, 'expertise': 0, 'pills': 4, 'ferguson': 0, 'gpu': 0, 'design': 0, 'warranty': 3, 'thousand': 1, 'financial': 1, 'john': 0, 'microsoft': 1, 'httpwwwborderscomonlinestorestoredetailview': 0, 'mba': 0, 'approach': 0, 'china': 0, 'died': 0, 'analgesic': 1, 'famous': 3, 'httpdocsgooglecomsupportbinanswerpyhlenanswer': 0, 'regards': 0, 'station': 0, 'ups': 3, 'possible': 0, 'u': 0, 'fbi': 0, 'withoutprescription': 1, 'endorsed': 5, 'mathematician': 0, 'bad': 0, 'serial': 0, 'supporting': 0, 'programming': 0, 'm': 0, 'just': 0, 'assigning': 0, 'vuitton': 3, 'jqplot': 0, 'httpwwwgooglecomsupportsitesbinanswerpyhlenanswer': 0, 'jpgs': 0, 'ap': 3, 'mgmg': 1, 'blue': 0, 'strategy': 0, 'inspired': 0, 'art': 0, 'works': 2, 'x': 2, 'functionalities': 0, 'code': 0, 'guy': 0, 'extended': 1, 'uses': 0, 'supplement': 1, 'pain': 2, 'jewerly': 3, 'derivatives': 0, 't

In [7]:
# Score a single e-mail against the ham (normal mail) word statistics.
def get_ham_rate(filename, ham_w_dict, ham_dir="data/ham", encoding="gbk"):
    """Compute the ham score of the e-mail stored in ``filename``.

    Args:
        filename: Path of the e-mail file to score.
        ham_w_dict: Mapping from word to a numeric count, as built by
            ``get_ham_dic``.
        ham_dir: Directory whose entry count is used as the number of ham
            training e-mails. Defaults to ``"data/ham"``.
        encoding: Text encoding used to read ``filename``. Defaults to
            ``"gbk"``, the same encoding ``preprocess`` uses for the
            training files.

    Returns:
        ``(1 + S) / (N + 2)`` as a float, where ``S`` is the sum of
        ``ham_w_dict`` values over the distinct words of the e-mail and ``N``
        is the number of entries in ``ham_dir``.

    NOTE(review): the original comment described the addition as a stand-in
    for multiplying probabilities in log space. The value returned here is a
    ratio of summed counts, not a log-probability, and it can exceed 1.
    The formula is left as it was so the score stays comparable with
    ``get_spam_rate``; confirm whether a true log-likelihood is wanted.
    """
    with open(filename, mode="r", encoding=encoding) as f:
        content = clear_content(f.read())
    # Each distinct word is considered once, however often it repeats.
    test_set = set(content)

    ham_email_num = len(os.listdir(ham_dir))

    # Direct dict lookup; words absent from the dictionary contribute 0.
    matched_total = sum(ham_w_dict.get(word, 0) for word in test_set)

    # Add-one smoothing: +1 in the numerator, +2 in the denominator.
    return (1 + matched_total) / (ham_email_num + 2)
 




In [8]:
# Score a single e-mail against the spam word statistics.
def get_spam_rate(filename, spam_w_dict, spam_dir="data/spam", encoding="gbk"):
    """Compute the spam score of the e-mail stored in ``filename``.

    Args:
        filename: Path of the e-mail file to score.
        spam_w_dict: Mapping from word to a numeric count, as built by
            ``get_spam_dic``.
        spam_dir: Directory whose entry count is used as the number of spam
            training e-mails. Defaults to ``"data/spam"``.
        encoding: Text encoding used to read ``filename``. Defaults to
            ``"gbk"``, the same encoding ``preprocess`` uses for the
            training files.

    Returns:
        ``(1 + S) / (N + 2)`` as a float, where ``S`` is the sum of
        ``spam_w_dict`` values over the distinct words of the e-mail and
        ``N`` is the number of entries in ``spam_dir``.

    NOTE(review): the original comment described the addition as a stand-in
    for multiplying probabilities in log space. The value returned here is a
    ratio of summed counts, not a log-probability, and it can exceed 1.
    The formula is left as it was so the score stays comparable with
    ``get_ham_rate``; confirm whether a true log-likelihood is wanted.
    """
    with open(filename, mode="r", encoding=encoding) as f:
        content = clear_content(f.read())
    # Each distinct word is considered once, however often it repeats.
    test_set = set(content)

    spam_email_num = len(os.listdir(spam_dir))

    # Direct dict lookup; words absent from the dictionary contribute 0.
    matched_total = sum(spam_w_dict.get(word, 0) for word in test_set)

    # Add-one smoothing: +1 in the numerator, +2 in the denominator.
    return (1 + matched_total) / (spam_email_num + 2)
 


In [None]:
def email_divide(folderpath):
    """Classify every file in ``folderpath`` as spam or normal mail and print the verdict.

    For each directory entry (in sorted file-name order) the file path is
    printed, followed by one line stating whether the spam score exceeded the
    ham score. Scores come from ``get_ham_rate`` / ``get_spam_rate`` using the
    notebook-level ``ham_w_dict`` and ``spam_w_dict``.

    Args:
        folderpath: Directory containing the e-mail files to classify.

    Returns:
        None. Results are written to standard output.
    """
    # The same log(1/2) term is added to both scores (equal class priors),
    # so it does not affect which score is larger.
    log_prior = math.log(1 / 2)

    # Sorted so the printed report is in a reproducible order.
    for filename in sorted(os.listdir(folderpath)):
        file_path = os.path.join(folderpath, filename)
        print(f"{file_path}")
        ham = get_ham_rate(file_path, ham_w_dict) + log_prior
        spam = get_spam_rate(file_path, spam_w_dict) + log_prior
        # Ties fall through to the "normal mail" branch.
        if spam > ham:
            print('p1>p2，所以是垃圾邮件.')
        else:
            print('p1<p2，所以是正常邮件.')
