# 垃圾邮件识别
文本处理：
1. 词法分析：将文本切分为单词
2. 词向量化：将单词映射为向量
3. 模型训练：使用向量训练模型
4. 模型预测：使用模型对未知文本进行预测

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics import accuracy_score
from sklearn import naive_bayes as nb
from sklearn.model_selection import train_test_split
import scipy
from scipy import io

In [52]:
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    vocabList = list(vocabSet)
    vocabList = sorted(vocabList)
    return vocabList

测试

In [53]:
dataSet = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
           ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
           ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
           ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
           ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
           ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
vocabList = createVocabList(dataSet)
print(vocabList)

['I', 'ate', 'buying', 'cute', 'dalmation', 'dog', 'flea', 'food', 'garbage', 'has', 'help', 'him', 'how', 'is', 'licks', 'love', 'maybe', 'mr', 'my', 'not', 'park', 'please', 'posting', 'problems', 'quit', 'so', 'steak', 'stop', 'stupid', 'take', 'to', 'worthless']


词集模型
1. 创建一个长度为单词数量，值为0的向量
2. 遍历单词列表，将单词列表中的单词作为索引，将向量的对应位置的值设为1
3. 返回向量

In [54]:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [55]:
print(setOfWords2Vec(vocabList, dataSet[0]))

[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


# 词袋模型
1. 创建一个长度为单词数量，值为0的向量
2.


In [56]:
def bag0forWords2Vec(vocabList, inputSet):  # 词袋模型函数
    """
    将输入文本转换为词袋向量表示。

    参数:
    vocabList (list): 已构建的词汇表列表
    inputSet (set or list): 输入的文本内容（分词后的单词集合或列表）

    返回:
    list: 与词汇表长度一致的向量，表示输入文本中每个词的出现次数
    """
    # 初始化一个全为0的向量，长度等于词汇表长度
    returnVec = [0] * len(vocabList)

    # 遍历输入文本中的每个单词
    for word in inputSet:
        # 如果单词在词汇表中存在
        if word in vocabList:
            # 在对应位置增加计数（这里是简单+1，即词袋模型）
            returnVec[vocabList.index(word)] += 1
        else:
            # 如果单词不在词汇表中，打印警告信息
            print("the word: %s is not in my Vocabulary!" % word)

    # 返回最终的词袋向量表示
    return returnVec


In [57]:
print(vocabList)
print(bag0forWords2Vec(vocabList, dataSet[0]))

['I', 'ate', 'buying', 'cute', 'dalmation', 'dog', 'flea', 'food', 'garbage', 'has', 'help', 'him', 'how', 'is', 'licks', 'love', 'maybe', 'mr', 'my', 'not', 'park', 'please', 'posting', 'problems', 'quit', 'so', 'steak', 'stop', 'stupid', 'take', 'to', 'worthless']
[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]


In [1]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
textParse('I love You')

[]

In [59]:
def loaddata():
    docList = []
    classList = []

    num = 26
    for i in range(1, num):
        wordList = textParse(open('data/email/ham/%d.txt' % i, encoding='latin-1').read())

        docList.append(wordList)
        classList.append(1)

        wordList = textParse(open('data/email/spam/%d.txt' % i, encoding='latin-1').read())

        docList.append(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)

    X = []
    for docIndex in range(len(docList)):
        X.append(bag0forWords2Vec(vocabList, docList[docIndex]))
        # X.append(setOfWords2Vec(vocabList, docList[docIndex]))

    return X, classList, vocabList

In [60]:
X,y,vocaList = loaddata()
print(len(X), len(y))
print(len(vocaList))

50 50
0


In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = nb.MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))

ValueError: Found array with 0 feature(s) (shape=(40, 0)) while a minimum of 1 is required by MultinomialNB.