In [11]:
# 朴素贝叶斯分类器：处理邮件过滤

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import numpy as np

# Toy corpus for a naive Bayes spam filter. Each entry pairs an email body
# with its label (1 = spam, 0 = not spam). Text and label are kept side by
# side so that the labelling can be audited at a glance instead of being
# matched up by position across two separate sequences.
labeled_emails = [
    ("Hello, how are you?", 0),
    ("Hi there, I hope you are doing well", 0),
    ("Your email has been selected for a $1000 prize", 1),
    ("Congratulations, you have won a lottery", 1),
    ("Dear user, we are sending you this email to inform you about your prize", 1),
    # Prize-claim solicitation: labelled spam, consistent with the
    # near-identical "Contact us immediately to get your prize" entry.
    ("Please contact us to claim your $1000 prize", 1),
    ("I am sure you will enjoy this email", 0),
    ("Contact us immediately to get your prize", 1),
    ("Kind regards, [Company]", 0),
    ("Thank you for your email", 0),
    ("I am glad to hear that", 0),
    # Ordinary correspondence about a requested document: not spam.
    ("Please find attached the document you requested", 0),
]

# Split the pairs back into the two parallel sequences the estimator expects.
data = [text for text, _ in labeled_emails]
labels = np.array([label for _, label in labeled_emails])
print(f"data shape:{len(data)}")
print(f"labels shape:{len(labels)}")

# Bag-of-words encoding: learn the vocabulary from the training emails and
# turn each email into a sparse vector of token counts.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data)
print(X_vectorized)

# Multinomial naive Bayes is the variant intended for count features.
clf = MultinomialNB()

# Fit the classifier on the vectorized training emails.
clf.fit(X_vectorized, labels)

# Hand-written test emails. They go through transform() (not fit_transform())
# so they are encoded with the vocabulary learned from the training data;
# words that never appeared in training are simply ignored.
X_test = vectorizer.transform([
    "Hello, congratulations on your prize",
    "I am glad to offer you this special deal"
])

# Predict a label for each test email.
y_pred = clf.predict(X_test)

# Report the predictions (the printed message explains: 1 = spam, 0 = not spam).
print("预测的邮件类别（1为垃圾邮件，0为非垃圾邮件）:")
print(y_pred)

data shape:12
labels shape:12
  (0, 22)	1
  (0, 25)	1
  (0, 3)	1
  (0, 49)	1
  (1, 3)	1
  (1, 49)	1
  (1, 23)	1
  (1, 40)	1
  (1, 24)	1
  (1, 12)	1
  (1, 46)	1
  (2, 50)	1
  (2, 13)	1
  (2, 19)	1
  (2, 5)	1
  (2, 34)	1
  (2, 16)	1
  (2, 0)	1
  (2, 31)	1
  (3, 49)	1
  (3, 8)	1
  (3, 20)	1
  (3, 48)	1
  (3, 29)	1
  (4, 3)	1
  :	:
  (7, 42)	1
  (7, 9)	1
  (7, 43)	1
  (7, 26)	1
  (7, 17)	1
  (8, 28)	1
  (8, 32)	1
  (8, 7)	1
  (9, 49)	1
  (9, 50)	1
  (9, 13)	1
  (9, 16)	1
  (9, 37)	1
  (10, 42)	1
  (10, 2)	1
  (10, 18)	1
  (10, 21)	1
  (10, 38)	1
  (11, 49)	1
  (11, 30)	1
  (11, 15)	1
  (11, 4)	1
  (11, 39)	1
  (11, 11)	1
  (11, 33)	1
预测的邮件类别（1为垃圾邮件，0为非垃圾邮件）:
[1 0]
