# Импортирование библиотек и данных

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Location of the spam dataset CSV (presumably a Colab-mounted Google Drive).
DATA_PATH = '/content/drive/MyDrive/data/spam.csv'

# Drop rows with missing values, then renumber the remaining rows 0..n-1.
# Without the reset, dropna() leaves gaps in the index, while the feature
# frame X built from `data` later gets a fresh 0..n-1 index -- so any
# label-based alignment between the two would silently mismatch rows.
data = pd.read_csv(DATA_PATH).dropna().reset_index(drop=True)
data

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


In [3]:
# Class balance: how many emails carry each label value.
data.loc[:, 'label'].value_counts()

label
0    2500
1     499
Name: count, dtype: int64

# Векторизация данных

In [4]:
# Bag-of-words encoding of the email texts: one row per email, one column per
# vocabulary token, each cell holding that token's count in the email.
vectorizer = CountVectorizer()
token_counts = vectorizer.fit_transform(data['email'])

# Densify the matrix returned by the vectorizer and label the columns with
# the learned tokens.
X = pd.DataFrame(
  data=token_counts.toarray(),
  columns=vectorizer.get_feature_names_out(),
)
X

Unnamed: 0,__,___,____,_____,______,_______,________,_________,__________,______________,...,허락없이,헤어디자이너,현재,호황을,홈쇼핑의,확실한,활황을,훨씬,힘입어,ｉt的技
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Number of rows left in `data` once exact duplicate rows are removed.
unique_rows_count = data.drop_duplicates().shape[0]
unique_rows_count

2872

In [6]:
# Frequencies of each label value; label 1 is the class counted as spam.
counts = data['label'].value_counts()

EMAILS = len(data)             # total number of emails (rows of `data`)
SPAM_COUNT = counts.loc[1]     # number of emails labelled 1
P_SPAM = SPAM_COUNT / EMAILS   # prior p(spam)

# Составление таблиц частот и правдоподобия

In [7]:
def P_x(x):
  """Empirical p(x): share of emails whose feature row equals ``x`` exactly.

  ``x`` is compared element-wise against every row of the global feature
  frame ``X``; a row counts only if all of its entries match. The number of
  matching rows is divided by the global ``EMAILS``.
  """
  row_matches = np.all(X.values == x, axis=1)
  return row_matches.sum() / EMAILS

In [8]:
def P_x_spam(x):
  """Empirical p(x | spam): share of spam emails whose feature row equals ``x``.

  Counts the rows of the global ``X`` that match ``x`` in every entry and
  whose label in ``data['label']`` is 1, then divides by the global
  ``SPAM_COUNT``. Rows of ``X`` and ``data`` are paired by position.
  """
  row_matches = np.all(X.values == x, axis=1)
  is_spam = data['label'].values == 1
  return (row_matches & is_spam).sum() / SPAM_COUNT

In [9]:
def NBA(x):
  """Posterior p(spam | x) via Bayes' rule on the empirical frequencies.

      p(spam | x) = p(x | spam) * p(spam) / p(x)

  Parameters
  ----------
  x : array-like
      Feature vector; passed unchanged to ``P_x`` and ``P_x_spam``.

  Returns
  -------
  float
      The posterior probability, or ``nan`` when ``P_x(x)`` is zero, i.e.
      when no row of ``X`` equals ``x`` and the posterior is undefined.
  """
  evidence = P_x(x)
  if evidence == 0:
    # No row matches x, so the numerator is zero as well and the ratio is
    # 0/0. Return nan explicitly rather than letting the division produce
    # it together with a NumPy RuntimeWarning.
    return np.nan
  return P_x_spam(x) * P_SPAM / evidence

In [12]:
# Posterior spam probability for the email stored at row position 2751 of X.
sample_vector = X.iloc[2751].values
NBA(sample_vector)

1.0

# Проверка

In [13]:
# Cross-check with scikit-learn: train a multinomial naive Bayes model on 80%
# of the emails and score it on the held-out 20%.
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_label, metric_value in (
  ("Precision:", precision),
  ("Recall:", recall),
  ("F1 Score:", f1),
):
  print(metric_label, metric_value)

Precision: 1.0
Recall: 0.97
F1 Score: 0.9847715736040609
