$$\hat{S} = \underset{S}{\operatorname{arg\,max}}\; P(S|M), \qquad P(S|M) \propto P(S) \prod_{i=1}^{n} P(w_i | S), \quad w_i \in M$$
где $$P(S|M) \text{ - апостериорная вероятность класса } S \in \{\text{Спам}, \text{НеСпам}\} \text{ при условии сообщения } M,$$
$$ P(S) \text{ - вероятность класса } S,$$
$$ P(w_i | S) \text{ - вероятность признака } w_i \text{ при условии класса } S, $$
$$ M \text{- сообщение.} $$

# Импортирование библиотек и данных

In [18]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [2]:
# Load the spam data set and discard rows containing missing values in one
# chained expression; the trailing bare name renders the frame in the notebook.
data = pd.read_csv('/content/drive/MyDrive/data/spam.csv').dropna()
data

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0
...,...,...
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1


# Обработка данных


## Количество объектов разных классов

In [3]:
# Number of rows per class label (shows how imbalanced the classes are).
data.loc[:, 'label'].value_counts()

label
0    2500
1     499
Name: count, dtype: int64

## Анализ на дубликаты

In [4]:
# Count the rows that remain once exact duplicate rows are removed.
unique_rows_count = data.drop_duplicates().shape[0]
unique_rows_count

2872

В наборе сообщений присутствуют неуникальные экземпляры. Их лучше удалить, чтобы они не вызывали переобучения.

In [5]:
# Remove exact duplicate rows. The index is rebuilt as 0..n-1 because the
# earlier dropna() and this drop_duplicates() leave gaps in it; a contiguous
# index keeps label-based alignment with any frame built positionally from
# `data` (such as a fresh feature matrix) consistent.
data = data.drop_duplicates().reset_index(drop=True)
data.shape

(2872, 2)

## Нормирование (векторизация) данных

In [6]:
# Turn every e-mail into a bag-of-words count vector and wrap the dense
# matrix in a DataFrame whose columns are the vocabulary terms.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so test-set tokens influence the feature space — confirm this is
# acceptable for the experiment.
vectorizer = CountVectorizer()
X = pd.DataFrame(
    data=vectorizer.fit_transform(data['email']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
X

Unnamed: 0,__,___,____,_____,______,_______,________,_________,__________,______________,...,허락없이,헤어디자이너,현재,호황을,홈쇼핑의,확실한,활황을,훨씬,힘입어,ｉt的技
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Пополнение данных

Как было показано выше, классы несбалансированы в соотношении 5:1. Для устранения данной проблемы необходимо пополнение данных.

In [7]:
# Target vector: the class label of every message.
y = data.loc[:, 'label']

In [8]:
# Seed the over-sampler with the same seed as the split: SMOTE draws random
# neighbours when synthesising minority samples, so without a fixed
# random_state the resampled training set (and every metric computed from
# it) changes from run to run.
sm = SMOTE(random_state=37)

# Hold out 25% of the rows for evaluation; over-sampling is applied to the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=37)

In [9]:
# Over-sample the minority class of the training split with SMOTE.
X_tr, y_tr = sm.fit_resample(X_train, y_train)

# Keep the feature names so the resampled matrix can be re-labelled later.
columns = X_train.columns

In [10]:
# Resampled training target and feature frame (columns restored to the
# original vocabulary terms).
y_new_train = y_tr
X_new_train = pd.DataFrame(X_tr, columns=columns)

# Report the class balance and the shapes after over-sampling.
print("Соотношение классов:", y_new_train.value_counts())
print("Форма данных, целевой признак:", y_new_train.shape, "матрица признаков:", X_new_train.shape)

Соотношение классов: label
0    1848
1    1848
Name: count, dtype: int64
Форма данных, целевой признак: (3696,) матрица признаков: (3696, 34116)


Теперь данные нормированы, классы сбалансированы и дубликаты устранены.

# Реализация Наивного Байесовского классификатора

In [24]:
class NaïveBayesClassifier2:
    """Multinomial Naive Bayes classifier for non-negative count features.

    After ``fit`` the instance exposes:

    * ``classes_`` - sorted array of the distinct labels seen in ``y``;
    * ``classes_priors`` - prior probability of each class (same order);
    * ``likelihood_matrix`` - array of shape (n_classes, n_features) holding
      the *log* of the smoothed per-class feature probabilities.
    """

    def _get_freequency_matrix(self, X_train_, y_train_, num_of_classes):
        """Return per-class feature totals, shape (num_of_classes, n_features).

        ``y_train_`` must contain class indices in ``range(num_of_classes)``;
        ``fit`` encodes arbitrary labels into such indices before calling.
        """
        return np.array(
            [np.sum(X_train_[y_train_ == c], axis=0) for c in range(num_of_classes)]
        )

    def _get_likelihood_matrix(self, X, y, num_of_classes, alpha=1) -> np.ndarray:
        """Return log P(feature | class) with additive (Laplace/Lidstone) smoothing.

        ``alpha`` is added to every per-class feature total before the rows
        are normalised to sum to one.
        """
        frequency_matrix = self._get_freequency_matrix(X, y, num_of_classes)
        # Not in-place: integer counts combined with a float alpha must be
        # promoted to float instead of raising a casting error.
        smoothed = frequency_matrix + alpha
        return np.log(smoothed / smoothed.sum(axis=1, keepdims=True))

    def fit(self, X: np.ndarray, y: np.ndarray, alpha: float = 1):
        """Estimate class priors and smoothed log-likelihoods.

        Parameters
        ----------
        X : array of shape (n_samples, n_features) with feature counts.
        y : array of shape (n_samples,) with class labels (any sortable values).
        alpha : additive smoothing strength. ``0`` disables smoothing, which
            yields ``-inf`` log-likelihoods for features unseen in a class.

        Raises
        ------
        ValueError
            If ``alpha`` is negative.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")

        X = np.asarray(X)
        y = np.asarray(y).ravel()

        # Encode labels as 0..k-1 so the frequency rows line up with
        # ``classes_`` whatever the original label values are.
        self.classes_, class_indices, classes_counts = np.unique(
            y, return_inverse=True, return_counts=True
        )
        class_indices = class_indices.reshape(-1)

        self.classes_priors = classes_counts / len(y)
        self.likelihood_matrix = self._get_likelihood_matrix(
            X, class_indices, len(self.classes_), alpha
        )

    def _predict_single(self, _X_test_email: np.ndarray):
        """Return the most probable class label for a single feature vector."""
        # log posterior (up to a constant) = log prior + sum_i count_i * log P(w_i | class)
        log_posteriors = np.log(self.classes_priors) + (
            self.likelihood_matrix @ np.asarray(_X_test_email)
        )
        return self.classes_[np.argmax(log_posteriors)]

    def predict(self, _X_test: np.ndarray):
        """Return a list with the predicted class label for every row of ``_X_test``."""
        features = np.asarray(_X_test)
        if features.size == 0:
            return []

        # One matrix product scores every (sample, class) pair at once.
        log_posteriors = features @ self.likelihood_matrix.T + np.log(self.classes_priors)
        return list(self.classes_[np.argmax(log_posteriors, axis=1)])

In [25]:
nbc = NaïveBayesClassifier2()
# Train the hand-written classifier on the balanced training set and
# predict labels for the untouched test split.
nbc.fit(X=X_new_train.to_numpy(), y=y_new_train.to_numpy())
predictions = nbc.predict(X_test.to_numpy())

# Evaluate all four quality metrics against the true test labels.
accuracy, precision, recall, f1 = (
    metric(y_test, predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.9888579387186629
Precision: 0.9747899159663865
Recall: 0.9586776859504132
F1-Score: 0.9666666666666667


# NBA из sklearn

In [26]:
# Reference model: scikit-learn's MultinomialNB trained on the same balanced
# data (fit returns the estimator itself, so construction and fitting chain).
nb_classifier = MultinomialNB().fit(X_new_train, y_new_train)
y_pred = nb_classifier.predict(X_test)

# Evaluate the same four metrics for a like-for-like comparison.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Accuracy: 0.9888579387186629
Precision: 0.9747899159663865
Recall: 0.9586776859504132
F1-Score: 0.9666666666666667
