In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

1. Считывание данных и удаление пустых значений

In [2]:
df = pd.read_csv('data/eml_dataset.csv', sep=';')
df = df.dropna(subset=['Text', 'Mark'])

2. Векторизация данных

In [3]:
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['Text'])
y = df['Mark'].map({'spam': 1, 'not spam': 0})

3. Распределение на обучающую и тестовую выборки

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)

4. Функция для обучения моделей

In [None]:
def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model_name} Performance:\n{classification_report(y_test, y_pred)}\n")

a) MLPClassifier

In [6]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
train_and_evaluate(mlp, 'Neural Network (MLP)')

Neural Network (MLP) Performance:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       140
           1       0.97      0.98      0.97       256

    accuracy                           0.97       396
   macro avg       0.97      0.96      0.96       396
weighted avg       0.97      0.97      0.97       396




b) Logistic Regression

In [7]:
lr = LogisticRegression()
train_and_evaluate(lr, 'Logistic Regression')

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       140
           1       0.97      0.98      0.97       256

    accuracy                           0.96       396
   macro avg       0.96      0.96      0.96       396
weighted avg       0.96      0.96      0.96       396




c) Support Vector Machine (SVM)

In [8]:
svm = SVC()
train_and_evaluate(svm, 'SVM')

SVM Performance:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       140
           1       0.99      0.96      0.98       256

    accuracy                           0.97       396
   macro avg       0.96      0.97      0.97       396
weighted avg       0.97      0.97      0.97       396




d) Bernoulli Naive Bayes

In [9]:
bnb = BernoulliNB()
train_and_evaluate(bnb, 'Bernoulli Naive Bayes')

Bernoulli Naive Bayes Performance:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90       140
           1       0.99      0.88      0.93       256

    accuracy                           0.92       396
   macro avg       0.91      0.93      0.91       396
weighted avg       0.93      0.92      0.92       396


