In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

1. Считывание данных и удаление пустых значений

In [5]:
df = pd.read_csv('data/eml_dataset.csv', sep=';')
df = df.dropna(subset=['Text', 'Mark'])

2. Векторизация данных

In [6]:
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['Text'])
y = df['Mark'].map({'spam': 1, 'not spam': 0})

3. Распределение данных на обучающую и тестовую выборку

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [8]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, test_size=0.20, random_state=23)

3. Функция для обучения моделей

In [9]:
def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model_name} performance:\n{classification_report(y_test, y_pred)}\n")

In [10]:
def train_and_evaluate2(model, model_name):
    model.fit(X2_train, y2_train)
    y_pred = model.predict(X2_test)
    print(f"{model_name} performance:\n{classification_report(y2_test, y_pred)}\n")

4. Настройка гиперпараметров

In [11]:
MLP_grid = {'hidden_layer_sizes': [(50, 50), (50, 100)],
            'solver': ['adam', 'lbfgs'],
            'alpha': [0.001, 0.0001],
            'learning_rate': ['constant', 'adaptive']
            }

In [None]:
LR_grid = {'solver': ['sag', 'saga', 'lbfgs'],
           'penalty': ['l2', None],
           'warm_start': [True, False]
           }

In [None]:
SVC_grid = {'C': [0.001, 0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf', 'linear', 'poly'],
            'decision_function_shape': ['ovo', 'ovr']
            }

In [19]:
NB_grid = {'alpha': [0.5, 1, 2, 3, 5, 10],
           'force_alpha': [True, False],
           'fit_prior': [True, False],
           'binarize': [1, 0.1, 0.01, 0.001, 0.0001],
           } 

5. Анализ обучения моделей с выборкой 75%/25% без настройки гиперпараметров

a) MLPClassifier

In [73]:
mlp = MLPClassifier()
train_and_evaluate(mlp, 'Neural Network (MLP)')

Neural Network (MLP) performance:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       144
           1       0.98      0.97      0.98       252

    accuracy                           0.97       396
   macro avg       0.96      0.97      0.97       396
weighted avg       0.97      0.97      0.97       396




b) Logistic Regression

In [78]:
lr = LogisticRegression()
train_and_evaluate(lr, 'Logistic Regression')

Logistic Regression performance:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93       144
           1       0.97      0.94      0.96       252

    accuracy                           0.94       396
   macro avg       0.94      0.95      0.94       396
weighted avg       0.95      0.94      0.94       396




c) Support Vector Machine (SVM)

In [79]:
svm = SVC()
train_and_evaluate(svm, 'SVM')

SVM performance:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       144
           1       0.99      0.91      0.95       252

    accuracy                           0.94       396
   macro avg       0.93      0.95      0.93       396
weighted avg       0.94      0.94      0.94       396




d) Bernoulli Naive Bayes

In [81]:
bnb = BernoulliNB()
train_and_evaluate(bnb, 'Bernoulli Naive Bayes')

Bernoulli Naive Bayes performance:
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       144
           1       1.00      0.87      0.93       252

    accuracy                           0.91       396
   macro avg       0.90      0.93      0.91       396
weighted avg       0.93      0.91      0.91       396




6. Анализ обучения моделей с выборкой 80%/20% без настройки гиперпараметров

a) MLPClassifier

In [15]:
mlp = MLPClassifier()
train_and_evaluate2(mlp, 'Neural Network (MLP)')

Neural Network (MLP) performance:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       119
           1       0.98      0.96      0.97       198

    accuracy                           0.97       317
   macro avg       0.96      0.97      0.97       317
weighted avg       0.97      0.97      0.97       317




b) Logistic Regression

In [16]:
lr = LogisticRegression()
train_and_evaluate2(lr, 'Logistic Regression')

Logistic Regression performance:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       119
           1       0.97      0.94      0.96       198

    accuracy                           0.95       317
   macro avg       0.94      0.95      0.95       317
weighted avg       0.95      0.95      0.95       317




c) Support Vector Machine (SVM)

In [17]:
svm = SVC()
train_and_evaluate2(svm, 'SVM')

SVM performance:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       119
           1       0.99      0.92      0.95       198

    accuracy                           0.94       317
   macro avg       0.93      0.95      0.94       317
weighted avg       0.95      0.94      0.94       317




d) Bernoulli Naive Bayes

In [18]:
bnb = BernoulliNB()
train_and_evaluate2(bnb, 'Bernoulli Naive Bayes')

Bernoulli Naive Bayes performance:
              precision    recall  f1-score   support

           0       0.81      0.99      0.89       119
           1       0.99      0.86      0.92       198

    accuracy                           0.91       317
   macro avg       0.90      0.93      0.91       317
weighted avg       0.92      0.91      0.91       317




7. Анализ обучения моделей с выборкой 75%/25% с настройкой гиперпараметров

a) MLPClassifier

In [13]:
mlp = GridSearchCV(MLPClassifier(early_stopping=True), MLP_grid, return_train_score=True)
train_and_evaluate(mlp, 'Neural Network (MLP)')
print(f'Best "MLP" parametres:{mlp.best_params_}')

Neural Network (MLP) performance:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       144
           1       0.97      0.97      0.97       252

    accuracy                           0.96       396
   macro avg       0.96      0.96      0.96       396
weighted avg       0.96      0.96      0.96       396


Best "MLP" parametres:{'alpha': 0.001, 'hidden_layer_sizes': (50, 100), 'learning_rate': 'constant', 'solver': 'lbfgs'}


b) Logistic Regression

In [None]:
lr = GridSearchCV(LogisticRegression(max_iter=2000), LR_grid, return_train_score=True)
train_and_evaluate(lr, 'Logistic Regression')
print(f'Best "Linear Regression" parametres:{lr.best_params_}')

Logistic Regression performance:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       144
           1       0.98      0.98      0.98       252

    accuracy                           0.98       396
   macro avg       0.98      0.98      0.98       396
weighted avg       0.98      0.98      0.98       396


Best "Linear Regression parametres: /n{'penalty': None, 'solver': 'saga', 'warm_start': True}


c) Support Vector Machine (SVM)

In [93]:
svm = GridSearchCV(SVC(), SVC_grid, return_train_score=True)
train_and_evaluate(svm, 'SVM')
print(f'Best "Support Vector Machine" parametres:{svm.best_params_}')

SVM performance:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       144
           1       0.99      0.98      0.98       252

    accuracy                           0.98       396
   macro avg       0.98      0.98      0.98       396
weighted avg       0.98      0.98      0.98       396


Best "Support Vector Machine" parametres:{'C': 100, 'decision_function_shape': 'ovo', 'gamma': 0.1, 'kernel': 'rbf'}


d) Bernoulli Naive Bayes

In [94]:
bnb = GridSearchCV(BernoulliNB(), NB_grid, return_train_score=True)
train_and_evaluate(bnb, 'Bernoulli Naive Bayes')
print(f'Best "Bernaulli Naive Bayes" parametres:{bnb.best_params_}')

Bernoulli Naive Bayes performance:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93       144
           1       0.97      0.96      0.96       252

    accuracy                           0.95       396
   macro avg       0.95      0.95      0.95       396
weighted avg       0.95      0.95      0.95       396


Best "Bernaulli Naive Bayes" parametres:{'alpha': 0.5, 'binarize': 0.1, 'fit_prior': True, 'force_alpha': True}


8. Анализ обучения моделей с выборкой 80%/20% с настройкой гиперпараметров

a) MLPClassifier

In [14]:
mlp = GridSearchCV(MLPClassifier(), MLP_grid, return_train_score=True)
train_and_evaluate2(mlp, 'Neural Network (MLP)')
print(f'Best "MLP" parametres:{mlp.best_params_}')

Neural Network (MLP) performance:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       119
           1       0.98      0.97      0.98       198

    accuracy                           0.97       317
   macro avg       0.97      0.97      0.97       317
weighted avg       0.97      0.97      0.97       317


Best "MLP" parametres:{'alpha': 0.001, 'hidden_layer_sizes': (50, 100), 'learning_rate': 'constant', 'solver': 'adam'}


b) Logistic Regression

In [95]:
lr = GridSearchCV(LogisticRegression(max_iter=2000), LR_grid, return_train_score=True)
train_and_evaluate2(lr, 'Logistic Regression')
print(f'Best "Linear Regression" parametres:{lr.best_params_}')

Logistic Regression performance:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       119
           1       0.99      0.98      0.98       198

    accuracy                           0.98       317
   macro avg       0.98      0.98      0.98       317
weighted avg       0.98      0.98      0.98       317


Best "Linear Regression" parametres:{'penalty': None, 'solver': 'saga', 'warm_start': False}


c) Support Vector Machine (SVM)

In [96]:
svm = GridSearchCV(SVC(), SVC_grid, return_train_score=True)
train_and_evaluate2(svm, 'SVM')
print(f'Best "Support Vector Machine" parametres:{svm.best_params_}')

SVM performance:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       119
           1       0.99      0.98      0.98       198

    accuracy                           0.98       317
   macro avg       0.98      0.98      0.98       317
weighted avg       0.98      0.98      0.98       317


Best "Support Vector Machine" parametres:{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'linear'}


d) Bernoulli Naive Bayes

In [97]:
bnb = GridSearchCV(BernoulliNB(), NB_grid, return_train_score=True)
train_and_evaluate2(bnb, 'Bernoulli Naive Bayes')
print(f'Best "Bernaulli Naive Bayes" parametres:{bnb.best_params_}')

Bernoulli Naive Bayes performance:
              precision    recall  f1-score   support

           0       0.85      0.99      0.91       119
           1       0.99      0.89      0.94       198

    accuracy                           0.93       317
   macro avg       0.92      0.94      0.93       317
weighted avg       0.94      0.93      0.93       317


Best "Bernaulli Naive Bayes" parametres:{'alpha': 2, 'binarize': 0.001, 'fit_prior': False, 'force_alpha': True}
