Получение данных Enron Spam

In [None]:
import os
from os import walk

import pandas as pd

In [None]:
def load_enron_emails(root_dir):
    """Read every e-mail of the Enron-Spam corpus found under ``root_dir``.

    A directory is treated as a ham directory when any of its file names
    contains ``'ham'``; otherwise as a spam directory when any file name
    contains ``'spam'``. Directories matching neither are skipped.

    Args:
        root_dir: Path of the directory tree to walk.

    Returns:
        A ``(ham, spam)`` tuple of lists of message texts. Each list has
        exact duplicates removed and keeps first-seen order, so the result
        is identical from run to run.
    """
    ham_emails, spam_emails = [], []
    for root, dirs, files in walk(root_dir):
        # Sorting in place fixes the traversal order, which os.walk otherwise
        # inherits from the filesystem.
        dirs.sort()
        files = sorted(files)

        if any('ham' in name for name in files):
            target = ham_emails
        elif any('spam' in name for name in files):
            target = spam_emails
        else:
            continue

        for name in files:
            with open(os.path.join(root, name), encoding='latin1') as ip:
                # Lines are joined with a space, as in the original dataset build.
                target.append(" ".join(ip.readlines()))

    # dict.fromkeys de-duplicates while preserving insertion order; a set
    # would return the strings in a hash-randomized order that changes
    # between kernel restarts and silently changes the train/val split.
    return list(dict.fromkeys(ham_emails)), list(dict.fromkeys(spam_emails))


allHamData, allSpamData = load_enron_emails(r"ernon-spam/")

hamPlusSpamData = allHamData + allSpamData
# Label convention: 0 = ham, 1 = spam.
labels = [0]*len(allHamData) + [1]*len(allSpamData)

raw_enron = pd.DataFrame({"email": hamPlusSpamData, "label": labels})

Получение данных SpamAssassin

In [None]:
# Locations of the three SpamAssassin sub-corpora used below.
# Each value ends with a slash.
spam_path = '/spam_2/spam_2/'
hard_ham_path = '/hard_ham/hard_ham/'
easy_ham_path = '/easy_ham/easy_ham/'

def get_data(path):
    """Return the text of every regular file directly inside ``path``.

    Args:
        path: Directory to read; a trailing separator is optional.

    Returns:
        list[str]: File contents decoded as ISO-8859-1, ordered by file name
        so that the result does not depend on the filesystem's listing order.
    """
    data = []
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Sub-directories cannot be opened as files; skip them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path, encoding="ISO-8859-1") as f:
            data.append(f.read())
    return data

# Read the three sub-corpora (ham first, then spam).
easy_ham, hard_ham = get_data(easy_ham_path), get_data(hard_ham_path)
spam = get_data(spam_path)

# Ham messages come first so the label vector can be built by position.
ham = [*easy_ham, *hard_ham]
all_data = [*ham, *spam]

# Label convention: 0 = ham, 1 = spam.
labels_first = [0 for _ in ham] + [1 for _ in spam]

raw_assasin = pd.DataFrame({"email": all_data, "label": labels_first})

Объединение датасетов

In [None]:
# Stack both corpora into one frame with a fresh 0..n-1 index,
# then persist it without writing the index column.
res = pd.concat(
    objs=[raw_enron, raw_assasin],
    ignore_index=True,
)
res.to_csv(path_or_buf='spam_dataset.csv', index=False)

Обработка датасета

In [None]:
import pandas as pd

In [None]:
# Load the combined dataset from disk.
df = pd.read_csv(filepath_or_buffer='spam_dataset.csv')

In [None]:
# Keep only the first occurrence of every distinct e-mail text.
df = df[~df.duplicated(subset=['email'])]

In [None]:
# Class balance of the de-duplicated dataset.
df['label'].hist(bins=3)

In [None]:
# Features are the raw e-mail texts; the target is the label column.
X, y = df['email'], df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Получение векторов

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip
# runs whichever pip is first on the shell PATH.
%pip install simpletransformers

In [None]:
from simpletransformers.language_representation import RepresentationModel

In [None]:
# Convert both text Series to NumPy arrays before encoding.
train, val = (texts.to_numpy() for texts in (X_train, X_val))

In [None]:
import torch

# BERT encoder used to turn each e-mail into a fixed-size vector.
# Use the GPU only when one is present, so the notebook also runs on
# CPU-only machines instead of failing at model construction.
model = RepresentationModel(
    model_type="bert",
    model_name="google/bert_uncased_L-12_H-768_A-12",
    use_cuda=torch.cuda.is_available(),
)

In [None]:
# Encode the training texts first, then the validation texts,
# using the "mean" combine strategy for both.
train_data, val_data = (
    model.encode_sentences(texts, combine_strategy="mean")
    for texts in (train, val)
)

Модель ML

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip
# runs whichever pip is first on the shell PATH.
%pip install catboost

In [None]:
from catboost import CatBoostClassifier

# Gradient-boosting classifier: 500 iterations, tracked with the
# Precision metric, keeping the best iteration; the seed is fixed.
clf = CatBoostClassifier(
    random_seed=502,
    iterations=500,
    use_best_model=True,
    eval_metric='Precision',
)

In [None]:
# Fit on the training embeddings; the validation split is passed as eval_set.
train_labels = y_train.to_numpy()
clf.fit(
    X=train_data,
    y=train_labels,
    eval_set=(val_data, y_val.to_numpy()),
)

In [None]:
# Predicted class for every validation e-mail.
preds_class = clf.predict(val_data)

In [None]:
from sklearn.metrics import classification_report

# Compare validation labels with the predictions made above.
y_true, y_pred = y_val.to_numpy(), preds_class

print(classification_report(y_true, y_pred))

Результат


              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3712
           1       0.98      0.98      0.98      3223

    accuracy                           0.98      6935
   macro avg       0.98      0.98      0.98      6935
weighted avg       0.98      0.98      0.98      6935

Тестирование на данных из задачи

CSV-файл был получен тем же способом, что и датасет SpamAssassin (см. выше).

In [None]:
# Load the task's test set from disk.
df_test = pd.read_csv(filepath_or_buffer='test_spam.csv')

In [None]:
# Keep only the first occurrence of every distinct e-mail text.
df_test = df_test[~df_test.duplicated(subset=['email'])]

In [None]:
# Class balance of the de-duplicated test set.
df_test['label'].hist(bins=3)

In [None]:
# Take the features and labels from the held-out test frame (df_test).
# Reading them from `df` would evaluate the model on the very data it
# was trained and validated on.
X_test = df_test['email']
y_test = df_test['label']
test = X_test.to_numpy()

In [None]:
# Encode the test texts with the "mean" combine strategy.
test_data = model.encode_sentences(
    test,
    combine_strategy="mean",
)

In [None]:
# Predicted class for every test e-mail (overwrites the validation predictions).
preds_class = clf.predict(test_data)

In [None]:
from sklearn.metrics import classification_report

# Compare test labels with the predictions made above.
y_true, y_pred = y_test.to_numpy(), preds_class

print(classification_report(y_true, y_pred))

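Результат

Примечание: support = 34671 в отчёте ниже соответствует размеру всего датасета `df` (валидационная выборка 6935 ≈ 20 % от 34671). По-видимому, эти метрики посчитаны на обучающих данных, а не на `test_spam.csv`; отчёт нужно пересчитать на `df_test`.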


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18691
           1       1.00      1.00      1.00     15980

    accuracy                           1.00     34671
   macro avg       1.00      1.00      1.00     34671
weighted avg       1.00      1.00      1.00     34671


Сохранение модели

In [None]:
# Persist the trained classifier to the file 'catboost_model'.
clf.save_model(fname='catboost_model')