In [1]:
from downloader import FileDownloader

In [2]:
DOWNLOAD_HAM = ["https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2",
                "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2",
                "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"]

DOWNLOAD_SPAM = ["https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
                 "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"]

In [3]:
# NB!!! This uses async calls, if encounter "RuntimeError: Session is closed" might need to restart the kernel
downloader = FileDownloader()

# NB!!! pycharm might give a warning about the await call, but it's fine
await downloader.download_from_urls(DOWNLOAD_HAM + DOWNLOAD_SPAM)

Downloading: 20030228_easy_ham.tar.bz2 from https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
Downloading: 20030228_easy_ham_2.tar.bz2 from https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2
Downloading: 20030228_hard_ham.tar.bz2 from https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2
Downloading: 20050311_spam_2.tar.bz2 from https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2
Downloading: 20030228_spam.tar.bz2 from https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
Downloaded:  20030228_hard_ham.tar.bz2
Downloaded: 20030228_hard_ham.tar.bz2
Extracting: 20030228_hard_ham.tar.bz2
Downloaded:  20030228_easy_ham_2.tar.bz2
Downloaded: 20030228_easy_ham_2.tar.bz2
Extracting: 20030228_easy_ham_2.tar.bz2
Downloaded:  20030228_spam.tar.bz2
Downloaded: 20030228_spam.tar.bz2
Extracting: 20030228_spam.tar.bz2
Downloaded:  20050311_spam_2.tar.bz2
Downloaded: 20050311_spam_2.tar.bz2
E

In [4]:
import os
import pandas as pd

# Get the current working directory
root_dir = os.getcwd()

# Create an empty DataFrame
data = pd.DataFrame(columns=['content', 'status'])

# Iterate over each folder in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Check if it is indeed a folder
    if os.path.isdir(folder_path):
        # Determine if the folder is "ham" or "spam" based on the folder name
        if 'ham' in folder_name:
            status = 'ham'
        elif 'spam' in folder_name:
            status = 'spam'
        else:
            continue  # Skip folders that do not match "ham" or "spam"
        
        # Iterate over each file in the folder
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            
            # Ensure it's a file before trying to read it
            if os.path.isfile(file_path):
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                    content = file.read()
                    # Create a DataFrame for the new row
                    new_row = pd.DataFrame({'content': [content], 'status': [status]})
                    # Concatenate the new row to the main DataFrame
                    data = pd.concat([data, new_row], ignore_index=True)

# Display the DataFrame
print(data)


                                                content status
0     From exmh-workers-admin@redhat.com  Thu Aug 22...    ham
1     From Steve_Burt@cursor-system.com  Thu Aug 22 ...    ham
2     From timc@2ubh.com  Thu Aug 22 13:52:59 2002\n...    ham
3     From irregulars-admin@tb.tf  Thu Aug 22 14:23:...    ham
4     From Stewart.Smith@ee.ed.ac.uk  Thu Aug 22 14:...    ham
...                                                 ...    ...
6046  From tba@insiq.us  Wed Dec  4 11:46:34 2002\nR...   spam
6047  Return-Path: <raye@yahoo.lv>\nReceived: from u...   spam
6048  From cweqx@dialix.oz.au  Tue Aug  6 11:03:54 2...   spam
6049  From ilug-admin@linux.ie  Wed Dec  4 11:52:36 ...   spam
6050  mv 00001.317e78fa8ee2f54cd4890fdc09ba8176 0000...   spam

[6051 rows x 2 columns]


In [5]:
from textprocessor import TextPreprocessor

In [6]:
text_processor = TextPreprocessor()
data['processed_content'] = data['content'].apply(text_processor.preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilyar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ilyar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ilyar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ilyar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
data['processed_content'][0]

'thu aug zzzz received localhost localhost postfix esmtp id zzzz localhost thu aug edt received phobos localhost imap zzzz localhost thu aug ist received esmtp id thu aug received postfix esmtp id thu aug edt received postfix esmtp id thu aug edt received mail localhost id thu aug received smtp id thu aug received smtp id thu aug received esmtp id thu aug ict received localhost esmtp id thu aug ict robert elz kre chris garrigues cc subject new sequence window reference sender precedence bulk mailto mailto http mailto discussion list exmh developer http mailto http date thu aug date wed aug chris garrigues ca reproduce error repeatable like every time without fail debug log pick happening exec pick ftp mercury exec pick ftp mercury hit marking hit tkerror syntax error expression int note run pick command hand delta pick ftp mercury hit hit come obviously version nmh using delta pick pick compiled sun mar ict relevant part delta mhparam pick sel since pick command work sequence actually 

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [9]:
X_train, X_test, y_train, y_test = train_test_split(data['processed_content'], data['status'], test_size=0.2,
                                                    random_state=42)

In [10]:
# Преобразование текстовых данных в векторы TF-IDF
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Обучение модели логистической регрессии
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_tfidf, y_train)

# Преобразование тестовых данных в векторы TF-IDF
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Предсказание меток для тестовых данных
y_pred = clf.predict(X_test_tfidf)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.98
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       863
        spam       0.98      0.94      0.96       348

    accuracy                           0.98      1211
   macro avg       0.98      0.97      0.97      1211
weighted avg       0.98      0.98      0.98      1211

[[858   5]
 [ 22 326]]
