# Imports

In [9]:
import pandas as pd
import numpy as np
import os
import csv
import email
import re
# import stopwords
import nltk
from nltk.corpus import stopwords
import os
from email.message import EmailMessage

In [10]:
!pip install stopwords



In [11]:
# WordNet data used by the WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Resources needed by the feature-extraction cells further down:
# stopwords.words('english') reads the 'stopwords' corpus and
# nltk.word_tokenize needs the Punkt models. Without these downloads a fresh
# environment fails with LookupError.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt';
# downloading both keeps the notebook working across versions - confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gerba\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gerba\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Convert Plain Text Files To .eml

## Enron

### Ham

In [12]:
def convert_to_eml(root_folder, output_folder):
    """Wrap every extension-less text file under ``root_folder`` in an .eml file.

    The tree below ``root_folder`` is walked recursively. Each file whose name
    has no extension is read as UTF-8, placed as the content of a new
    ``EmailMessage`` and written to ``output_folder`` as ``<filename>.eml``.
    Files that are not valid UTF-8 are skipped. All results land directly in
    ``output_folder``, so two inputs with the same file name in different
    sub-folders overwrite each other.

    Args:
        root_folder: Directory containing the plain-text messages.
        output_folder: Directory receiving the .eml files; created if missing.
    """
    # open(..., 'wb') does not create missing directories, so make sure the
    # destination exists before the first write. An empty string means the
    # current directory and must not be passed to makedirs.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for root, dirs, files in os.walk(root_folder):
        for filename in files:
            # Process only files without an extension
            if os.path.splitext(filename)[1]:
                continue

            file_path = os.path.join(root, filename)
            try:
                # Read raw text content with UTF-8 encoding
                with open(file_path, 'r', encoding='utf-8') as file:
                    raw_message = file.read()
            except UnicodeDecodeError:
                # Skip files not in UTF-8 encoding
                continue

            # Create an EmailMessage object holding the text as its content
            eml = EmailMessage()
            eml.set_content(raw_message)

            # Save as EML file
            output_path = os.path.join(output_folder, f'{filename}.eml')
            with open(output_path, 'wb') as output_file:
                output_file.write(eml.as_bytes())

# Usage example
# Build the paths with os.path.join: a backslash inside a normal string literal
# starts an escape sequence ('\e' and '\h' are invalid escapes) and only acts
# as a path separator on Windows.
root_folder = os.path.join('data', 'enron', 'ham')
output_folder = os.path.join('data', 'enron_eml_ham')

convert_to_eml(root_folder, output_folder)

### Spam

In [13]:
def convert_to_eml_spam(root_folder, output_folder):
    """Wrap every ``.txt`` file under ``root_folder`` in an .eml file.

    The tree below ``root_folder`` is walked recursively. Each ``.txt`` file is
    read as UTF-8, used as the content of a new ``EmailMessage`` and saved in
    ``output_folder`` as ``<filename>.eml`` (the ``.txt`` suffix is kept, e.g.
    ``a.txt`` becomes ``a.txt.eml``). Files that are not valid UTF-8 are
    skipped.

    Args:
        root_folder: Directory containing the ``.txt`` messages.
        output_folder: Directory receiving the .eml files; created if missing.
    """
    # Writing into a folder that does not exist raises FileNotFoundError, so
    # create the destination first ('' means the current directory).
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for root, dirs, files in os.walk(root_folder):
        # Process only files with .txt extension
        text_files = [name for name in files if os.path.splitext(name)[1] == '.txt']

        for filename in text_files:
            try:
                # Read raw text content with UTF-8 encoding
                with open(os.path.join(root, filename), 'r', encoding='utf-8') as file:
                    raw_message = file.read()
            except UnicodeDecodeError:
                # Skip files not in UTF-8 encoding
                continue

            # Create an EmailMessage object holding the text as its content
            eml = EmailMessage()
            eml.set_content(raw_message)

            # Save as EML file
            output_path = os.path.join(output_folder, f'{filename}.eml')
            with open(output_path, 'wb') as output_file:
                output_file.write(eml.as_bytes())

# Usage example
# os.path.join avoids the invalid '\e' / '\s' escape sequences of the
# backslash-separated literals and works on every platform.
root_folder = os.path.join('data', 'enron', 'spam')
output_folder = os.path.join('data', 'enron_eml_spam')

convert_to_eml_spam(root_folder, output_folder)


# Feature Extraction

## Enron

### Ham

In [14]:
def extract_email_info(root_folder, output_csv, target=0, max_emails=None):
    """Extract text features from every .eml file under ``root_folder`` into a CSV.

    A header row is written first, then one row per email: a running 1-based
    index, the decoded text/plain body, five counts computed on the
    whitespace-split body, the number of distinct stemmed and distinct
    lemmatized tokens (NLTK tokenizer, stop words removed), the stemmed
    "cleaned" body and the class label.

    The column 'Ratio of Lowercase to Uppercase' actually holds the share of
    whitespace-separated words for which ``str.islower()`` is true, and
    'Number of Lemmatized Words' counts *distinct* lemmatized tokens. The
    header names are left unchanged so existing readers of the CSV keep working.

    Args:
        root_folder: Directory walked recursively for ``.eml`` files.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label written in the 'Target' column (0 for ham, 1 for spam).
        max_emails: Optional cap on the number of emails written; ``None``
            processes every file.
    """
    # Loop-invariant NLTK resources: build them once instead of once per email
    # (the stop-word list used to be re-read for every single word).
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Explicit UTF-8 so bodies containing non-ASCII characters can be written
    # regardless of the platform's default encoding.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Index', 'Message Body', 'Number of Words', 'Number of Stop Words', 'Number of Unique Words', 'Ratio of Lowercase to Uppercase', 'Number of Exclamation Points', 'Number of Unique Stemmed Words', 'Number of Lemmatized Words', 'Cleaned Body', 'Target'])

        index = 1

        for root, dirs, files in os.walk(root_folder):
            for filename in files:
                # Stop completely (not just for this directory) once the cap is hit
                if max_emails is not None and index > max_emails:
                    return

                # Process only files with .eml extension
                if os.path.splitext(filename)[1] != '.eml':
                    continue

                with open(os.path.join(root, filename), 'rb') as file:
                    msg = email.message_from_bytes(file.read())

                # Use the first text/plain part of a multipart message,
                # otherwise the message's own payload.
                payload = None
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == 'text/plain':
                            payload = part.get_payload(decode=True)
                            break
                else:
                    payload = msg.get_payload(decode=True)
                message_body = payload.decode('utf-8') if payload else ''

                words = message_body.split()
                word_count = len(words)
                lowercase_count = sum(1 for word in words if word.islower())

                features = {
                    'number_of_words': word_count,
                    'number_of_stop_words': sum(1 for word in words if word in stop_words),
                    'number_of_unique_words': len(set(words)),
                    # An empty body used to raise ZeroDivisionError here
                    'ratio_of_lowercase_to_uppercase': lowercase_count / word_count if word_count else 0.0,
                    'number_of_exclamation_points': message_body.count('!'),
                }

                # Tokenize the message body and remove stop words
                tokens = [token for token in nltk.word_tokenize(message_body) if token not in stop_words]

                # Distinct stems / lemmas of the remaining tokens
                features['number_of_unique_stemmed_words'] = len({stemmer.stem(token) for token in tokens})
                features['number_of_lemmatized_words'] = len({lemmatizer.lemmatize(token) for token in tokens})

                # The cleaned body is built from the whitespace-split words,
                # not from the NLTK tokens.
                cleaned_body = ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

                writer.writerow([index, message_body] + list(features.values()) + [cleaned_body] + [target])

                index += 1

if __name__ == '__main__':
    # os.path.join instead of backslash literals: '\e' is an invalid escape
    # sequence and the backslash is a separator on Windows only.
    # The spelling 'proccessed' is kept on purpose - other code may already
    # read the file under this name.
    root_folder = os.path.join('data', 'enron_eml_ham')
    output_csv = os.path.join('data', 'enron_proccessed.csv')

    extract_email_info(root_folder, output_csv)

### Spam

In [15]:
# Add up two hard-coded counts and show the total.
# NOTE(review): the meaning of the two numbers is not stated in this cell.
print(sum((1282, 1935)))

3217


In [31]:
def extract_email_info(root_folder, output_csv, target=1, max_emails=5614):
    """Extract text features from the spam .eml files under ``root_folder`` into a CSV.

    A header row is written first, then one row per email: a running 1-based
    index, the decoded text/plain body, five counts computed on the
    whitespace-split body, the number of distinct stemmed and distinct
    lemmatized tokens (NLTK tokenizer, stop words removed), the stemmed
    "cleaned" body and the class label. Processing stops after ``max_emails``
    emails.

    The column 'Ratio of Lowercase to Uppercase' actually holds the share of
    whitespace-separated words for which ``str.islower()`` is true, and
    'Number of Lemmatized Words' counts *distinct* lemmatized tokens. The
    header names are left unchanged so existing readers of the CSV keep working.

    Args:
        root_folder: Directory walked recursively for ``.eml`` files.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label written in the 'Target' column (0 for ham, 1 for spam).
        max_emails: Maximum number of emails to write (5614 by default);
            ``None`` removes the cap.
    """
    # Loop-invariant NLTK resources: build them once instead of once per email
    # (the stop-word list used to be re-read for every single word).
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Explicit UTF-8 so bodies containing non-ASCII characters can be written
    # regardless of the platform's default encoding.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Index', 'Message Body', 'Number of Words', 'Number of Stop Words', 'Number of Unique Words', 'Ratio of Lowercase to Uppercase', 'Number of Exclamation Points', 'Number of Unique Stemmed Words', 'Number of Lemmatized Words', 'Cleaned Body', 'Target'])

        index = 1
        spam_count = 0

        for root, dirs, files in os.walk(root_folder):
            for filename in files:
                # Leave the function, not just the inner loop: a plain `break`
                # let os.walk continue with the next directory, after which
                # the counter never matched the cap again.
                if max_emails is not None and spam_count >= max_emails:
                    return

                # Process only files with .eml extension
                if os.path.splitext(filename)[1] != '.eml':
                    continue

                with open(os.path.join(root, filename), 'rb') as file:
                    msg = email.message_from_bytes(file.read())

                # Use the first text/plain part of a multipart message,
                # otherwise the message's own payload.
                payload = None
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == 'text/plain':
                            payload = part.get_payload(decode=True)
                            break
                else:
                    payload = msg.get_payload(decode=True)
                message_body = payload.decode('utf-8') if payload else ''

                words = message_body.split()
                word_count = len(words)
                lowercase_count = sum(1 for word in words if word.islower())

                features = {
                    'number_of_words': word_count,
                    'number_of_stop_words': sum(1 for word in words if word in stop_words),
                    'number_of_unique_words': len(set(words)),
                    # An empty body used to raise ZeroDivisionError here
                    'ratio_of_lowercase_to_uppercase': lowercase_count / word_count if word_count else 0.0,
                    'number_of_exclamation_points': message_body.count('!'),
                }

                # Tokenize the message body and remove stop words
                tokens = [token for token in nltk.word_tokenize(message_body) if token not in stop_words]

                # Distinct stems / lemmas of the remaining tokens
                features['number_of_unique_stemmed_words'] = len({stemmer.stem(token) for token in tokens})
                features['number_of_lemmatized_words'] = len({lemmatizer.lemmatize(token) for token in tokens})

                # The cleaned body is built from the whitespace-split words,
                # not from the NLTK tokens.
                cleaned_body = ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

                writer.writerow([index, message_body] + list(features.values()) + [cleaned_body] + [target])

                index += 1
                spam_count += 1

if __name__ == '__main__':
    # os.path.join instead of backslash literals: '\e' is an invalid escape
    # sequence and the backslash is a separator on Windows only.
    # The spelling 'proccessed' is kept on purpose - other code may already
    # read the file under this name.
    root_folder = os.path.join('data', 'enron_eml_spam')
    output_csv = os.path.join('data', 'enron_proccessed_spam.csv')

    extract_email_info(root_folder, output_csv)

## Spam Assassin

### Ham

#### Ham 1

In [17]:
import os
import chardet

def get_encoding_counts(directory):
  """Tally the detected encoding of every entry in ``directory``.

  Returns a dict mapping the value returned by ``check_encoding`` to the
  number of entries that produced it, in order of first appearance.
  """
  tally = {}
  for entry in os.listdir(directory):
    detected = check_encoding(os.path.join(directory, entry))
    tally[detected] = tally.get(detected, 0) + 1
  return tally

def check_encoding(file_path):
    """Return the encoding chardet detects for the file at ``file_path``.

    The whole file is read as bytes and handed to ``chardet.detect``; the
    'encoding' entry of its result is returned (the outputs in this notebook
    show it can be ``None``).
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
        detection = chardet.detect(raw_bytes)
    return detection['encoding']

def main():
  """Print how many files of each detected encoding the easy_ham folder holds."""
  # os.path.join instead of 'data\spamassassin\easy_ham': '\s' and '\e' are
  # invalid escape sequences and the backslash is a separator on Windows only.
  directory = os.path.join('data', 'spamassassin', 'easy_ham')
  encoding_counts = get_encoding_counts(directory)
  for encoding, count in encoding_counts.items():
    print(f'{encoding}: {count}')

if __name__ == '__main__':
  main()


ascii: 2409
Windows-1252: 45
ISO-8859-1: 97


In [18]:
import os
import csv

def process_file(file_path):
  """Return the ASCII-decoded text of ``file_path`` without surrounding whitespace.

  Raises UnicodeDecodeError when the file contains non-ASCII bytes.
  """
  with open(file_path, 'r', encoding='ascii') as handle:
    return handle.read().strip()

def main():
  """Write the text of every ASCII-detected easy_ham file to a one-column CSV.

  Each file in the folder is passed to ``check_encoding`` (defined in another
  cell of this notebook); files reported as 'ascii' are read with
  ``process_file`` and written as one single-field row.
  """
  # os.path.join instead of backslash literals ('\s' and '\e' are invalid
  # escape sequences and only work as separators on Windows).
  directory = os.path.join('data', 'spamassassin', 'easy_ham')
  output_path = os.path.join('data', 'spamassassin_raw_1.csv')

  # The context manager guarantees the CSV is flushed and closed; the handle
  # used to be left open.
  with open(output_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for file in os.listdir(directory):
      file_path = os.path.join(directory, file)
      if check_encoding(file_path) == 'ascii':
        writer.writerow([process_file(file_path)])

if __name__ == '__main__':
  main()

In [29]:
from nltk.corpus import stopwords

def extract_features(input_csv, output_csv, target=0):
    """Compute text features for every message in ``input_csv`` and write them to ``output_csv``.

    The first field of each input row is taken as the message text. The output
    starts with a header row followed by one row per message: a running
    1-based index, the message text, five counts computed on the
    whitespace-split text, the number of distinct stemmed and distinct
    lemmatized tokens (NLTK tokenizer, stop words removed), the stemmed
    "cleaned" text and the class label. Messages without any words get a row
    of zero features and an empty cleaned text.

    The column 'Ratio of Lowercase to Uppercase' actually holds the share of
    whitespace-separated words for which ``str.islower()`` is true, and
    'Number of Lemmatized Words' counts *distinct* lemmatized tokens. The
    header names are left unchanged so existing readers of the CSV keep working.

    Args:
        input_csv: CSV whose first column holds the raw message text.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label written in the 'Target' column (0 for ham, 1 for spam).
    """
    # Loop-invariant NLTK resources: build them once instead of once per
    # message (the stop-word list used to be re-read for every single word).
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Distinct names for the two handles; both used to be called csv_file.
    with open(input_csv, 'r', newline='') as in_file, open(output_csv, 'w', newline='') as out_file:
        reader = csv.reader(in_file, delimiter=',')
        writer = csv.writer(out_file, delimiter=',')
        writer.writerow(['Index', 'Message Body', 'Number of Words', 'Number of Stop Words', 'Number of Unique Words', 'Ratio of Lowercase to Uppercase', 'Number of Exclamation Points', 'Number of Unique Stemmed Words', 'Number of Lemmatized Words', 'Cleaned Body', 'Target'])

        # enumerate numbers the rows; the index used to stay at 1 for every row
        for index, row in enumerate(reader, start=1):
            message_body = row[0] if row else ''
            words = message_body.split()

            if not words:
                # Nothing to analyse: write the all-zero feature row that was
                # previously prepared but never written.
                writer.writerow([index, message_body, 0, 0, 0, 0, 0, 0, 0, '', target])
                continue

            features = {
                'number_of_words': len(words),
                'number_of_stop_words': sum(1 for word in words if word in stop_words),
                'number_of_unique_words': len(set(words)),
                'ratio_of_lowercase_to_uppercase': sum(1 for word in words if word.islower()) / len(words),
                'number_of_exclamation_points': message_body.count('!'),
            }

            # Tokenize the message body and remove stop words
            tokens = [token for token in nltk.word_tokenize(message_body) if token not in stop_words]

            # Distinct stems / lemmas of the remaining tokens
            features['number_of_unique_stemmed_words'] = len({stemmer.stem(token) for token in tokens})
            features['number_of_lemmatized_words'] = len({lemmatizer.lemmatize(token) for token in tokens})

            # The cleaned body is built from the whitespace-split words,
            # not from the NLTK tokens.
            cleaned_body = ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

            writer.writerow([index, message_body] + list(features.values()) + [cleaned_body] + [target])


if __name__ == '__main__':
    # os.path.join instead of backslash literals: '\s' is an invalid escape
    # sequence and the backslash is a separator on Windows only.
    input_csv = os.path.join('data', 'spamassassin_raw_1.csv')
    output_csv = os.path.join('data', 'spamassassin_processed_1.csv')

    extract_features(input_csv, output_csv)


In [30]:
import os
import csv

def get_dimensions(csv_file):
    """Return ``(rows, fields)`` for a comma-separated file.

    ``rows`` is the number of records the csv reader yields and ``fields`` is
    the number of fields in the last record (0 for an empty file).
    """
    row_count = 0
    last_width = 0
    with open(csv_file, 'r') as handle:
        for record in csv.reader(handle, delimiter=','):
            row_count += 1
            last_width = len(record)
    return row_count, last_width

# Report the size of the first processed SpamAssassin ham file.
if __name__ == '__main__':
    csv_file = 'data/spamassassin_processed_1.csv'
    lines, columns = get_dimensions(csv_file)
    print(f'The file has {lines} lines and {columns} columns.')

The file has 2410 lines and 11 columns.


#### Ham 2

In [19]:
import os
import chardet

def get_encoding_counts(directory):
  """Count how often each detected encoding occurs among the entries of ``directory``.

  Every entry is passed to ``check_encoding``; the result is a dict from the
  returned encoding to its number of occurrences.
  """
  counts = {}
  for name in os.listdir(directory):
    label = check_encoding(os.path.join(directory, name))
    counts.setdefault(label, 0)
    counts[label] += 1
  return counts

def check_encoding(file_path):
    """Detect the encoding of ``file_path`` with chardet.

    Reads the complete file in binary mode and returns the 'encoding' value of
    ``chardet.detect`` (``None`` shows up among this notebook's outputs).
    """
    with open(file_path, 'rb') as stream:
        return chardet.detect(stream.read())['encoding']

def main():
  """Print how many files of each detected encoding the easy_ham_2 folder holds."""
  # os.path.join instead of 'data\spamassassin\easy_ham_2': '\s' and '\e' are
  # invalid escape sequences and the backslash is a separator on Windows only.
  directory = os.path.join('data', 'spamassassin', 'easy_ham_2')
  encoding_counts = get_encoding_counts(directory)
  for encoding, count in encoding_counts.items():
    print(f'{encoding}: {count}')

if __name__ == '__main__':
  main()


ascii: 1270
None: 120
utf-8: 11


In [20]:
# Total of the per-encoding file counts printed for easy_ham_2
# (ascii, None and utf-8).
print(sum((1270, 120, 11)))

1401


The code below writes every ASCII-encoded file to a .csv with one row per email and a single column containing the full, whitespace-stripped text of that email file.

In [21]:
# import os
# import csv

# def process_file(file_path):
#   with open(file_path, 'r', encoding='utf-8') as file:
#     content = file.read()
#   text = content.strip()
#   return text

# def main():
#   directory = 'data\spamassassin\easy_ham_2'
#   csv_file = open('data\spamassassin_raw_2.csv', 'w', newline='')
#   writer = csv.writer(csv_file, delimiter=',')
#   for file in os.listdir(directory):
#     text = process_file(os.path.join(directory, file))
#     writer.writerow([text])

# if __name__ == '__main__':
#   main()


In [22]:
import os
import csv

def process_file(file_path):
  """Read ``file_path`` as ASCII text and return it stripped of leading/trailing whitespace.

  A file containing non-ASCII bytes raises UnicodeDecodeError.
  """
  with open(file_path, 'r', encoding='ascii') as source:
    raw_text = source.read()
  return raw_text.strip()

def main():
  """Write the text of every ASCII-detected easy_ham_2 file to a one-column CSV.

  Each file in the folder is passed to ``check_encoding`` (defined in another
  cell of this notebook); files reported as 'ascii' are read with
  ``process_file`` and written as one single-field row. Files with any other
  detected encoding are left out.
  """
  # os.path.join instead of backslash literals ('\s' and '\e' are invalid
  # escape sequences and only work as separators on Windows).
  directory = os.path.join('data', 'spamassassin', 'easy_ham_2')
  output_path = os.path.join('data', 'spamassassin_raw_2.csv')

  # Open the CSV in a with-block so it is always flushed and closed; the
  # handle used to be left open.
  with open(output_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for file in os.listdir(directory):
      file_path = os.path.join(directory, file)
      if check_encoding(file_path) != 'ascii':
        continue
      writer.writerow([process_file(file_path)])

if __name__ == '__main__':
  main()


In [23]:
import os
import csv

def get_dimensions(csv_file):
    """Report the size of a comma-separated file.

    Returns a tuple of the number of records read by the csv module and the
    field count of the final record; an empty file gives ``(0, 0)``.
    """
    with open(csv_file, 'r') as source:
        rows_seen, width = 0, 0
        for rows_seen, fields in enumerate(csv.reader(source, delimiter=','), start=1):
            width = len(fields)
    return rows_seen, width

# Report the size of the raw easy_ham_2 CSV.
if __name__ == '__main__':
    csv_file = 'data/spamassassin_raw_2.csv'
    lines, columns = get_dimensions(csv_file)
    print(f'The file has {lines} lines and {columns} columns.')

The file has 1270 lines and 1 columns.


In [26]:
from nltk.corpus import stopwords

def extract_features(input_csv, output_csv, target=0):
    """Compute text features for every message in ``input_csv`` and write them to ``output_csv``.

    The first field of each input row is taken as the message text. The output
    starts with a header row followed by one row per message: a running
    1-based index, the message text, five counts computed on the
    whitespace-split text, the number of distinct stemmed and distinct
    lemmatized tokens (NLTK tokenizer, stop words removed), the stemmed
    "cleaned" text and the class label. Messages without any words get a row
    of zero features and an empty cleaned text.

    The column 'Ratio of Lowercase to Uppercase' actually holds the share of
    whitespace-separated words for which ``str.islower()`` is true, and
    'Number of Lemmatized Words' counts *distinct* lemmatized tokens. The
    header names are left unchanged so existing readers of the CSV keep working.

    Args:
        input_csv: CSV whose first column holds the raw message text.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label written in the 'Target' column (0 for ham, 1 for spam).
    """
    # Build the NLTK helpers once; the stop-word list used to be re-read for
    # every single word and the stemmer/lemmatizer re-created for every row.
    stop_words = set(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    header = ['Index', 'Message Body', 'Number of Words', 'Number of Stop Words', 'Number of Unique Words', 'Ratio of Lowercase to Uppercase', 'Number of Exclamation Points', 'Number of Unique Stemmed Words', 'Number of Lemmatized Words', 'Cleaned Body', 'Target']

    # Distinct names for the two handles; both used to be called csv_file.
    with open(input_csv, 'r', newline='') as in_file, open(output_csv, 'w', newline='') as out_file:
        reader = csv.reader(in_file, delimiter=',')
        writer = csv.writer(out_file, delimiter=',')
        writer.writerow(header)

        index = 0
        for row in reader:
            # Number every row; the index used to stay at 1 for all of them.
            index += 1
            message_body = row[0] if row else ''
            words = message_body.split()

            if not words:
                # Nothing to analyse: write the all-zero feature row that was
                # previously prepared but never written.
                writer.writerow([index, message_body, 0, 0, 0, 0, 0, 0, 0, '', target])
                continue

            features = {
                'number_of_words': len(words),
                'number_of_stop_words': sum(1 for word in words if word in stop_words),
                'number_of_unique_words': len(set(words)),
                'ratio_of_lowercase_to_uppercase': sum(1 for word in words if word.islower()) / len(words),
                'number_of_exclamation_points': message_body.count('!'),
            }

            # Tokenize the message body and remove stop words
            tokens = [token for token in nltk.word_tokenize(message_body) if token not in stop_words]

            # Distinct stems / lemmas of the remaining tokens
            features['number_of_unique_stemmed_words'] = len({stemmer.stem(token) for token in tokens})
            features['number_of_lemmatized_words'] = len({lemmatizer.lemmatize(token) for token in tokens})

            # The cleaned body is built from the whitespace-split words,
            # not from the NLTK tokens.
            cleaned_body = ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

            writer.writerow([index, message_body] + list(features.values()) + [cleaned_body] + [target])


if __name__ == '__main__':
    # os.path.join instead of backslash literals: '\s' is an invalid escape
    # sequence and the backslash is a separator on Windows only.
    input_csv = os.path.join('data', 'spamassassin_raw_2.csv')
    output_csv = os.path.join('data', 'spamassassin_processed_2.csv')

    extract_features(input_csv, output_csv)


In [28]:
import os
import csv

def get_dimensions(csv_file):
    """Measure a comma-separated file.

    Returns ``(lines, columns)`` where ``lines`` is the number of records the
    csv reader produces and ``columns`` the length of the last record. Both
    are 0 when the file is empty.
    """
    lines = columns = 0
    with open(csv_file, 'r') as stream:
        records = csv.reader(stream, delimiter=',')
        while True:
            record = next(records, None)
            if record is None:
                break
            lines = lines + 1
            columns = len(record)
    return lines, columns

# Report the size of the second processed SpamAssassin ham file.
if __name__ == '__main__':
    csv_file = 'data/spamassassin_processed_2.csv'
    lines, columns = get_dimensions(csv_file)
    print(f'The file has {lines} lines and {columns} columns.')

The file has 1271 lines and 11 columns.
