# Imports

In [5]:
import pandas as pd
import numpy as np
import os
import csv
import email
import re
import stopwords
import nltk
from nltk.corpus import stopwords

import os
from email.message import EmailMessage

# Convert Plain Text Files To .eml

## Ham

In [12]:
def convert_to_eml(root_folder, output_folder):
    """Wrap every extension-less text file under ``root_folder`` in an ``.eml`` file.

    The tree below ``root_folder`` is walked recursively. Each file whose name
    has no extension is read as UTF-8 text, set as the content of a fresh
    ``EmailMessage`` and written to ``output_folder`` as ``<filename>.eml``.

    Args:
        root_folder: Directory that is searched recursively for input files.
        output_folder: Directory that receives the ``.eml`` files. It is
            created (including parents) when it does not exist yet.

    Notes:
        * Files that are not valid UTF-8 are skipped.
        * Output names are derived from the file name only, so two inputs
          with the same name in different sub-folders end up in the same
          output file (the one visited last wins).
    """
    # Create the destination up front; otherwise the first write fails with
    # FileNotFoundError on a fresh checkout. An empty string means "current
    # directory" for os.path.join, so there is nothing to create in that case.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(root_folder):
        for filename in files:
            # Process only files without an extension
            if os.path.splitext(filename)[1]:
                continue

            file_path = os.path.join(root, filename)
            try:
                # Read raw text content with UTF-8 encoding
                with open(file_path, 'r', encoding='utf-8') as file:
                    raw_message = file.read()
            except UnicodeDecodeError:
                # Skip files not in UTF-8 encoding
                continue

            # Create an EmailMessage object holding the text as its content
            eml = EmailMessage()
            eml.set_content(raw_message)

            # Save as EML file
            output_path = os.path.join(output_folder, f'{filename}.eml')
            with open(output_path, 'wb') as output_file:
                output_file.write(eml.as_bytes())

# Usage example
# The paths are assembled with os.path.join instead of backslash literals:
# sequences such as "\e" and "\h" are invalid string escapes (Python warns
# about them) and a hard-coded backslash only works as a separator on Windows.
root_folder = os.path.join('data', 'enron', 'ham')
output_folder = os.path.join('data', 'enron_eml_ham')

convert_to_eml(root_folder, output_folder)

## Spam

In [15]:
def convert_to_eml_spam(root_folder, output_folder):
    """Wrap every ``.txt`` file under ``root_folder`` in an ``.eml`` file.

    The tree below ``root_folder`` is walked recursively. Each file whose
    extension is exactly ``.txt`` is read as UTF-8 text, set as the content of
    a fresh ``EmailMessage`` and written to ``output_folder`` as
    ``<filename>.eml`` (the ``.txt`` suffix is kept, e.g. ``a.txt.eml``).

    Args:
        root_folder: Directory that is searched recursively for input files.
        output_folder: Directory that receives the ``.eml`` files. It is
            created (including parents) when it does not exist yet.

    Notes:
        * Files that are not valid UTF-8 are skipped.
        * Output names are derived from the file name only, so two inputs
          with the same name in different sub-folders end up in the same
          output file (the one visited last wins).
    """
    # Create the destination up front; otherwise the first write fails with
    # FileNotFoundError on a fresh checkout. An empty string means "current
    # directory" for os.path.join, so there is nothing to create in that case.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for root, _dirs, files in os.walk(root_folder):
        for filename in files:
            # Process only files with .txt extension
            if os.path.splitext(filename)[1] != '.txt':
                continue

            file_path = os.path.join(root, filename)
            try:
                # Read raw text content with UTF-8 encoding
                with open(file_path, 'r', encoding='utf-8') as file:
                    raw_message = file.read()
            except UnicodeDecodeError:
                # Skip files not in UTF-8 encoding
                continue

            # Create an EmailMessage object holding the text as its content
            eml = EmailMessage()
            eml.set_content(raw_message)

            # Save as EML file
            output_path = os.path.join(output_folder, f'{filename}.eml')
            with open(output_path, 'wb') as output_file:
                output_file.write(eml.as_bytes())

# Usage example
# The paths are assembled with os.path.join instead of backslash literals:
# sequences such as "\e" and "\s" are invalid string escapes (Python warns
# about them) and a hard-coded backslash only works as a separator on Windows.
root_folder = os.path.join('data', 'enron', 'spam')
output_folder = os.path.join('data', 'enron_eml_spam')

convert_to_eml_spam(root_folder, output_folder)


# Feature Extraction

## Ham

In [9]:
def extract_email_info(root_folder, output_csv, *, target=0):
    """Write one CSV row of text features per ``.eml`` file under ``root_folder``.

    The tree below ``root_folder`` is walked recursively. For every file with
    the ``.eml`` extension the plain-text body is extracted (the first
    ``text/plain`` part of a multipart message, otherwise the message payload)
    and a row is written with a running 1-based index, the body, the feature
    values and the class label.

    Args:
        root_folder: Directory that is searched recursively for ``.eml`` files.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label stored in the ``Target`` column of every row
            (0 for ham, 1 for spam). Defaults to 0.

    Notes:
        * Words are the whitespace-separated tokens of the body.
        * The "Ratio of Lowercase to Uppercase" column holds the share of
          words for which ``str.islower()`` is true; it is 0.0 for a body
          without any words.
    """
    header = [
        'Index',
        'Message Body',
        'Number of Words',
        'Number of Stop Words',
        'Number of Unique Words',
        'Ratio of Lowercase to Uppercase',
        'Number of Exclamation Points',
        'Target',
    ]

    # Loaded on the first message and then reused: looking the list up once
    # per word (as a list) made every membership test a fresh corpus read
    # plus a linear scan.
    stop_words = None
    index = 1

    # Explicit UTF-8: bodies are decoded as UTF-8 and may contain characters
    # that the platform default encoding cannot represent.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        for root, _dirs, files in os.walk(root_folder):
            for filename in files:
                # Process only files with .eml extension
                if os.path.splitext(filename)[1] != '.eml':
                    continue

                with open(os.path.join(root, filename), 'rb') as file:
                    msg = email.message_from_bytes(file.read())

                message_body = ''
                if msg.is_multipart():
                    # Use the first text/plain part; stays '' if there is none
                    for part in msg.walk():
                        if part.get_content_type() == 'text/plain':
                            message_body = part.get_payload(decode=True).decode('utf-8')
                            break
                else:
                    message_body = msg.get_payload(decode=True).decode('utf-8')

                if stop_words is None:
                    stop_words = frozenset(stopwords.words('english'))

                words = message_body.split()
                lowercase_count = sum(1 for word in words if word.islower())
                features = [
                    len(words),
                    sum(1 for word in words if word in stop_words),
                    len(set(words)),
                    # Guard against bodies without words (previously a
                    # ZeroDivisionError that aborted the whole run)
                    lowercase_count / len(words) if words else 0.0,
                    message_body.count('!'),
                ]

                writer.writerow([index, message_body, *features, target])
                index += 1

if __name__ == '__main__':
    # Paths are assembled with os.path.join instead of backslash literals:
    # "\e" is an invalid string escape (Python warns about it) and a
    # hard-coded backslash only works as a separator on Windows.
    # NOTE(review): the ham conversion step writes to data/enron_eml_ham,
    # while this reads data/enron_eml - confirm which folder is intended.
    root_folder = os.path.join('data', 'enron_eml')
    output_csv = os.path.join('data', 'enron_proccessed.csv')

    extract_email_info(root_folder, output_csv)

## Spam

In [16]:
def extract_email_info(root_folder, output_csv, *, target=1):
    """Write one CSV row of text features per ``.eml`` file under ``root_folder``.

    The tree below ``root_folder`` is walked recursively. For every file with
    the ``.eml`` extension the plain-text body is extracted (the first
    ``text/plain`` part of a multipart message, otherwise the message payload)
    and a row is written with a running 1-based index, the body, the feature
    values and the class label.

    Args:
        root_folder: Directory that is searched recursively for ``.eml`` files.
        output_csv: Path of the CSV file to create (overwritten if present).
        target: Label stored in the ``Target`` column of every row
            (0 for ham, 1 for spam). Defaults to 1.

    Notes:
        * Words are the whitespace-separated tokens of the body.
        * The "Ratio of Lowercase to Uppercase" column holds the share of
          words for which ``str.islower()`` is true; it is 0.0 for a body
          without any words.
    """
    header = [
        'Index',
        'Message Body',
        'Number of Words',
        'Number of Stop Words',
        'Number of Unique Words',
        'Ratio of Lowercase to Uppercase',
        'Number of Exclamation Points',
        'Target',
    ]

    # Loaded on the first message and then reused: looking the list up once
    # per word (as a list) made every membership test a fresh corpus read
    # plus a linear scan.
    stop_words = None
    index = 1

    # Explicit UTF-8: bodies are decoded as UTF-8 and may contain characters
    # that the platform default encoding cannot represent.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        for root, _dirs, files in os.walk(root_folder):
            for filename in files:
                # Process only files with .eml extension
                if os.path.splitext(filename)[1] != '.eml':
                    continue

                with open(os.path.join(root, filename), 'rb') as file:
                    msg = email.message_from_bytes(file.read())

                message_body = ''
                if msg.is_multipart():
                    # Use the first text/plain part; stays '' if there is none
                    for part in msg.walk():
                        if part.get_content_type() == 'text/plain':
                            message_body = part.get_payload(decode=True).decode('utf-8')
                            break
                else:
                    message_body = msg.get_payload(decode=True).decode('utf-8')

                if stop_words is None:
                    stop_words = frozenset(stopwords.words('english'))

                words = message_body.split()
                lowercase_count = sum(1 for word in words if word.islower())
                features = [
                    len(words),
                    sum(1 for word in words if word in stop_words),
                    len(set(words)),
                    # Guard against bodies without words (previously a
                    # ZeroDivisionError that aborted the whole run)
                    lowercase_count / len(words) if words else 0.0,
                    message_body.count('!'),
                ]

                writer.writerow([index, message_body, *features, target])
                index += 1

if __name__ == '__main__':
    # Paths are assembled with os.path.join instead of backslash literals:
    # "\e" is an invalid string escape (Python warns about it) and a
    # hard-coded backslash only works as a separator on Windows.
    root_folder = os.path.join('data', 'enron_eml_spam')
    output_csv = os.path.join('data', 'enron_proccessed_spam.csv')

    extract_email_info(root_folder, output_csv)