# PREPROCESSING OF TREC 06 DATA
by: John Markton Olarte
This is done in a separate notebook to avoid having to re-run the preprocessing in the main notebook (hw6-olarte.ipynb).

The emails will be preprocessed in this notebook, and as an output we will have the following csv files:
a. train_ham.csv - contains the preprocessed ham emails from the training set (folders 0 to 70)
b. train_spam.csv - contains the preprocessed spam emails from the training set (folders 0 to 70)
c. test_set.csv - contains the preprocessed emails from the test set (both ham and spam, folders: 71 to 126)
d. preprocessed_emails.csv - contains the preprocessed emails from the training and test sets (both ham and spam, folders: 0 to 126)
These CSV files will then be used in the main notebook (hw6-olarte.ipynb) to train the model and test it.

In [None]:
# Import Necessary Libraries
import os # This will be used to access the files in the trec06 directory
import re # This will be used to remove the html tags from the emails
# NOTE: Removal of html tags in the email data was not mentioned in the problem set guide, but I decided
#   to add this, as upon manual inspection of the data, I noticed that there were html tags in the data,
#   which can affect the accuracy of the model if not removed.
import email # This will be used to parse the emails
import pandas as pd # This will be used to create the dataframe and exporting of files to csv

## Initialize Main Dataframe and Labels Dataframe

In [None]:
# MAIN DATAFRAME
# Accumulator for the preprocessed corpus: one row per email.
#
# Schema:
#   folder        -> folder where the email is located
#   file          -> file name of the email
#   email_message -> email message
#   category      -> 0 for ham, 1 for spam
df_main = pd.DataFrame(
    data=None,
    columns=[
        "folder",
        "file",
        "email_message",
        "category",
    ],
)
df_main  # Nothing has been loaded yet, so this displays an empty frame

In [None]:
# LABELS DATAFRAME
# NOTE: Temporary lookup table holding the label of every email, as read from the labels file.
path_to_labels = "trec06/labels"

# Every line of the labels file has the form "category file_path", where
#    file_path looks like ../data/{folder}/{file}. Once the "../data/" prefix
#    is stripped, "{folder}/{file}" can be used as the key for labelling emails.
df_labels = pd.read_csv(
    path_to_labels,
    sep=" ",
    header=None,
    names=["category", "file_path"],
)

# Encode the category numerically: "ham" -> 0, anything else -> 1
df_labels["category"] = [
    0 if label == "ham" else 1 for label in df_labels["category"]
]

# Keep only the "{folder}/{file}" part of the path
df_labels["file_path"] = [
    path.replace("../data/", "") for path in df_labels["file_path"]
]

# Display the labels dataframe
df_labels

## FUNCTIONS FOR REMOVING USELESS INFORMATION, AND GETTING MESSAGE BODY

In [None]:
# Initialize the path to the data directory and list the folders in the data directory
folder_path = "trec06/data"

# os.listdir() returns entries in arbitrary order and also lists plain files
# (e.g. OS metadata files). Keep only real directories and sort them so that
# every run walks the corpus in the same order and produces identical CSVs.
folders = sorted(
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
)
folders # This is to show the folders in the data directory
# NOTE: We can see that there is no folder:127 which was indicated end of range for test data in the problem set guide
#    However upon checking on the zipped file README, we can see that the test data is from 71-126 only, so we will use this range.

In [None]:
# Characters and patterns that are stripped from every email

# Punctuation characters to delete. The backslash is escaped explicitly
# ("[\\]") so the literal contains no invalid escape sequence.
punctuations = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
# Digits to delete
numbers = "0123456789"
# Non-greedy match of anything enclosed in <>, used to remove html tags.
# "." does not match a newline, so a tag must open and close on the same line.
html_tags = re.compile(r"<.*?>")

# Stop words are read from stop_words.txt, one word per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default encoding.
with open("stop_words.txt", "r", encoding="utf-8") as f:
    stop_words = f.read().splitlines()

In [None]:
# Function that strips the noise (html tags, punctuation, digits, stop words) from an email
def remove_useless_info(message):
    """Return *message* cleaned of markup, symbols, digits and stop words.

    The steps are applied in this order:
      1. lower-case the text (the stop words are lower case),
      2. delete everything matched by ``html_tags``,
      3. delete every character listed in ``punctuations`` and ``numbers``,
      4. split on whitespace and discard tokens found in ``stop_words``,
      5. join the surviving tokens with single spaces.

    Relies on the module-level names ``html_tags``, ``punctuations``,
    ``numbers`` and ``stop_words``.
    """
    text = re.sub(html_tags, '', message.lower())

    # Characters are deleted, not replaced by a space, so neighbouring
    # fragments are glued together ("a-b" becomes "ab").
    unwanted_chars = punctuations + numbers
    text = text.translate(str.maketrans('', '', unwanted_chars))

    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

In [None]:
# Helper: return the text of a single (non-multipart) message part
def _decoded_payload(part):
    """Return the payload of *part* as text, undoing its transfer encoding.

    Bodies sent as base64 or quoted-printable are decoded to bytes and then
    to text using the charset declared on the part (ISO-8859-1 when none is
    declared or the declared one is unknown). Undecodable bytes are replaced
    rather than raising. Any other transfer encoding is returned unchanged.
    """
    payload = part.get_payload()
    # The header value may be a Header object, so stringify before comparing
    transfer_encoding = str(part.get("content-transfer-encoding", "")).strip().lower()
    if transfer_encoding not in ("base64", "quoted-printable"):
        return payload

    raw = part.get_payload(decode=True)
    if raw is None:
        return payload

    charset = part.get_content_charset() or "ISO-8859-1"
    try:
        return raw.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # Unknown or malformed charset name: ISO-8859-1 can decode any byte
        return raw.decode("ISO-8859-1", errors="replace")


# Function to collect message from parsed email
def get_message(parsed_email):
    """Return the body text of *parsed_email*.

    For a multipart email the first ``text/plain`` part found by ``walk()``
    is used; if there is none, an empty string is returned. For a
    non-multipart email the payload itself is used, whatever its content
    type. In both cases base64 / quoted-printable bodies are decoded so the
    caller receives the words of the message, not their encoded form.

    Parameters:
        parsed_email: an ``email.message.Message`` (e.g. the result of
            ``email.message_from_string``).

    Returns:
        str: the message body, or ``""`` when no ``text/plain`` part exists.
    """
    if not parsed_email.is_multipart():
        return _decoded_payload(parsed_email)

    for part in parsed_email.walk():
        if part.get_content_type() == "text/plain":
            return _decoded_payload(part)
    return ""

In [None]:
# Before we loop through all 30,000 emails, let us first try our functions on a single email to see if they work properly
test_folder = "012"
test_file = "103"
test_path = f"trec06/data/{test_folder}/{test_file}"

# Read the declared charset from the raw bytes: parsing in binary mode cannot
# fail with a decoding error, whatever bytes the email contains.
with open(test_path, "rb") as f:
    charset = email.message_from_binary_file(f).get_content_charset()
charset = charset if charset else "windows-1251" # If no charset is declared, fall back to windows-1251

# Re-open the email as text, decoded with the charset found above
with open(test_path, 'r', encoding=charset) as e_mail:
    read_email = e_mail.read()
    parsed_email = email.message_from_string(read_email)

    # Original email message
    message = get_message(parsed_email)
    print(f"original message: {message}")

    # Processed email message
    message = remove_useless_info(message)
    print(f"processed message: {message}")
# The with-statement has already closed the file; no explicit close() is needed

Now that we know the functions work, we can apply them to the whole dataset.

## PREPROCESSING OF THE WHOLE TREC06 DATASET

### There are different ways I wanted to preprocess the data:
<ol>
<li> [a] I'll just preprocess each email in a standard charset (ISO-8859-1), and then remove any useless information. Also all data will be preserved regardless of how they are encoded. </li>
<li> [b] I'll process each email with their corresponding charset, and then remove any useless information. </li>
<li> [c] A variation of (a), wherein blanked messages will be removed. </li>
<li> [d] A variation of (b), wherein blanked messages will be removed. </li>
<li> [e] variation of (c), wherein all non-ascii chars are also removed. </li>
</ol>
There are a lot more variations, but for the sake of time we'll just do these 5.

#### Preprocessing type (a)

In [None]:
# preprocessing type [a]
# Every email is decoded as ISO-8859-1, which maps every possible byte to a
# character, so no file can fail to decode and no email is dropped.

# Build the "{folder}/{file}" -> category lookup once instead of scanning
# df_labels for every email. setdefault keeps the first label seen for a path.
label_by_path = {}
for path, label in zip(df_labels["file_path"], df_labels["category"]):
    label_by_path.setdefault(path, label)

# Collect the rows in a plain list and build the dataframe once at the end;
# growing a dataframe with pd.concat on every email copies it each time.
rows = []
for folder in folders:
    # Sorted so the row order is the same on every run
    for file in sorted(os.listdir(f"{folder_path}/{folder}")):
        with open(f"{folder_path}/{folder}/{file}", "r", encoding="ISO-8859-1") as e_mail:
            # Read and parse the email
            parsed_email = email.message_from_string(e_mail.read())
        # Extract the body and remove useless information
        message = remove_useless_info(get_message(parsed_email))
        # A missing label raises KeyError naming the offending "{folder}/{file}"
        category_label = label_by_path[f"{folder}/{file}"]
        rows.append([folder, file, message, category_label])

# Rebuilt from scratch, so re-running this cell never duplicates rows
df_main = pd.DataFrame(rows, columns=["folder", "file", "email_message", "category"])
# Show the main dataframe
df_main

In [None]:
# Make sure the preprocessed_files folder exists.
# exist_ok=True creates it when missing and does nothing when it is already there.
os.makedirs("preprocessed_files", exist_ok=True)

# Save df_main as preprocessed_emails_a.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_a.csv", index=False)

# Reset the main dataframe, to not overload the memory.
# Rebinding the name to a new empty frame is enough to release the old data.
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

#### Preprocessing type (b)

In [None]:
# preprocessing type [b]
# Every email is decoded with the charset declared in its own headers.

def _parse_with_declared_charset(path, default_charset="windows-1251"):
    """Parse the email stored at *path* using the charset it declares.

    The file is read once as bytes. The declared charset is taken from the
    parsed headers (``default_charset`` when none is declared), the bytes are
    decoded with it and the resulting text is parsed. If the charset is
    unknown or the bytes cannot be decoded with it, the raw bytes are parsed
    directly instead.

    Parameters:
        path: location of the email file.
        default_charset: charset used when the email declares none.

    Returns:
        email.message.Message: the parsed email.
    """
    with open(path, "rb") as f:
        raw = f.read()
    # Probing the bytes (not decoded text) means the probe itself cannot fail
    # on a decoding error, so the declared charset is found on every platform.
    charset = email.message_from_bytes(raw).get_content_charset() or default_charset
    try:
        return email.message_from_string(raw.decode(charset))
    except (LookupError, ValueError):
        # LookupError: unknown charset name.
        # ValueError: covers UnicodeDecodeError (bytes invalid for the charset).
        return email.message_from_bytes(raw)


# Build the "{folder}/{file}" -> category lookup once instead of scanning
# df_labels for every email. setdefault keeps the first label seen for a path.
label_by_path = {}
for path, label in zip(df_labels["file_path"], df_labels["category"]):
    label_by_path.setdefault(path, label)

# Collect the rows in a plain list and build the dataframe once at the end;
# growing a dataframe with pd.concat on every email copies it each time.
rows = []
for folder in folders:
    # Sorted so the row order is the same on every run
    for file in sorted(os.listdir(f"{folder_path}/{folder}")):
        parsed_email = _parse_with_declared_charset(f"{folder_path}/{folder}/{file}")
        # Extract the body and remove useless information
        message = remove_useless_info(get_message(parsed_email))
        # Looked up outside any try-block: a missing label is reported as a
        # KeyError naming the file instead of being mistaken for a decoding problem
        category_label = label_by_path[f"{folder}/{file}"]
        rows.append([folder, file, message, category_label])

# Rebuilt from scratch, so re-running this cell never duplicates rows
df_main = pd.DataFrame(rows, columns=["folder", "file", "email_message", "category"])

# Show the main dataframe
df_main

In [None]:
# Make sure the preprocessed_files folder exists.
# exist_ok=True creates it when missing and does nothing when it is already there.
os.makedirs("preprocessed_files", exist_ok=True)

# Save df_main as preprocessed_emails_b.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_b.csv", index=False)

# Reset the main dataframe, to not overload the memory.
# DataFrame.drop() called without labels/index/columns raises ValueError, so
# the reset is done by rebinding the name to a new empty frame.
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

#### Preprocessing type (c)

In [None]:
# preprocessing type [c]

# Since this is a variation of preprocessing type [a], we can use the exported csv file from preprocessing type [a]
# By default read_csv turns empty fields (and words such as "null" or "nan")
# into NaN, which would make the comparison with "" below match nothing.
# keep_default_na=False keeps every message exactly as it was written.
df_main = pd.read_csv(
    "preprocessed_files/preprocessed_emails_a.csv",
    keep_default_na=False,
    dtype={"email_message": str},
)

# Remove all messages that are empty
df_main = df_main[df_main["email_message"] != ""]

# Show the main dataframe
df_main

In [None]:
# Make sure the preprocessed_files folder exists.
# exist_ok=True creates it when missing and does nothing when it is already there.
os.makedirs("preprocessed_files", exist_ok=True)

# Save df_main as preprocessed_emails_c.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_c.csv", index=False)

# Reset the main dataframe, to not overload the memory.
# DataFrame.drop() called without labels/index/columns raises ValueError, so
# the reset is done by rebinding the name to a new empty frame.
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

#### Preprocessing type (d)

In [None]:
# preprocessing type [d]

# Since this is a variation of preprocessing type [b], we can use the exported csv file from preprocessing type [b]
# By default read_csv turns empty fields (and words such as "null" or "nan")
# into NaN, which would make the comparison with "" below match nothing.
# keep_default_na=False keeps every message exactly as it was written.
df_main = pd.read_csv(
    "preprocessed_files/preprocessed_emails_b.csv",
    keep_default_na=False,
    dtype={"email_message": str},
)

# Remove all messages that are empty
df_main = df_main[df_main["email_message"] != ""]

# Show the main dataframe
df_main

In [None]:
# Make sure the preprocessed_files folder exists.
# exist_ok=True creates it when missing and does nothing when it is already there.
os.makedirs("preprocessed_files", exist_ok=True)

# Save df_main as preprocessed_emails_d.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_d.csv", index=False)

# Reset the main dataframe, to not overload the memory.
# DataFrame.drop() called without labels/index/columns raises ValueError, so
# the reset is done by rebinding the name to a new empty frame.
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

#### Preprocessing type (e)

In [None]:
# preprocessing type [e]

# Since this is a variation of preprocessing type [c], we start from the csv file exported by preprocessing type [c]
# keep_default_na=False stops read_csv from turning empty fields (and words
# such as "null" or "nan") into NaN, so every message is a string and
# .encode() below cannot fail on a float.
df_main = pd.read_csv(
    "preprocessed_files/preprocessed_emails_c.csv",
    keep_default_na=False,
    dtype={"email_message": str},
)

# Process messages and remove characters that are not ascii
df_main["email_message"] = df_main["email_message"].apply(lambda x: x.encode("ascii", "ignore").decode())

# Type [c] contains no blank messages; keep that guarantee by dropping the
# messages that are empty (or only whitespace) once the non-ascii characters are gone
df_main = df_main[df_main["email_message"].str.strip() != ""]

# Show the main dataframe
df_main

In [None]:
# Make sure the preprocessed_files folder exists.
# exist_ok=True creates it when missing and does nothing when it is already there.
os.makedirs("preprocessed_files", exist_ok=True)

# Save df_main as preprocessed_emails_e.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_e.csv", index=False)

# Reset the main dataframe, to not overload the memory.
# DataFrame.drop() called without labels/index/columns raises ValueError, so
# the reset is done by rebinding the name to a new empty frame.
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again