# PREPROCESSING OF TREC 06 DATA
by: John Markton Olarte
This is done in a seperate notebook to avoid having to re-run the preprocesing in the main notebook (hw6-olarte.ipynb).

In [1]:
# Import Necessary Libraries
import os # This will be used to access the files in the trec06 directory
import re # This will be used to remove the html tags from the emails
# NOTE: Removal of html tags in the email data was not mentioned in the problem set guide, but I decided
#   to add this, as upon manual inspection of the data, I noticed that there were html tags in the data,
#   which can affect the accuracy of the model if not removed.
import email # This will be used to parse the emails
import pandas as pd # This will be used to create the dataframe and exporting of files to csv

## Initialize Main Dataframe and Labels Dataframe

In [2]:
# MAIN DATAFRAME

df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
# COLUMNS: folder, file, email_message, category
#   folder: folder where the email is located
#   file: file name of the email
#   email_message: email message
#   category: 0-ham, 1-spam
df_main # This should be empty at this point

Unnamed: 0,folder,file,email_message,category


In [3]:
# LABELS DATAFRAME
# NOTE: This is a temporary dataframe that will be used to store the labels of the emails based on the labels file
path_to_labels = "trec06/labels"

# Upon inspection of the labels file, I noticed that the labels are in the format of "category file_path"
#    we will use this in our advantage as we can use the file_path:../data/{folder}/{file} to label the emails later in the main dataframe
df_labels = pd.read_csv("trec06/labels", sep=" ", header=None)
# Assign the column names
df_labels.columns = ["category", "file_path"]
# Change category from ham/spam to 0/1
df_labels["category"] = df_labels["category"].apply(lambda x: 0 if x == "ham" else 1)
# Remove "../data/" from file_path
df_labels["file_path"] = df_labels["file_path"].apply(lambda x: x.replace("../data/", ""))

# Show the labels dataframe
df_labels

Unnamed: 0,category,file_path
0,0,000/000
1,1,000/001
2,1,000/002
3,0,000/003
4,1,000/004
...,...,...
37817,1,126/017
37818,1,126/018
37819,1,126/019
37820,1,126/020


## FUNCTIONS FOR REMOVING USELESS INFORMATION, AND GETTING MESSAGE BODY

In [4]:
# Initialize the path to the data directory and list the folders in the data directory
folder_path = "trec06/data"
folders = os.listdir(folder_path)
folders # This is to show the folders in the data directory
# NOTE: We can see that there is no folder:127 which was indicated end of range for test data in the problem set guide
#    However upon checking on the zipped file README, we can see that the test data is from 71-126 only, so we will use this range.

['000',
 '001',
 '002',
 '003',
 '004',
 '005',
 '006',
 '007',
 '008',
 '009',
 '010',
 '011',
 '012',
 '013',
 '014',
 '015',
 '016',
 '017',
 '018',
 '019',
 '020',
 '021',
 '022',
 '023',
 '024',
 '025',
 '026',
 '027',
 '028',
 '029',
 '030',
 '031',
 '032',
 '033',
 '034',
 '035',
 '036',
 '037',
 '038',
 '039',
 '040',
 '041',
 '042',
 '043',
 '044',
 '045',
 '046',
 '047',
 '048',
 '049',
 '050',
 '051',
 '052',
 '053',
 '054',
 '055',
 '056',
 '057',
 '058',
 '059',
 '060',
 '061',
 '062',
 '063',
 '064',
 '065',
 '066',
 '067',
 '068',
 '069',
 '070',
 '071',
 '072',
 '073',
 '074',
 '075',
 '076',
 '077',
 '078',
 '079',
 '080',
 '081',
 '082',
 '083',
 '084',
 '085',
 '086',
 '087',
 '088',
 '089',
 '090',
 '091',
 '092',
 '093',
 '094',
 '095',
 '096',
 '097',
 '098',
 '099',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',


In [5]:
# List of useless information to be removed from the email
stop_words = []
punctuations = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~\\"
numbers = "0123456789"
html_tags = re.compile('<.*?>') # This will be used to remove html tags from the email, remember html tags are enclosed in <>
esc_chars = re.compile(r'\\[a-z][a-z]?[0-9]+') # This will be used to remove escape characters from the email, remember escape characters are enclosed in \ and are followed by a letter and a number
# Use stop_words file to get the stop words

with open("stop_words.txt", "r") as f:
    stop_word = f.read().splitlines()
    stop_words = [word for word in stop_word]

In [6]:
# Function to remove useless pieces of information from the email
def remove_useless_info(message):
    # Convert to lower case, since stop words are in lower case
    message = message.lower()
    # remove html tags
    message = re.sub(html_tags, '', message)
    # Remove symbols
    message = message.translate(str.maketrans('', '', punctuations))
    # Remove numbers
    message = message.translate(str.maketrans('', '', numbers))
    # Remove escape characters
    message = re.sub(esc_chars, '', message)
    # split message into words
    words = message.split()
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Rejoin words into a message
    message = " ".join(words)
    return message

In [7]:
# Function to collect messagge from parsed email
def get_message(parsed_email):
    message = ""
    # Check if the email is multipart
    if parsed_email.is_multipart():
        # Loop through the parts of the email
        for part in parsed_email.walk():
            # Check if the part is text/plain
            if part.get_content_type() == "text/plain":
                # Get the message
                message = part.get_payload()
                break
    # If the email is not multipart, just get the message
    else:
        message = parsed_email.get_payload()
    return message

In [8]:
# Before we loop through all 30,000 emails. Let us first try our functions on a single email to see it it works properly
test_folder = "012"
test_file = "103"
test_path = f"trec06/data/{test_folder}/{test_file}"
charset = ""
with open(test_path, "r") as f:
    # Get the charset of the email
    charset = email.message_from_file(f).get_content_charset()
charset = charset if charset else "windows-1251" # If charset is None, set it to utf-8

# We will use encoding="ISO-8859-1" since emails are usually encoded in this format
with open(test_path, 'r', encoding=charset) as e_mail:
    read_email = e_mail.read()
    parsed_email = email.message_from_string(read_email)

    # Original email message
    message = get_message(parsed_email)
    print(f"original message: {message}")

    # Processed email message
    message = remove_useless_info(message)
    print(f"processed message: {message}")
e_mail.close()

original message: ☆無料体験貴方だけに☆
 
あなたにお会いしたいという女性から連絡が来ております！
1000人に一人の完全VIP待遇でのご招待となりますので貴方だけの御優待となります！
さあ、彼女をさそって、人生の勝ち組みになりましょう。
まとまったお金が欲しい貴方、人生のパートナーが欲しい貴方、借金から開放されたい貴方に、ぴったりの女性を紹介します。
http://www.veyv.com/?num=310
【女性紹介】
■美咲
■29歳
評：スタイル抜群のかわいいキュートな女性。はみ出した太もも、お尻と腰にぴったりのホットパンツファッションは芸術品です。連絡してくれたら彼女の顔と伸縮のあるホットパンツにお尻のラインがクッキリの写メをまず送ってくれるとの事です。年収1100万。尽くしてくれます。
『近くの分譲マンションに住んでいます。仕事だけの寂しい毎日ですIT関係とモデルの仕事なので毎日が遅いんです。何時、貴方から連絡が来るのか心待ちにしています。付き合ってくださるのなら、私貴方に尽くします。私、尽くす女なんですよ。昔、私が働き初めでお金がない頃、付き合っていた彼氏の借金返済のため、彼氏に頼まれて、イモを食べて一番多くおならしたら200万円の賞金がもらえる大会に出場して、優勝してしまいました。恥ずかしい話なんですけど、その時は必死でしたね。もちろんサングラスをかけながら出場しましたけどね。私と会う前に連絡くださいね。遊びに来てくれたら、美咲のスパゲッティカルボナーラご馳走しますよ。美咲のフェアレディＺでドライブもしませんか？』
 
■三浦恵理子
■30代前半
評：Ｆカップ。キュートさと大人の雰囲気の女性。ロングの髪と腰のくびれとヒップラインは色気抜群！年収1050万円
スタイルがいいねって言われます。特にヒップラインと網タイツ姿に自信があります…スポーツジムに行くと男性の視線がくぎ付けとなります。以前、ジムでバーベルを持ち上げていたら、ふんばりすぎて放尿してしまって、近くで見ていた中年のおじ様がそれが欲しいとしつこく迫るので、仕方ないから差し上げちゃいました。ぜひ、貴方と肌と肌を重ね合わせたいので、お返事くださいね。


■千亜紀
■30代
評：ご主人が貿易会社で海外単身赴任。体をもてあましています。すらりとした脚にはさまれたら最高です。資産

Now we know that it is working, we can now apply it to the whole dataset.

## PREPROCESSING OF THE WHOLE TREC06 DATASET

### There are different ways I wanted to preprocess the data:
<ol>
<li> [a] I'll just preprocess each email in a standard charset (ISO-8859-1), and then remove any useless information. Also all data will be preserved regardless of how they are encoded. </li>
<li> [b] I'll process each email with their corresponding charset, and then remove any useless information. </li>
<li> [c] A variation of (a), wherein blanked messages will be removed. </li>
<li> [d] A variation of (b), wherein blanked messages will be removed. </li>
<li> [e] variation of (c), wherein all non-ascii chars are also removed. </li>
</ol>
There are a lot more variations, but for the sake of time we'll just do these 5.

#### Preprocessing type (a)

In [9]:
# preprocessing type [a]

# Read all emails and put them in a dataframe
for folder in folders:
    # Get the files in the folder
    files = os.listdir(f"{folder_path}/{folder}")
    for file in files:
        with open(f"{folder_path}/{folder}/{file}", "r", encoding="ISO-8859-1") as e_mail:
            # Read email file
            read_email = e_mail.read()
            # Parse email
            parsed_email = email.message_from_string(read_email)
            # Get message
            message = get_message(parsed_email)
            # Remove useless information
            message = remove_useless_info(message)
            # Get the category of the email based on the labels df
            category_label = df_labels[df_labels["file_path"] == f"{folder}/{file}"]["category"].values[0]
            # Concatenate the data to the df_main
            df_main = pd.concat([df_main, pd.DataFrame([[folder, file, message, category_label]], columns=["folder", "file", "email_message", "category"])], ignore_index=True)
# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,000,000,mailing list queried weeks ago running set arc...,0
1,000,001,luxury watches buy rolex rolex cartier bvlgari...,1
2,000,002,academic qualifications prestigious nonacc red...,1
3,000,003,greetings verify subscription planfans list ch...,0
4,000,004,chauncey conferred luscious continued tonsillitis,1
...,...,...,...,...
37817,126,017,great news expec ted infinex ventures infx pri...,1
37818,126,018,oil sector going crazy weekly gift kkpt thing ...,1
37819,126,019,httpvdtobjdocscaninfo suffering pain depressio...,1
37820,126,020,prosperous future increased money earning powe...,1


In [10]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_a.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_a.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category


#### Preprocessing type (b)

In [11]:
# preprocessing type [b]

# Read all emails and put them in a dataframe
for folder in folders:
    # Get all files in folder
    files = os.listdir(f"{folder_path}/{folder}")
    for file in files:
        # Determine the charset of the email
        charset = ""
        try:
            with open(f"{folder_path}/{folder}/{file}", "r") as f:
                # Get the charset of the email
                charset = email.message_from_file(f).get_content_charset()
        except:
            charset = None
        charset = charset if charset else "windows-1251" # If charset is None, set it to utf-8

        try:
            with open(f"{folder_path}/{folder}/{file}", "r", encoding=charset) as e_mail:
                # Read email file
                read_email = e_mail.read()
                # Parse email
                parsed_email = email.message_from_string(read_email)
                # Get message
                message = get_message(parsed_email)
                # Remove useless information
                message = remove_useless_info(message)
                # Get the category of the email based on the labels df
                category_label = df_labels[df_labels["file_path"] == f"{folder}/{file}"]["category"].values[0]
                # Concatenate the data to the df_main
                df_main = pd.concat([df_main, pd.DataFrame([[folder, file, message, category_label]], columns=["folder", "file", "email_message", "category"])], ignore_index=True)
        except:
            # Open the file in binary mode
            with open(f"{folder_path}/{folder}/{file}", "rb") as e_mail:
                # Read email file
                read_email = e_mail.read()
                # Parse email
                parsed_email = email.message_from_bytes(read_email)
                # Get message
                message = get_message(parsed_email)
                # Remove useless information
                message = remove_useless_info(message)
                # Get the category of the email based on the labels df
                category_label = df_labels[df_labels["file_path"] == f"{folder}/{file}"]["category"].values[0]
                # Concatenate the data to the df_main
                df_main = pd.concat([df_main, pd.DataFrame([[folder, file, message, category_label]], columns=["folder", "file", "email_message", "category"])], ignore_index=True)

# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,000,000,mailing list queried weeks ago running set arc...,0
1,000,001,luxury watches buy rolex rolex cartier bvlgari...,1
2,000,002,academic qualifications prestigious nonacc red...,1
3,000,003,greetings verify subscription planfans list ch...,0
4,000,004,chauncey conferred luscious continued tonsillitis,1
...,...,...,...,...
37817,126,017,great news expec ted infinex ventures infx pri...,1
37818,126,018,oil sector going crazy weekly gift kkpt thing ...,1
37819,126,019,httpvdtobjdocscaninfo suffering pain depressio...,1
37820,126,020,prosperous future increased money earning powe...,1


In [12]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_b.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_b.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category


#### Preprocessing type (c)

In [13]:
# preprocessing type [c]

# Since this is a variation of preprocessing type [a], we can use the exported csv file from preprocessing type [a]
df_main = pd.read_csv("preprocessed_files/preprocessed_emails_a.csv")

# Remove all messages that are empty
df_main = df_main[df_main["email_message"] != ""]

# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,0,0,mailing list queried weeks ago running set arc...,0
1,0,1,luxury watches buy rolex rolex cartier bvlgari...,1
2,0,2,academic qualifications prestigious nonacc red...,1
3,0,3,greetings verify subscription planfans list ch...,0
4,0,4,chauncey conferred luscious continued tonsillitis,1
...,...,...,...,...
37817,126,17,great news expec ted infinex ventures infx pri...,1
37818,126,18,oil sector going crazy weekly gift kkpt thing ...,1
37819,126,19,httpvdtobjdocscaninfo suffering pain depressio...,1
37820,126,20,prosperous future increased money earning powe...,1


In [14]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_c.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_c.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category


#### Preprocessing type (d)

In [15]:
# preprocessing type [d]

# Since this is a variation of preprocessing type [b], we can use the exported csv file from preprocessing type [b]
df_main = pd.read_csv("preprocessed_files/preprocessed_emails_b.csv")

# Remove all messages that are empty
df_main = df_main[df_main["email_message"] != ""]

# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,0,0,mailing list queried weeks ago running set arc...,0
1,0,1,luxury watches buy rolex rolex cartier bvlgari...,1
2,0,2,academic qualifications prestigious nonacc red...,1
3,0,3,greetings verify subscription planfans list ch...,0
4,0,4,chauncey conferred luscious continued tonsillitis,1
...,...,...,...,...
37817,126,17,great news expec ted infinex ventures infx pri...,1
37818,126,18,oil sector going crazy weekly gift kkpt thing ...,1
37819,126,19,httpvdtobjdocscaninfo suffering pain depressio...,1
37820,126,20,prosperous future increased money earning powe...,1


In [16]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_d.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_d.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category


#### Preprocessing type (e)

In [17]:
# preprocessing type [e]

# Since this is a variation of preprocessing type [c], we can use the exported csv file from preprocessing type [c]
df_main = pd.read_csv("preprocessed_files/preprocessed_emails_b.csv")

# Process messages and remove characters that are not ascii
df_main["email_message"] = df_main["email_message"].apply(lambda x: str(x).encode("ascii", "ignore").decode())

# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,0,0,mailing list queried weeks ago running set arc...,0
1,0,1,luxury watches buy rolex rolex cartier bvlgari...,1
2,0,2,academic qualifications prestigious nonacc red...,1
3,0,3,greetings verify subscription planfans list ch...,0
4,0,4,chauncey conferred luscious continued tonsillitis,1
...,...,...,...,...
37817,126,17,great news expec ted infinex ventures infx pri...,1
37818,126,18,oil sector going crazy weekly gift kkpt thing ...,1
37819,126,19,httpvdtobjdocscaninfo suffering pain depressio...,1
37820,126,20,prosperous future increased money earning powe...,1


In [18]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_e.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_e.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category


## Preprocessing Type(f)
As part of the guide questions, we are to answer what will happen if we include the stop words.

In [19]:
# Redefine the remove_useless_info function

# Function to remove useless pieces of information from the email
# Stop Words will not be removed
def remove_useless_info(message):
    # Split the message into words and rejoin (to remove extra spaces and new lines)
    words = message.split()
    message = " ".join(words)
    # Convert to lower case, since stop words are in lower case
    message = message.lower()
    # remove html tags
    message = re.sub(html_tags, '', message)
    # Remove symbols
    message = message.translate(str.maketrans('', '', punctuations))
    # Remove numbers
    message = message.translate(str.maketrans('', '', numbers))
    # Remove escape characters
    message = re.sub(esc_chars, '', message)
    return message

In [20]:
# preprocessing type [f]

# Read all emails and put them in a dataframe
for folder in folders:
    # Get the files in the folder
    files = os.listdir(f"{folder_path}/{folder}")
    for file in files:
        with open(f"{folder_path}/{folder}/{file}", "r", encoding="ISO-8859-1") as e_mail:
            # Read email file
            read_email = e_mail.read()
            # Parse email
            parsed_email = email.message_from_string(read_email)
            # Get message
            message = get_message(parsed_email)
            # Remove useless information
            message = remove_useless_info(message)
            # Get the category of the email based on the labels df
            category_label = df_labels[df_labels["file_path"] == f"{folder}/{file}"]["category"].values[0]
            # Concatenate the data to the df_main
            df_main = pd.concat([df_main, pd.DataFrame([[folder, file, message, category_label]], columns=["folder", "file", "email_message", "category"])], ignore_index=True)
# Show the main dataframe
df_main

Unnamed: 0,folder,file,email_message,category
0,000,000,the mailing list i queried about a few weeks a...,0
1,000,001,luxury watches buy your own rolex for only r...,1
2,000,002,academic qualifications available from prestig...,1
3,000,003,greetings all this is to verify your subscript...,0
4,000,004,try chauncey may conferred the luscious not co...,1
...,...,...,...,...
37817,126,017,great news expec ted infinex ventures inc infx...,1
37818,126,018,the oil sector is going crazy this is our week...,1
37819,126,019,httpvdtobjdocscaninfo suffering from pain depr...,1
37820,126,020,u n i v e r s i t y d i p l o m a s do you wan...,1


In [21]:
# Check if preprocessed_files folder exists
# Create the folder if it does not exist
if not os.path.exists("preprocessed_files"):
    os.makedirs("preprocessed_files")

# Save df_main as preprocessed_emails_f.csv
df_main.to_csv("preprocessed_files/preprocessed_emails_f.csv", index=False)

# Reset the main dataframe, to not overload the memory
df_main.drop(df_main.index, inplace=True)
df_main = pd.DataFrame(columns=["folder", "file", "email_message", "category"])
df_main # This should be an empty dataframe again

Unnamed: 0,folder,file,email_message,category
