In [1]:
import base64
import email
import email.errors
import email.header
import os
import re
import warnings

import pandas as pd
from bs4 import BeautifulSoup
from bs4 import MarkupResemblesLocatorWarning

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [2]:
# Locations of the raw corpora and of the label index inside each corpus
data_path = "Data/Raw_data"
chinese_path = "trec06c"
english_path = "trec06p"
index_path = "full/index"


def _load_label_index(corpus_dir):
    """Read the space-separated label index of one corpus.

    The index file has no header row; its two columns are named
    "class" and "subpath" after loading.
    """
    label_frame = pd.read_csv(
        os.path.join(data_path, corpus_dir, index_path),
        sep=" ",
        header=None,
    )
    label_frame.columns = ["class", "subpath"]
    return label_frame


labels_ch = _load_label_index(chinese_path)
labels_en = _load_label_index(english_path)

In [3]:
def html_remover(txt):
    """
    Strip HTML markup from a piece of text.

    Input: txt - markup (or plain text) handed to BeautifulSoup's
           'html.parser' backend
    Output: String with the text content of the parsed document
    """
    return BeautifulSoup(txt, 'html.parser').get_text()

# MIME leaf types whose payload is collected as message text.
_TEXT_CONTENT_TYPES = ("text/plain", "text/html")


def _is_base64_part(part):
    """Return True when `part` declares a base64 Content-Transfer-Encoding."""
    transfer_encoding = part["Content-Transfer-Encoding"]
    if transfer_encoding is None:
        return False
    # The header value is case-insensitive and may carry stray whitespace.
    return str(transfer_encoding).strip().lower() == "base64"


def _read_text_part(part, encode_type):
    """Return ``(text, was_base64)`` for a text/plain or text/html part.

    A base64 payload is decoded with the charset declared on the part itself,
    falling back to `encode_type`. If decoding fails the raw payload is kept
    and ``was_base64`` stays False. text/html payloads are passed through
    ``html_remover``.
    """
    text = part.get_payload()
    was_base64 = False
    if _is_base64_part(part):
        try:
            raw_bytes = base64.b64decode(text)
        except (ValueError, TypeError):
            # binascii.Error is a ValueError; TypeError covers non-string payloads.
            raw_bytes = None
        if raw_bytes is not None:
            for charset in (part.get_content_charset(), encode_type):
                if not charset:
                    continue
                try:
                    text = raw_bytes.decode(charset)
                except (ValueError, LookupError):
                    # Wrong or unknown charset: try the next candidate.
                    continue
                was_base64 = True
                break
    if part.get_content_type() == "text/html":
        text = html_remover(text)
    return text, was_base64


def extract_message(row, lang):
    """Parse the email file referenced by `row` and add its fields to `row`.

    Parameters
    ----------
    row : mapping with item access (e.g. a pandas Series from ``DataFrame.apply``)
        Must contain "subpath"; its first three characters are dropped and
        the remainder is joined onto the corpus directory.
    lang : str
        "en" (english corpus, read as iso-8859-1) or "ch" (chinese corpus,
        read as gb2312).

    Returns
    -------
    The same `row`. On success it gains "subject", "email_to", "email_from",
    "contains_img", "base64" and "message". If the file cannot be read, or a
    header is missing or undecodable, `row` is returned early without the
    remaining fields.

    Raises
    ------
    ValueError
        If `lang` is neither "en" nor "ch".

    Notes
    -----
    Only top-level parts and the direct children of ``multipart/alternative``
    parts are inspected. "contains_img" is set only for a top-level
    ``image/png`` part.
    """
    if lang == "en":
        corpus_dir, encode_type = english_path, 'iso-8859-1'
    elif lang == "ch":
        corpus_dir, encode_type = chinese_path, 'gb2312'
    else:
        raise ValueError(f"Unsupported lang {lang!r}; expected 'en' or 'ch'")

    subpath = row["subpath"][3:]
    full_path = os.path.join(data_path, corpus_dir, subpath)

    try:
        with open(full_path, "r", encoding=encode_type) as f:
            email_content = f.read()
    except (OSError, ValueError, LookupError):
        # Missing/unreadable file or undecodable bytes: leave the row untouched.
        return row
    parsed_email_content = email.message_from_string(email_content)

    # If the message declares another charset, re-read the file with it.
    declared_charset = parsed_email_content.get_content_charset()
    if declared_charset:
        declared_charset = re.sub(r"charset|\"|=|'", "", declared_charset)
        if declared_charset and declared_charset != encode_type:
            try:
                with open(full_path, "r", encoding=declared_charset) as f:
                    reparsed = email.message_from_string(f.read())
            except (OSError, ValueError, LookupError):
                # Unknown or wrong charset: keep the first parse and encoding.
                pass
            else:
                parsed_email_content = reparsed
                encode_type = declared_charset

    try:
        for column, header_name in (
            ("subject", "subject"),
            ("email_to", "to"),
            ("email_from", "from"),
        ):
            decoded = email.header.decode_header(parsed_email_content[header_name])
            row[column] = str(email.header.make_header(decoded))
    except (TypeError, ValueError, LookupError, email.errors.MessageError):
        # A missing header yields None (TypeError); bad encodings raise the rest.
        return row

    row["contains_img"] = False
    row["base64"] = False

    if parsed_email_content.is_multipart():
        message = []
        for part in parsed_email_content.get_payload():
            payload_type = part.get_content_type()
            if payload_type in _TEXT_CONTENT_TYPES:
                text, was_base64 = _read_text_part(part, encode_type)
                message.append(text)
                row["base64"] = row["base64"] or was_base64
            elif payload_type == "multipart/alternative" and part.is_multipart():
                # Each alternative carries its own headers, so decoding and
                # HTML stripping are decided per sub-part.
                for sub_part in part.get_payload():
                    if sub_part.get_content_type() in _TEXT_CONTENT_TYPES:
                        text, was_base64 = _read_text_part(sub_part, encode_type)
                        message.append(text)
                        row["base64"] = row["base64"] or was_base64
            elif payload_type == "image/png":
                row["contains_img"] = True
        row["message"] = " ".join(message)
    else:
        text, was_base64 = _read_text_part(parsed_email_content, encode_type)
        row["base64"] = was_base64
        row["message"] = text
    return row

def extract_en(row):
    """Run extract_message on a row of the english corpus."""
    return extract_message(row, lang="en")


def extract_ch(row):
    """Run extract_message on a row of the chinese corpus."""
    return extract_message(row, lang="ch")

In [4]:
# Parse every labelled email, drop rows with any missing value, index the
# result by corpus-qualified subpath and persist the raw frames.
ch_emails = (
    labels_ch.apply(extract_ch, axis=1)
    .dropna()
    .assign(full_subpath=lambda parsed: parsed.subpath.apply(lambda x: f"trec06c{x[2:]}"))
    .set_index("full_subpath")
)
en_emails = (
    labels_en.apply(extract_en, axis=1)
    .dropna()
    .assign(full_subpath=lambda parsed: parsed.subpath.apply(lambda x: f"trec06p{x[2:]}"))
    .set_index("full_subpath")
)
ch_emails.to_pickle("Data/ch_emails_raw.pkl")
en_emails.to_pickle("Data/en_emails_raw.pkl")

In [5]:
# Keep only (full_subpath, message) for translation
# Fixed seed so the shuffle, and therefore the partitions cut from it in the
# next cell, are identical on every Restart & Run All.
SHUFFLE_SEED = 42

ch_emails_messages = ch_emails[["message"]].copy()
en_emails_messages = en_emails[["message"]].copy()
all_messages = (
    pd.concat([ch_emails_messages, en_emails_messages])
    # Keep the first occurrence of each distinct message body.
    .drop_duplicates(subset="message")
    # frac=1 returns every row in shuffled order.
    .sample(frac=1, random_state=SHUFFLE_SEED)
)

In [6]:
# Cut the shuffled messages into four contiguous slices and pickle each one
nrows = len(all_messages)
# Slice boundaries at 0, 1/4, 2/4, 3/4 and 4/4 of the row count (floor division).
cut_points = [quarter * nrows // 4 for quarter in range(5)]
first, second, third, fourth = (
    all_messages[start:stop] for start, stop in zip(cut_points, cut_points[1:])
)
for pickle_path, chunk in (
    ("Data/first.pkl", first),
    ("Data/second.pkl", second),
    ("Data/third.pkl", third),
    ("Data/fourth.pkl", fourth),
):
    chunk.to_pickle(pickle_path)

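# After processing: the cells below load the processed partitions (first.json ... fourth.json, keyed by doc_id) and merge them back with the raw emails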

In [53]:
# Load the four processed partitions (JSON Lines), each indexed by doc_id
first, second, third, fourth = (
    pd.read_json(json_path, lines=True).set_index("doc_id")
    for json_path in ("first.json", "second.json", "third.json", "fourth.json")
)
full_processed = pd.concat([first, second, third, fourth])

# Reload the raw parsed emails written earlier in this notebook
ch_emails = pd.read_pickle("Data/ch_emails_raw.pkl")
en_emails = pd.read_pickle("Data/en_emails_raw.pkl")
full = pd.concat([ch_emails, en_emails])

In [60]:
# Left-join the raw email fields onto the processed rows by index, then persist
full_df = pd.merge(
    full_processed,
    full,
    how="left",
    left_index=True,
    right_index=True,
)
full_df.to_pickle("Data/full_df.pkl")