# Preprocessing Enron Emails

## Tasks

1. Import emails
2. Parse them to email format
3. Keep only the original email
4. Remove HTML noise, detect chains
5. Export emails to pickle file

In [3]:
import email
import pickle as pkl
import re

from collections import Counter
from dateutil.parser import parse
from pathlib import Path

## Extraction

### Import emails

In [4]:
# Raw maildir input and interim output directories, both resolved relative
# to the parent of the current working directory.
email_dir = Path.cwd().parent / 'data' / 'raw' / 'maildir'
data_dir = Path.cwd().parent / 'data' / 'interim'

# Collect every path under email_dir whose name matches '*.', leaving out
# anything whose parent directory path mentions one of the excluded folders.
clean_emails = [
    candidate
    for candidate in email_dir.rglob('*.')
    if not any(
        folder in str(candidate.parent)
        for folder in ('all_documents', 'discussion_threads')
    )
]


def parse_emails(path):
    """Read the file at *path* and parse it into an email message object.

    The file is opened in text mode and decoded as windows-1252.

    Args:
        path: location of a single raw email file.

    Returns:
        The message object produced by ``email.message_from_file``.
    """
    with open(path, mode='r', encoding='windows-1252') as source:
        return email.message_from_file(source)


def get_parsed_emails(paths):
    """Parse each file in *paths* and pair it with an index and a timestamp.

    Args:
        paths: iterable of email file paths.

    Returns:
        A list of ``(index, message, timestamp)`` tuples, where *index* is the
        position of the path in *paths* and *timestamp* is the message's
        ``Date`` header converted to an integer POSIX timestamp.
    """
    def _to_record(index, email_path):
        message = parse_emails(email_path)
        sent_at = parse(message['Date'])
        return (index, message, int(sent_at.timestamp()))

    return [_to_record(index, email_path) for index, email_path in enumerate(paths)]


# Parse every collected file once; each entry is (index, message, timestamp).
parsed_emails = get_parsed_emails(clean_emails)

## Transformation


In [5]:
def is_chain(payload):
    """Report whether a message body is part of a chain, and how deep it is.

    A body is treated as part of a chain when it contains the
    '-----Original Message-----' tag. The number of times the tag appears
    is reported as the depth of the chain.

    Args:
        payload: message body to inspect.

    Returns:
        A ``(chain_count, is_chain)`` tuple: how many tags were found and
        whether at least one was found.
    """
    marker = '-----Original Message-----'
    if marker not in payload:
        return (0, False)
    return (payload.count(marker), True)


def is_forwarded(email):
    """Return True when the message looks like a forward.

    A message counts as forwarded when 'fw:' or 'fwd:' appears, ignoring
    case, in either its subject or its body.

    Args:
        email: parsed message supporting ``email['Subject']`` and
            ``email.get_payload()``.

    Returns:
        bool: True if a forward tag was found, otherwise False.
    """
    subject = email['Subject']
    payload = email.get_payload()
    # A message without a Subject header yields None, and get_payload() can
    # return a non-string (e.g. a list of parts for a multipart message).
    # Treat both as empty text instead of failing on .lower().
    subject_text = '' if subject is None else str(subject).lower()
    body_text = payload.lower() if isinstance(payload, str) else ''
    return any(
        tag in body_text or tag in subject_text
        for tag in ('fw:', 'fwd:')
    )


def parseXHeaders(header):
    """Reduce an X-header value to its names, separated by '|'.

    Every ``<...>`` group (together with a directly following comma, if any)
    is replaced by a single '|'. One trailing '|' is then removed.

    Args:
        header: raw header value, or None when the header is absent.

    Returns:
        The reduced string, or None when *header* is None.
    """
    if header is None:
        return header
    names = re.sub(r'(<.*?>,|<.*?>)', '|', header)
    # Drop the last character only when it is the separator left behind by a
    # trailing <...> group; a header without one must keep its final character.
    if names.endswith('|'):
        names = names[:-1]
    return names


def get_emails_list(dir_path):
    """List the email files found anywhere below *dir_path*.

    Paths are selected with the recursive glob pattern '*.'. A path is left
    out when its parent directory path contains 'all_documents' or
    'discussion_threads'.

    Args:
        dir_path: ``pathlib.Path`` of the directory to search.

    Returns:
        A list of the matching paths.
    """
    excluded_folders = ('all_documents', 'discussion_threads')
    return [
        candidate
        for candidate in dir_path.rglob('*.')
        if not any(folder in str(candidate.parent) for folder in excluded_folders)
    ]


def parse_emails(path):
    """Parse a single raw email file into a message object.

    Args:
        path: location of the email file; it is read as windows-1252 text.

    Returns:
        The message object produced by ``email.message_from_file``.
    """
    with open(path, mode='r', encoding='windows-1252') as email_file:
        message = email.message_from_file(email_file)
        return message


def load_emails(dir_path):
    """Find and parse every email file below *dir_path*.

    Args:
        dir_path: directory handed to ``get_emails_list``.

    Returns:
        A list of ``(index, message)`` tuples, where *index* is the position
        of the file in the list returned by ``get_emails_list``.
    """
    return [
        (index, parse_emails(email_path))
        for index, email_path in enumerate(get_emails_list(dir_path))
    ]


# Marks where the newest message ends. It matches the first of:
#   - four or more underscores (plus the rest of that line),
#   - three or more consecutive newlines,
#   - an angle-bracket tag such as <...>,
#   - four or more hyphens followed, on the same line, by HH:MM:SS and AM/PM.
# Written as a raw string so that \d and \s are regex escapes rather than
# invalid string escape sequences.
_MSG_END_PATTERN = re.compile(
    r'_{4,}.*|\n{3,}|<[^>]*>|-{4,}(.*)(\d{2}:\d{2}:\d{2})\s*(PM|AM)',
    re.MULTILINE,
)


def preprocess_message(text):
    """Return the part of *text* that precedes the first end-of-message marker.

    Args:
        text: message body as a string.

    Returns:
        The text up to (not including) the first marker, or *text* unchanged
        when no marker is found.
    """
    end_marker = _MSG_END_PATTERN.search(text)
    if end_marker is None:  # no marker found: keep the whole message
        return text
    return text[:end_marker.start()]


def remove_spaces(string):
    """Collapse whitespace runs in *string* and split it on ', '.

    Args:
        string: text to split (for example an address header), or None.

    Returns:
        A list of the pieces between ', ' separators after every run of
        whitespace has been replaced by a single space, or None when
        *string* is None.
    """
    if string is None:
        return string
    # Raw string: \s must reach the regex engine as an escape, not be read
    # as an (invalid) string escape sequence.
    return re.sub(r'\s+', ' ', string).split(', ')


#
# Function: get_or_allocate_uid
# Arguments: name - string of a user email
# Returns: unique integer id; the id already stored in `users` for this name,
#          or a newly stored one equal to the number of names seen so far
#
def get_or_allocate_uid(name):
    # len(users) is evaluated before any insertion, so a new name receives
    # the next sequential id; a known name keeps the id it already has.
    return users.setdefault(name, len(users))

#
# Function: get_or_allocate_tid
# Arguments: name - string of email subject line
# Returns: unique integer id; subjects that differ only by reply/forward
#          prefixes ("RE: ", "Re: ", "FWD: ", "Fwd: ", "FW: ", "Fw: ")
#          share one id, which is stored in `threads`
#
def get_or_allocate_tid(name):
    # "FW: " / "Fw: " are stripped alongside the "FWD: " forms so a forwarded
    # subject maps to the same thread as the message it forwards.
    parsed_name = re.sub(r"(RE|Re|FWD|Fwd|FW|Fw): ", "", name)
    if parsed_name not in threads:
        threads[parsed_name] = len(threads)
    return threads[parsed_name]

In [None]:
# time = [(email[0], email[1]['Date']) for email in parsed_emails]
# subjects = [(email[0], email[1]['Subject']) for email in parsed_emails]
# recipients = [(email[0], remove_spaces(email[1]['To'])) for email in parsed_emails]
# senders = [(email[0], email[1]['From']) for email in parsed_emails]
# ccs = [(email[0], remove_spaces(email[1]['cc'])) for email in parsed_emails]
# bccs = [(email[0], remove_spaces(email[1]['bcc'])) for email in parsed_emails]
# # idx = [(email[0], email[0]) for email in parsed_emails]
# timestamps = [(email[0], email[2]) for email in parsed_emails]
# body = [(email[0], email[1].get_payload()) for email in parsed_emails]
# message = [(email[0], preprocess_message(email[1].get_payload()), email[2]) for email in parsed_emails]
# fwds = [(email[0], is_forwarded(email[1])) for email in parsed_emails]
# chains = [(email[0], is_chain(email[1].get_payload())) for email in parsed_emails]

In [None]:
# feeds = []
# users = {}
# threads = {}
# thread_users = {}
# user_threads = {}
# threads = {}
# thread_id = []
# for sbj in subject:
#     thread_id.append(get_or_allocate_tid(sbj))

## Loading


In [9]:
def export_emails(parsed_emails):
    """Pickle one list per extracted field into ``data_dir``.

    Each output file holds a list with one tuple per entry of
    *parsed_emails*; the first tuple element is always the entry's index.
    Files are written in the order listed below. Any exception stops the
    export and is printed rather than raised.

    Args:
        parsed_emails: sequence of ``(index, message, timestamp)`` tuples.
            The timestamp is only read for 'message.pkl'.
    """
    # (file name, per-entry extractor) pairs, in write order.
    exports = (
        ('time.pkl', lambda rec: (rec[0], rec[1]['Date'])),
        ('subjects.pkl', lambda rec: (rec[0], rec[1]['Subject'])),
        ('recipients.pkl', lambda rec: (rec[0], remove_spaces(rec[1]['To']))),
        ('senders.pkl', lambda rec: (rec[0], rec[1]['From'])),
        ('ccs.pkl', lambda rec: (rec[0], remove_spaces(rec[1]['cc']))),
        ('bbcs.pkl', lambda rec: (rec[0], remove_spaces(rec[1]['bcc']))),
        ('body.pkl', lambda rec: (rec[0], rec[1].get_payload())),
        ('message.pkl', lambda rec: (rec[0], preprocess_message(rec[1].get_payload()), rec[2])),
        ('fwds.pkl', lambda rec: (rec[0], is_forwarded(rec[1]))),
        ('chains.pkl', lambda rec: (rec[0], is_chain(rec[1].get_payload()))),
    )
    try:
        for file_name, extract in exports:
            with open(Path(data_dir, file_name), 'wb') as handle:
                # Build the rows only after the file is open, so the file is
                # created even if extraction fails part-way through.
                rows = [extract(rec) for rec in parsed_emails]
                pkl.dump(rows, handle, protocol=pkl.HIGHEST_PROTOCOL)
    except Exception as e:
        print("caught", repr(e))

In [10]:
%%time
# Write all pickle files into data_dir; export_emails prints any failure instead of raising it.
export_emails(parsed_emails)

CPU times: user 42.9 s, sys: 502 ms, total: 43.4 s
Wall time: 43.4 s


In [13]:
 !ls -l ../data/interim

total 987332
-rw-r--r-- 1 miguel miguel  11125929 Aug 29 06:31 bbcs.pkl
-rw-r--r-- 1 miguel miguel 638596963 Aug 29 06:31 body.pkl
-rw-r--r-- 1 miguel miguel  11125929 Aug 29 06:31 ccs.pkl
-rw-r--r-- 1 miguel miguel   3838138 Aug 29 06:31 chains.pkl
-rw-r--r-- 1 miguel miguel   2515202 Aug 29 06:31 fwds.pkl
-rw-r--r-- 1 miguel miguel 229120164 Aug 29 06:31 message.pkl
-rw-r--r-- 1 miguel miguel  52014250 Aug 29 06:31 recipients.pkl
-rw-r--r-- 1 miguel miguel  10602343 Aug 29 06:31 senders.pkl
-rw-r--r-- 1 miguel miguel  24004915 May  1 01:35 split_emails.csv
-rw-r--r-- 1 miguel miguel  12727057 Aug 29 06:31 subjects.pkl
-rw-r--r-- 1 miguel miguel  15324139 Aug 29 06:31 time.pkl
