In [2]:
import email
import re

from pathlib import Path
from collections import Counter

In [3]:
email_dir = Path(Path.cwd().parent, Path('data/raw/maildir'))

In [7]:
def get_emails_list(dir_path):
    clean_emails = []
    for path in dir_path.rglob('*.'):
        if 'all_documents' not in str(path.parent) and 'discussion_threads' not in str(path.parent):
            clean_emails.append(path)
    return clean_emails


def parse_emails(path):
    with open(path, 'r', encoding='windows-1252') as f:
        parsed_email = email.message_from_file(f)
    return parsed_email


def load_emails(dir_path):
    parsed_emails = []
    clean_emails = get_emails_list(dir_path)
    for i, path in enumerate(clean_emails):
        parsed_emails.append((i,parse_emails(path)))
    return parsed_emails


def preprocess_message(text):
    msg_end_pattern = re.compile('_{4,}.*|\n{3,}|<[^>]*>|-{4,}(.*)(\d{2}:\d{2}:\d{2})\s*(PM|AM)', re.MULTILINE)
    try:
        msg_end_iter = msg_end_pattern.search(text).start()
        # print('end of line:', msg_end_iter)
        message = text[:msg_end_iter]
    except AttributeError: # not a reply
        message = text
    return message

In [8]:
parsed_emails = load_emails(email_dir)

In [10]:
lengths = [(text[0], text[1]['message-id'],len(text[1].get_payload())) for text in parsed_emails]
large_emails = sorted(lengths, key=lambda l: l[2], reverse=True)[:10]
large_emails

[(283840, '<24675364.1075840403182.JavaMail.evans@thyme>', 2011422),
 (258598, '<404977.1075840712968.JavaMail.evans@thyme>', 1697165),
 (9281, '<30250907.1075852373160.JavaMail.evans@thyme>', 1621936),
 (211173, '<12090449.1075863678315.JavaMail.evans@thyme>', 1371385),
 (296093, '<13839356.1075862344682.JavaMail.evans@thyme>', 820853),
 (249493, '<9344275.1075845036063.JavaMail.evans@thyme>', 537014),
 (107812, '<28543449.1075841171541.JavaMail.evans@thyme>', 392883),
 (107383, '<30853644.1075841127218.JavaMail.evans@thyme>', 329948),
 (313851, '<5645767.1075852257668.JavaMail.evans@thyme>', 322266),
 (108000, '<3540098.1075841167699.JavaMail.evans@thyme>', 304544)]

In [19]:
lengths = [(text[0], text[1]['message-id'], len(preprocess_message(text[1].get_payload()))) for text in parsed_emails]
large_emails = sorted(lengths, key=lambda l: l[2], reverse=True)[:10]
large_emails

[(235016, '<26331558.1075841479333.JavaMail.evans@thyme>', 248665),
 (216429, '<12793391.1075840062316.JavaMail.evans@thyme>', 210700),
 (95399, '<16931220.1075843756579.JavaMail.evans@thyme>', 208766),
 (97025, '<10915312.1075843742937.JavaMail.evans@thyme>', 185780),
 (140462, '<1288782.1075840360512.JavaMail.evans@thyme>', 174478),
 (224255, '<26457604.1075840175257.JavaMail.evans@thyme>', 174478),
 (140323, '<17960685.1075840362627.JavaMail.evans@thyme>', 174230),
 (140366, '<9576844.1075840364730.JavaMail.evans@thyme>', 173427),
 (224471, '<5953458.1075840189268.JavaMail.evans@thyme>', 173427),
 (295160, '<23957807.1075860995169.JavaMail.evans@thyme>', 85504)]