In [1]:
import sys
import os
import io

import pyfreeling

import os
from email.parser import Parser
import email.utils

import time

import re

In [2]:
basedir = "../maildir/lay-k"  # Change root dir to affect how many mails are touched

In [21]:
def messageIDtoSubject(mail_dict, messageID):
    return mail_dict[messageID]["subject"].replace(" ", "")


def raw_parse(inputfile, email_list):
    with open(inputfile, "r",encoding="utf-8",errors="ignore") as f:
        data = f.read()
    parsedEmail = Parser().parsestr(data)
    timestamp = time.mktime(email.utils.parsedate(parsedEmail["date"]))
    email_list.append((timestamp, parsedEmail))


def obtain_raw_threads(mail_dict, email_list):
    subject_dict = {}
    rootMailSet = set()
    auxiliarRootStructure = {}

    for mail in email_list:
        actualEmail = mail[1]
        true_subject = actualEmail['subject']
        true_subject = true_subject.replace(" ", "")
        mail_dict[actualEmail["message-id"]] = actualEmail
        if "Re:" != true_subject[0:3]:
            auxiliarRootStructure[true_subject] = actualEmail["message-id"]
            if true_subject not in subject_dict:
                subject_dict[true_subject] = []
        else:
            new_subject = true_subject.replace("Re:", "")
            if new_subject in subject_dict:
                subject_dict[new_subject].append(actualEmail["message-id"])
                rootMailSet.add(auxiliarRootStructure[new_subject])
            subject_dict[true_subject] = []
    threads = {}
    for mail in rootMailSet:
        subject = messageIDtoSubject(mail_dict, mail)
        childThread = subject_dict[subject]
        threads[mail] = childThread
    return threads


def preprocess_recipients(recipient):
    if recipient is not None:
        return re.sub(r'\s+', '', recipient).split(',')
    else:
        return None


def obtain_mails_features(email_list):
    email_dict = {}
    for num, mail in email_list:
        email_dict[mail['message-id']] = {
            'from': mail['from'],
            # 'to': re.sub(r'\s+', '', mail['to']).split(','),
            'to': preprocess_recipients(mail['to']),
            'date': mail['date'],
            'features': {}
        }
    return email_dict

In [4]:
mail_dict = {}
email_list = []

for directory, subdirectory, filenames in os.walk(basedir):
    for filename in filenames:
        raw_parse(os.path.join(directory, filename), email_list)
email_list.sort(key=lambda x: x[0])
pureThreads = obtain_raw_threads(mail_dict, email_list)

In [25]:
obtain_mails_features(email_list)

{'<3823157.1075840203657.JavaMail.evans@thyme>': {'from': 'tori.wells@enron.com',
  'to': ['maxwells@train.missouri.org'],
  'date': 'Mon, 31 Dec 1979 16:00:00 -0800 (PST)',
  'features': {}},
 '<6457299.1075840201254.JavaMail.evans@thyme>': {'from': 'rosalee.fleming@enron.com',
  'to': ['rodolfo.acevedo@enron.com',
   'lisa.alderman@enron.com',
   'teri.alexander@enron.com',
   'lauri.allen@enron.com',
   'sherry.anastas@enron.com',
   'alan.aronowitz@enron.com',
   'bani.arora@enron.com',
   'finney.attasseril@enron.com',
   'anthony.austin@enron.com',
   'richard.babin@enron.com',
   'natalie.baker@enron.com',
   'ann.ballard@enron.com',
   'david.bargainer@enron.com',
   'carolyn.barrett@enron.com',
   'cynthia.barrow@enron.com',
   'gilda.bartz@enron.com',
   'russell.baumbach@enron.com',
   'stacy.beisel@enron.com',
   'katherine.benedict@enron.com',
   'clarence.berg@enron.com',
   'mark.bernstein@enron.com',
   'michael.bilberry@enron.com',
   'donna.bily@enron.com',
   'anthon

In [19]:
for num, mail in email_list:
    if mail['To'] is None:
        print(mail['message-id'], mail['To'])

<12804123.1075840201286.JavaMail.evans@thyme> None
<8845078.1075840230864.JavaMail.evans@thyme> None
<28192848.1075840255559.JavaMail.evans@thyme> None
<14329510.1075840201313.JavaMail.evans@thyme> None
<13097507.1075840230888.JavaMail.evans@thyme> None
<15070162.1075840255583.JavaMail.evans@thyme> None
<14116828.1075840201487.JavaMail.evans@thyme> None
<13246582.1075840231066.JavaMail.evans@thyme> None
<32908873.1075840255764.JavaMail.evans@thyme> None
<27359015.1075840205917.JavaMail.evans@thyme> None
<4608447.1075840257106.JavaMail.evans@thyme> None
<5437874.1075840233434.JavaMail.evans@thyme> None
<28048744.1075840206473.JavaMail.evans@thyme> None
<30252646.1075840257604.JavaMail.evans@thyme> None
<31087098.1075840233951.JavaMail.evans@thyme> None
<9034650.1075840206496.JavaMail.evans@thyme> None
<5920290.1075840257628.JavaMail.evans@thyme> None
<4799333.1075840233974.JavaMail.evans@thyme> None
<20832957.1075840206674.JavaMail.evans@thyme> None
<31470479.1075840257786.JavaMail.evan

In [18]:
for num, mail in email_list:
    if not isinstance(mail['To'], str):
        print(mail['message-id'], mail['To'])

<12804123.1075840201286.JavaMail.evans@thyme> None
<8845078.1075840230864.JavaMail.evans@thyme> None
<28192848.1075840255559.JavaMail.evans@thyme> None
<14329510.1075840201313.JavaMail.evans@thyme> None
<13097507.1075840230888.JavaMail.evans@thyme> None
<15070162.1075840255583.JavaMail.evans@thyme> None
<14116828.1075840201487.JavaMail.evans@thyme> None
<13246582.1075840231066.JavaMail.evans@thyme> None
<32908873.1075840255764.JavaMail.evans@thyme> None
<27359015.1075840205917.JavaMail.evans@thyme> None
<4608447.1075840257106.JavaMail.evans@thyme> None
<5437874.1075840233434.JavaMail.evans@thyme> None
<28048744.1075840206473.JavaMail.evans@thyme> None
<30252646.1075840257604.JavaMail.evans@thyme> None
<31087098.1075840233951.JavaMail.evans@thyme> None
<9034650.1075840206496.JavaMail.evans@thyme> None
<5920290.1075840257628.JavaMail.evans@thyme> None
<4799333.1075840233974.JavaMail.evans@thyme> None
<20832957.1075840206674.JavaMail.evans@thyme> None
<31470479.1075840257786.JavaMail.evan

In [None]:
obtain_mails_features(email_list)

In [None]:
print(mail)

In [None]:
mail['Cc']

In [22]:
email_dict = {}
for num, mail in email_list[:5]:
    
    email_dict[mail['message-id']] = {
        'from': mail['from'],
        'to': prep_user_mails(mail['to']),
        'date': mail['date'],
        'features': {}
    }

NameError: name 'prep_user_mails' is not defined

In [None]:
print(email_dict)