In [1]:
import email
import pickle as pkl
import re

import numpy as np
import pandas as pd

from collections import Counter
from datetime import datetime, timedelta
from dateutil.parser import parse
from pathlib import Path

In [2]:
# Input (raw maildir tree) and output (interim artefacts) folders, both
# located under the parent of the current working directory.
email_dir = Path.cwd().parent / 'data/raw/maildir'
data_dir = Path.cwd().parent / 'data/interim'


def parse_emails(path):
    """Read one message file and return it as an ``email.message.Message``.

    The file at ``path`` is opened in text mode and decoded as
    windows-1252 before being handed to ``email.message_from_file``.
    """
    with open(path, 'r', encoding='windows-1252') as handle:
        return email.message_from_file(handle)


def get_parsed_emails(paths):
    """Parse each file in ``paths`` into ``(position, message, timestamp)``.

    ``position`` is the 0-based position of the path in ``paths``,
    ``message`` is what ``parse_emails`` returns for it, and ``timestamp``
    is the message's ``Date`` header parsed with ``dateutil`` and truncated
    to whole seconds since the epoch.

    NOTE: a second ``get_parsed_emails(paths, dic=None)`` is defined right
    after this one in the same cell and rebinds the name, so this version
    is not the one later cells call.
    """
    def _as_record(position, path):
        message = parse_emails(path)
        stamp = int(parse(message['Date']).timestamp())
        return position, message, stamp

    return [_as_record(position, path) for position, path in enumerate(paths)]


def get_parsed_emails(paths, dic=None):
    """Parse every file in ``paths`` and collect its fields column-wise.

    Parameters
    ----------
    paths : iterable of path-like
        Message files, each read with ``parse_emails``.
    dic : dict[str, list] or None
        Column name -> list to append to. One value per message is appended
        to every list (``None`` when the message lacks that field). Besides
        the message headers, two synthetic fields are available:
        ``'Message'`` (the payload) and ``'Timestamp'`` (the ``Date`` header
        parsed with ``dateutil`` and truncated to integer epoch seconds).
        When ``None`` (previously a crash on ``None.items()``), a fresh dict
        is built that holds every field seen, padded with ``None`` so all
        columns keep the same length.

    Returns
    -------
    dict[str, list]
        ``dic`` itself (mutated in place), or the newly built dict.
    """
    collect_all = dic is None
    if collect_all:
        dic = {}

    for n_done, path in enumerate(paths):
        eml = parse_emails(path)
        # Later duplicates of a header win; the synthetic fields are written
        # last so they override same-named headers, as before.
        record = dict(eml.items())
        record['Message'] = eml.get_payload()
        record['Timestamp'] = int(parse(eml['Date']).timestamp())

        if collect_all:
            # Back-fill newly discovered columns for the messages already seen.
            for name in record:
                dic.setdefault(name, [None] * n_done)

        for name, column in dic.items():
            column.append(record.get(name))
    return dic


In [3]:
def remove_spaces(string):
    """Split a comma-separated header value into a list of entries.

    Every run of whitespace (including the newline/tab of folded header
    lines) is collapsed to a single space, surrounding whitespace is
    trimmed so the first and last entries carry no padding, and the result
    is split on ``', '``.

    Returns ``None`` unchanged when ``string`` is ``None``.
    """
    if string is None:
        return None
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r'\s+', ' ', string).strip()
    return collapsed.split(', ')


def get_chain(data):
    """Group emails into chains per ``key``, splitting on non-reply emails.

    Rows are visited ordered by ``['key', 'Reply', 'Timestamp']``. Because
    ``Reply`` comes before ``Timestamp`` in that ordering, all non-reply
    rows of a key are visited before its reply rows: each non-reply row
    opens a new chain named ``"<key>_<NNN>"`` and every reply row is
    appended to the chain opened last for that key.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'key'``, ``'Timestamp'`` and ``'Reply'``.

    Returns
    -------
    dict
        ``{position: {'chain_id': name, 'length': n, 'email_ids': [labels]}}``
        where ``position`` counts chains in creation order and the labels
        are index labels of ``data``.
    """
    ordered = data.loc[:, ['key', 'Timestamp', 'Reply']].sort_values(by=['key', 'Reply', 'Timestamp'])
    chains = {}
    counter = 0
    for idx, row in ordered.iterrows():
        candidate = f"{row['key']}_{counter:03d}"
        known = candidate in chains
        if known and row['Reply']:
            # Reply to an already opened chain: extend it.
            current = chains[candidate]
            current['length'] += 1
            current['email_ids'].append(idx)
            continue
        # Either the first row of a key (numbering restarts at 0) or a
        # non-reply row of the same key (next chain number).
        counter = counter + 1 if known else 0
        chains[f"{row['key']}_{counter:03d}"] = {'length': 1, 'email_ids': [idx]}

    return {
        position: {'chain_id': name, **info}
        for position, (name, info) in enumerate(chains.items())
    }


def get_longest_chain(chain):
    """Return the chain(s) with the greatest ``'length'``.

    ``chain`` maps an id to a dict holding ``'length'`` and ``'email_ids'``.

    Returns a single ``(id, length, email_ids)`` tuple when one chain is
    strictly longest, a list of such tuples when several chains tie, and an
    empty list when no chain has a length above zero to start from.
    """
    best_length = 0
    winners = []
    for chain_id, info in chain.items():
        size = info['length']
        record = (chain_id, size, info['email_ids'])
        if size > best_length:
            # New strict maximum: drop whatever was collected so far.
            best_length = size
            winners = record
        elif size == best_length:
            if isinstance(winners, tuple):
                winners = [winners, record]
            else:
                winners.append(record)
    return winners


def assign_chain_id(data, chain):
    """Write each email's chain id and chain length into ``data`` in place.

    Two columns, ``'Chain'`` and ``'Chain_len'``, are (re)set to ``None``
    and then filled for every index label listed under ``'email_ids'`` of
    each entry in ``chain``; the dictionary key is used as the chain id.
    Nothing is returned.
    """
    for column in ('Chain', 'Chain_len'):
        data.loc[:, column] = None
    for chain_no, info in chain.items():
        members = info['email_ids']
        data.loc[members, 'Chain'] = chain_no
        data.loc[members, 'Chain_len'] = info['length']
        

In [4]:
%%time
parsed_emails_file = Path(data_dir, 'parsed_emails_chains_all.pkl')

if parsed_emails_file.is_file():
    df = pd.read_pickle(parsed_emails_file)
else:
    clean_emails = []
    for path in email_dir.rglob('*.'):
        if 'all_documents' not in str(path.parent) and 'discussion_threads' not in str(path.parent):
            clean_emails.append(path)
    email_dict = {
        'Message-ID': [],
        'Date': [],
        'From': [],
        'To': [],
        'Subject': [],
        'Cc': [],
        'Mime-Version': [],
        'Content-Type': [],
        'Content-Transfer-Encoding': [],
        'Bcc': [],
        'X-From': [],
        'X-To': [],
        'X-cc': [],
        'X-bcc': [],
        'X-Folder': [],
        'X-Origin': [],
        'X-FileName': [],
        'Message': [],
        'Timestamp': []
    }
    df = pd.DataFrame(get_parsed_emails(clean_emails, email_dict))
    df = df.loc[:, ['Subject', 'From', 'Message','To','Cc','Bcc','Timestamp']].drop_duplicates()
    df = df.sort_values(by='Timestamp').reset_index()
    df['to'] = df['To'].apply(lambda x: [] if x is None else remove_spaces(x))
    df['cc'] = df['Cc'].apply(lambda x: [] if x is None else remove_spaces(x))
    df['bcc'] = df['Bcc'].apply(lambda x: [] if x is None else remove_spaces(x))
    df['users'] = df['From'].apply(lambda x: [x]) + df['to'] + df['cc'] + df['bcc']
    df['users'] = df['users'].apply(lambda x: sorted(x))
    df['subject'] = df['Subject'].str.replace('(re\W*:|FW\W+|FWD\W*:)', '', regex=True, flags=re.IGNORECASE).str.strip()
    df['subject'] = df['subject'].apply(lambda x: ['None'] if x == '' else [x])
    df['key'] = df['subject'] + df['users']
    df['key'] = df['key'].agg('_'.join)
    msg_end_pattern = re.compile('_{4,}.*|\n{3,}|<[^>]*>|-{4,}(.*)(\d{2}:\d{2}:\d{2})\s*(PM|AM)', re.MULTILINE)
    df['Message'] = df['Message'].str.replace(msg_end_pattern, '', regex=True)
    df['Message'] = df['Message'].str.replace('(?=-----Original Message-----)(?s)(.*$)', '', regex=True)
    df['Message'] = df['Message'].replace(r'^\s*$', np.nan, regex=True)
    df['Reply'] = False
    df.loc[df['Subject'].str.contains('(re\W*:)', flags=re.IGNORECASE, regex=True),'Reply'] = True
    chain_rlx = get_chain(df)
    assign_chain_id(df, chain_rlx)
    df.to_pickle(parsed_emails_file)




CPU times: user 3min 6s, sys: 4.06 s, total: 3min 10s
Wall time: 3min 9s


In [5]:
# Column overview (non-null counts, dtypes) with deep memory accounting.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251068 entries, 0 to 251067
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   index      251068 non-null  int64 
 1   Subject    251068 non-null  object
 2   From       251068 non-null  object
 3   Message    243822 non-null  object
 4   To         241721 non-null  object
 5   Cc         58853 non-null   object
 6   Bcc        58853 non-null   object
 7   Timestamp  251068 non-null  int64 
 8   to         251068 non-null  object
 9   cc         251068 non-null  object
 10  bcc        251068 non-null  object
 11  users      251068 non-null  object
 12  subject    251068 non-null  object
 13  key        251068 non-null  object
 14  Reply      251068 non-null  bool  
 15  Chain      251068 non-null  object
 16  Chain_len  251068 non-null  object
dtypes: bool(1), int64(2), object(14)
memory usage: 648.8 MB


In [6]:
# Emails that belong to a chain containing at least one reply (cached).
emails_file = Path(data_dir) / 'parsed_emails_chain_replies.pkl'
if not emails_file.is_file():
    # Ids of the chains that hold at least one email flagged as a reply.
    chains_reply = df.loc[df['Reply'], 'Chain'].dropna().unique()
    df_re = df.loc[df['Chain'].isin(chains_reply)]
    df_re.to_pickle(emails_file)
else:
    df_re = pd.read_pickle(emails_file)

In [10]:
# List the columns of the chain-annotated frame.
df.columns

Index(['index', 'Subject', 'From', 'Message', 'To', 'Cc', 'Bcc', 'Timestamp',
       'to', 'cc', 'bcc', 'users', 'subject', 'key', 'Reply', 'Chain',
       'Chain_len'],
      dtype='object')

In [7]:
# Spot-check five hard-coded row labels; in the cached run shown below they
# all share the same 'Trades' key, which shows how chains were assigned.
df.loc[[218584, 229391, 231772, 231855, 251066]]

Unnamed: 0,index,Subject,From,Message,To,Cc,Bcc,Timestamp,to,cc,bcc,users,subject,key,Reply,Chain,Chain_len
218584,98145,Trades,cramer@cadvision.com,"\nMonday dec 17 \n2002 traded 40, 40.25 and 40...",john.zufferli@enron.com,,,1008957019,[john.zufferli@enron.com],[],[],"[cramer@cadvision.com, john.zufferli@enron.com]",[Trades],Trades_cramer@cadvision.com_john.zufferli@enro...,False,187752,1
229391,98151,Trades,cramer@cadvision.com,\nBom traded 31 for 100 MW across the morning ...,john.zufferli@enron.com,,,1011221228,[john.zufferli@enron.com],[],[],"[cramer@cadvision.com, john.zufferli@enron.com]",[Trades],Trades_cramer@cadvision.com_john.zufferli@enro...,False,187753,1
231772,97768,Trades,john.zufferli@enron.com,Can you send me the Jan 18 trades and marks as...,cramer@cadvision.com,,,1011721889,[cramer@cadvision.com],[],[],"[cramer@cadvision.com, john.zufferli@enron.com]",[Trades],Trades_cramer@cadvision.com_john.zufferli@enro...,False,187754,1
231855,98146,Re: Trades,cramer@cadvision.com,"Friday Jan 18 ,bom traded 29\nfeb traded away ...",john.zufferli@enron.com,,,1011727673,[john.zufferli@enron.com],[],[],"[cramer@cadvision.com, john.zufferli@enron.com]",[Trades],Trades_cramer@cadvision.com_john.zufferli@enro...,True,187755,2
251066,98158,Trades,cramer@cadvision.com,"\nBOM 5th to 31st traded 34, 33.5 , 33.5 and ...",john.zufferli@enron.com,,,2335539586,[john.zufferli@enron.com],[],[],"[cramer@cadvision.com, john.zufferli@enron.com]",[Trades],Trades_cramer@cadvision.com_john.zufferli@enro...,False,187755,2


In [8]:
# Order the emails by the length of the chain they belong to, longest first.
df.sort_values('Chain_len', ascending=False)

Unnamed: 0,index,Subject,From,Message,To,Cc,Bcc,Timestamp,to,cc,bcc,users,subject,key,Reply,Chain,Chain_len
204542,135713,RE:,michelle.nelson@enron.com,i am? so i am freaking out about enron again....,mike.maggi@enron.com,,,1006355200,[mike.maggi@enron.com],[],[],"[michelle.nelson@enron.com, mike.maggi@enron.com]",[None],None_michelle.nelson@enron.com_mike.maggi@enro...,True,131203,798
203051,135813,RE:,michelle.nelson@enron.com,why would i want it if it's crap?\n\n,mike.maggi@enron.com,,,1006278119,[mike.maggi@enron.com],[],[],"[michelle.nelson@enron.com, mike.maggi@enron.com]",[None],None_michelle.nelson@enron.com_mike.maggi@enro...,True,131203,798
203186,136574,RE:,michelle.nelson@enron.com,:) i cna't be calm. it's just not possible si...,mike.maggi@enron.com,,,1006281302,[mike.maggi@enron.com],[],[],"[michelle.nelson@enron.com, mike.maggi@enron.com]",[None],None_michelle.nelson@enron.com_mike.maggi@enro...,True,131203,798
203189,134995,RE:,mike.maggi@enron.com,i dont like the sir thing\n\n,michelle.nelson@enron.com,,,1006281378,[michelle.nelson@enron.com],[],[],"[michelle.nelson@enron.com, mike.maggi@enron.com]",[None],None_michelle.nelson@enron.com_mike.maggi@enro...,True,131203,798
203191,135011,RE:,michelle.nelson@enron.com,what's wrong with it sir?\n\n,mike.maggi@enron.com,,,1006281405,[mike.maggi@enron.com],[],[],"[michelle.nelson@enron.com, mike.maggi@enron.com]",[None],None_michelle.nelson@enron.com_mike.maggi@enro...,True,131203,798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91905,18812,RTO I/A terms can supersede current I/A,christi.nicolay@enron.com,"On 3/14/01, FERC upheld a provision in a utili...","jeff.brown@enron.com, ron.mcnamara@enron.com, ...",,,985269780,"[jeff.brown@enron.com, ron.mcnamara@enron.com,...",[],[],"[alan.comnes@enron.com, aleck.dadson@enron.com...",[RTO I/A terms can supersede current I/A],RTO I/A terms can supersede current I/A_alan.c...,False,152563,1
91906,131667,RE: Killer Bees Dinner,rick.buy@enron.com,thats really below the belt\n\n,"jeffrey.mcmahon@enron.com, mark.koenig@enron.com",,,985269973,"[jeffrey.mcmahon@enron.com, mark.koenig@enron....",[],[],"[jeffrey.mcmahon@enron.com, mark.koenig@enron....",[Killer Bees Dinner],Killer Bees Dinner_jeffrey.mcmahon@enron.com_m...,True,97768,1
91907,18336,Protest to Amendment 36 Compliance Filing,rcarroll@bracepatt.com,This too has to be filed today. Please give m...,"jhartso@enron.com, mary.hain@enron.com, smara@...",,,985270320,"[jhartso@enron.com, mary.hain@enron.com, smara...",[],[],"[jhartso@enron.com, mary.hain@enron.com, rcarr...",[Protest to Amendment 36 Compliance Filing],Protest to Amendment 36 Compliance Filing_jhar...,False,149486,1
91912,210223,Re: New Albany Plant - revised,dan.hyvl@enron.com,"Jim,\n I will work on the Transaction Agreemen...",jim.homco@enron.com,,,985270440,[jim.homco@enron.com],[],[],"[dan.hyvl@enron.com, jim.homco@enron.com]",[New Albany Plant - revised],New Albany Plant - revised_dan.hyvl@enron.com_...,True,117948,1


In [None]:
# Not-original messages: emails whose body quotes an earlier message
# ("-----Original Message-----"), i.e. responses to internal or external mail.
emails_file = Path(data_dir, 'parsed_emails_not_original.pkl')

if emails_file.is_file():
    df_no = pd.read_pickle(emails_file)
else:
    # 'Message' holds NaN for blank bodies; without na=False the mask
    # contains NaN and .loc rejects it as a non-boolean indexer.
    df_no = df.loc[df['Message'].str.contains('-----Original Message-----', na=False),:]
    df_no.to_pickle(emails_file)

In [None]:
# Replied emails: the subject carries a "re:" marker (case-insensitive).
emails_file = Path(data_dir, 'parsed_emails_replies.pkl')
if emails_file.is_file():
    df_re = pd.read_pickle(emails_file)
else:
    # .copy() gives df_re its own data, so the assignment below does not
    # write into a slice of df (SettingWithCopyWarning).
    df_re = df.loc[df['Subject'].str.contains(r're\W*:', flags=re.IGNORECASE, regex=True),:].copy()
    df_re['Subject'] = df_re['Subject'].str.strip()
    df_re.to_pickle(emails_file)

In [None]:
%%time
msg_end_pattern = re.compile('_{4,}.*|\n{3,}|<[^>]*>|-{4,}(.*)(\d{2}:\d{2}:\d{2})\s*(PM|AM)', re.MULTILINE)
df['Message'] = df['Message'].str.replace(msg_end_pattern, '', regex=True)
df['Message'] = df['Message'].str.replace('(?=-----Original Message-----)(?s)(.*$)', '', regex=True)
df['Message'] = df['Message'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Forwarded messages: the subject carries an "FW" or "FWD:" marker.
emails_file = Path(data_dir, 'parsed_emails_forward.pkl')

if emails_file.is_file():
    df_fw = pd.read_pickle(emails_file)
else:
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    df_fw = df.loc[df['Subject'].str.contains(r'FW\W+|FWD\W*:', flags=re.IGNORECASE, regex=True),:]
    df_fw.to_pickle(emails_file)

In [None]:
# Chain starts: emails assumed to open a chain because they are neither
# forwarded, nor replies, nor not-original messages.
emails_file = Path(data_dir) / 'parsed_emails_chain_fathers.pkl'

if not emails_file.is_file():
    df_fa = df[~(df.index.isin(df_no.index) | df.index.isin(df_re.index) | df.index.isin(df_fw.index))]
    df_fa.to_pickle(emails_file)
else:
    df_fa = pd.read_pickle(emails_file)

# df_fa.info(memory_usage='deep')

In [None]:
# Chain children: emails that are forwarded, replies, or not-original
# messages (the complement of the chain starts).
emails_file = Path(data_dir) / 'parsed_emails_chain_children.pkl'

if not emails_file.is_file():
    df_so = df[df.index.isin(df_no.index) | df.index.isin(df_re.index) | df.index.isin(df_fw.index)]
    df_so.to_pickle(emails_file)
else:
    df_so = pd.read_pickle(emails_file)
    
# df_so.info(memory_usage='deep')

In [None]:
# Child-email subjects with reply/forward markers removed and whitespace
# trimmed. Raw string: '\W' in a plain literal is an invalid escape sequence.
subjects = df_so['Subject'].str.replace(r'(re\W*:|FW\W+|FWD\W*:)','', regex=True ,flags=re.IGNORECASE).str.strip()

In [None]:
# Whitespace-trimmed subjects of the chain-start candidates in df_fa.
subjects_fa = df_fa['Subject'].str.strip()

In [None]:
# Subjects that are candidates to be a chain: cleaned child subjects that
# also occur among the chain-start subjects.
# The cleaned child subjects are stored in `subjects`; `subjects_so` is not
# defined in any visible cell and raised NameError here.
subjects_ch = subjects[subjects.isin(subjects_fa)]

Assumptions for chains:

* Strict: an email chain must involve all the users
* Relaxed: of all the users, only those who were direct "To" and "From" users are considered part of the chain
* Emails without a recipient are not considered part of a chain

In [None]:
# Strict

In [None]:


def preprocess_message(text):
    """Return ``text`` cut off at the first end-of-message marker.

    A marker is any of: a run of four or more underscores, three or more
    consecutive newlines, a ``<...>`` tag, or a run of four or more dashes
    followed later on the same line by an ``HH:MM:SS`` time and ``AM``/``PM``.
    When no marker is found the text is returned unchanged.
    """
    # Raw string: '\d' and '\s' are invalid escapes in a plain literal.
    msg_end_pattern = re.compile(
        r'_{4,}.*|\n{3,}|<[^>]*>|-{4,}(.*)(\d{2}:\d{2}:\d{2})\s*(PM|AM)',
        re.MULTILINE,
    )
    found = msg_end_pattern.search(text)
    if found is None:  # no marker, e.g. not a reply
        return text
    return text[:found.start()]


def remove_spaces(string):
    """Split a comma-separated header value into a list of entries.

    Every run of whitespace (including the newline/tab of folded header
    lines) is collapsed to a single space, surrounding whitespace is
    trimmed so the first and last entries carry no padding, and the result
    is split on ``', '``.

    Returns ``None`` unchanged when ``string`` is ``None``.
    """
    if string is None:
        return None
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r'\s+', ' ', string).strip()
    return collapsed.split(', ')



#check to see if an email is forwarded by looking for 'fw' and 'fwd' tags
def is_forwarded(email):
    """Tell whether a parsed message looks forwarded.

    The message counts as forwarded when ``'fw:'`` or ``'fwd:'`` occurs
    (case-insensitively) in its subject or in its text payload.

    Parameters
    ----------
    email : email.message.Message
        Parsed message. A missing ``Subject`` header is treated as empty,
        and for multipart messages the string payloads of all parts are
        searched (both cases previously raised ``AttributeError``).

    Returns
    -------
    bool
    """
    subject = email['Subject']
    subject_text = '' if subject is None else str(subject).lower()
    # walk() yields the message itself for non-multipart mail, so the plain
    # case still searches exactly the payload string.
    payloads = (part.get_payload() for part in email.walk())
    body_text = '\n'.join(p for p in payloads if isinstance(p, str)).lower()
    return any(
        tag in body_text or tag in subject_text
        for tag in ('fw:', 'fwd:')
    )

In [None]:
# NOTE(review): `parsed_emails` is not defined in any cell visible here, so
# this line depends on leftover kernel state and would raise NameError on a
# fresh kernel unless it is defined elsewhere -- confirm or remove.
emails = parsed_emails

In [None]:
# Input (raw maildir tree) and output (interim artefacts) folders, both
# located under the parent of the current working directory.
email_dir = Path.cwd().parent / 'data/raw/maildir'
data_dir = Path.cwd().parent / 'data/interim'


def parse_emails(path):
    """Read one message file and return it as an ``email.message.Message``.

    The file at ``path`` is opened in text mode and decoded as
    windows-1252 before being handed to ``email.message_from_file``.
    """
    with open(path, 'r', encoding='windows-1252') as handle:
        return email.message_from_file(handle)


def get_parsed_emails(paths):
    """Parse each file in ``paths`` into ``(position, message, timestamp)``.

    ``position`` is the 0-based position of the path in ``paths``,
    ``message`` is what ``parse_emails`` returns for it, and ``timestamp``
    is the message's ``Date`` header parsed with ``dateutil`` and truncated
    to whole seconds since the epoch.

    NOTE: a second ``get_parsed_emails(paths, dic=None)`` is defined right
    after this one in the same cell and rebinds the name, so this version
    is not the one later cells call.
    """
    def _as_record(position, path):
        message = parse_emails(path)
        stamp = int(parse(message['Date']).timestamp())
        return position, message, stamp

    return [_as_record(position, path) for position, path in enumerate(paths)]


def get_parsed_emails(paths, dic=None):
    """Parse every file in ``paths`` and collect its fields column-wise.

    Parameters
    ----------
    paths : iterable of path-like
        Message files, each read with ``parse_emails``.
    dic : dict[str, list] or None
        Column name -> list to append to. One value per message is appended
        to every list (``None`` when the message lacks that field). Besides
        the message headers, two synthetic fields are available:
        ``'Message'`` (the payload) and ``'Timestamp'`` (the ``Date`` header
        parsed with ``dateutil`` and truncated to integer epoch seconds).
        When ``None`` (previously a crash on ``None.items()``), a fresh dict
        is built that holds every field seen, padded with ``None`` so all
        columns keep the same length.

    Returns
    -------
    dict[str, list]
        ``dic`` itself (mutated in place), or the newly built dict.
    """
    collect_all = dic is None
    if collect_all:
        dic = {}

    for n_done, path in enumerate(paths):
        eml = parse_emails(path)
        # Later duplicates of a header win; the synthetic fields are written
        # last so they override same-named headers, as before.
        record = dict(eml.items())
        record['Message'] = eml.get_payload()
        record['Timestamp'] = int(parse(eml['Date']).timestamp())

        if collect_all:
            # Back-fill newly discovered columns for the messages already seen.
            for name in record:
                dic.setdefault(name, [None] * n_done)

        for name, column in dic.items():
            column.append(record.get(name))
    return dic


In [None]:
%%time
parsed_emails_file = Path(data_dir, 'parsed_emails_no_duplicates.pkl')

if parsed_emails_file.is_file():
    df = pd.read_pickle(parsed_emails_file)
else:
    clean_emails = []
    for path in email_dir.rglob('*.'):
        # if 'all_documents' not in str(path.parent) and 'discussion_threads' not in str(path.parent):
        clean_emails.append(path)
    email_dict = {
        'Message-ID': [],
        'Date': [],
        'From': [],
        'To': [],
        'Subject': [],
        'Cc': [],
        'Mime-Version': [],
        'Content-Type': [],
        'Content-Transfer-Encoding': [],
        'Bcc': [],
        'X-From': [],
        'X-To': [],
        'X-cc': [],
        'X-bcc': [],
        'X-Folder': [],
        'X-Origin': [],
        'X-FileName': [],
        'Message': [],
        'Timestamp': []
    }
    df = pd.DataFrame(get_parsed_emails(clean_emails, email_dict))
    df = df.loc[:,['Subject', 'From', 'Message','To','Cc','Bcc','Timestamp']].drop_duplicates()
    df = df.sort_values(by='Timestamp').reset_index()
    df.to_pickle(parsed_emails_file)



In [None]:
def remove_spaces(string):
    """Split a comma-separated header value into a list of entries.

    Every run of whitespace (including the newline/tab of folded header
    lines) is collapsed to a single space, surrounding whitespace is
    trimmed so the first and last entries carry no padding, and the result
    is split on ``', '``.

    Returns ``None`` unchanged when ``string`` is ``None``.
    """
    if string is None:
        return None
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r'\s+', ' ', string).strip()
    return collapsed.split(', ')


def get_chain(data):
    """Group emails into chains purely by their ``'key'`` value.

    Rows whose key splits on ``'_'`` into two pieces or fewer are skipped.
    All remaining rows sharing a key form one chain.

    Returns
    -------
    dict
        ``{position: {'chain_id': key, 'length': n, 'email_ids': [labels]}}``
        with chains numbered in order of first appearance and the labels
        taken from the index of ``data``.
    """
    grouped = {}
    for idx, row in data.iterrows():
        key = row['key']
        if len(key.split('_')) <= 2:
            continue
        entry = grouped.setdefault(key, {'length': 0, 'email_ids': []})
        entry['length'] += 1
        entry['email_ids'].append(idx)

    return {
        position: {'chain_id': key, **info}
        for position, (key, info) in enumerate(grouped.items())
    }


def get_longest_chain(chain):
    """Return the chain(s) with the greatest ``'length'``.

    ``chain`` maps an id to a dict holding ``'length'`` and ``'email_ids'``.

    Returns a single ``(id, length, email_ids)`` tuple when one chain is
    strictly longest, a list of such tuples when several chains tie, and an
    empty list when no chain has a length above zero to start from.
    """
    best_length = 0
    winners = []
    for chain_id, info in chain.items():
        size = info['length']
        record = (chain_id, size, info['email_ids'])
        if size > best_length:
            # New strict maximum: drop whatever was collected so far.
            best_length = size
            winners = record
        elif size == best_length:
            if isinstance(winners, tuple):
                winners = [winners, record]
            else:
                winners.append(record)
    return winners


def assign_chain_id(data, chain):
    """Write each email's chain id and chain length into ``data`` in place.

    Two columns, ``'Chain'`` and ``'Chain_len'``, are (re)set to ``None``
    and then filled for every index label listed under ``'email_ids'`` of
    each entry in ``chain``; the dictionary key is used as the chain id.
    Nothing is returned.
    """
    for column in ('Chain', 'Chain_len'):
        data.loc[:, column] = None
    for chain_no, info in chain.items():
        members = info['email_ids']
        data.loc[members, 'Chain'] = chain_no
        data.loc[members, 'Chain_len'] = info['length']
        