In [1]:
import pandas as pd
import os,sys,email,re
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')
import codecs

In [2]:
email_data = pd.read_csv('Enron_data/email_data_sample_100.csv')

In [3]:
email_data.columns

Index(['Unnamed: 0', 'file', 'message'], dtype='object')

In [4]:
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = frozenset(map(lambda x: x.strip(), addrs))
    else:
        addrs = None
    return addrs

In [5]:
messages = list(map(email.message_from_string, email_data['message']))
email_data.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    email_data[key] = [doc[key] for doc in messages]
# Parse content from emails
email_data['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
email_data['From'] = email_data['From'].map(split_email_addresses)
email_data['To'] = email_data['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
email_data['user'] = email_data['file'].map(lambda x:x.split('/')[0])
del messages

In [6]:
email_data.drop(columns='Unnamed: 0',axis=1,inplace=True)

In [7]:
email_data.columns

Index(['file', 'Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version',
       'Content-Type', 'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc',
       'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'content', 'user'],
      dtype='object')

#### Removing headers & footers from email body

In [73]:
def generate_text(sentences, nlp, outfile, threshold=0.9):
    new_email=[]
    for sentence in sentences:
        if sentence.startswith('EDRM Enron Email Data Set has been produced in EML'):
           continue
        if _prob_block(sentence, nlp) < threshold:
            new_email.append(sentence)
    outfile.append(''.join(new_email))

                
def _prob_block(sentence, nlp):
    try:
        sentence = str(sentence)
        parsed_data = nlp(sentence)
        for span in parsed_data.sents:
            sent = [parsed_data[i] for i in range(span.start, span.end)]
        non_verbs = np.sum([token.pos_ != 'VERB' for token in sent])
        total = len(sent)
#         print(non_verbs,total)
        return float(non_verbs) / total
    except:
        return 0
    
    
def remove_foot_head(emails):
    outfile = []
    for text in emails:
        text_a = str(text)
        text_a = str(text_a).split('\r\n')
#         print(text_a)
        generate_text(text_a,nlp,outfile,.9)
    return outfile

In [74]:
cleansed_mail = remove_foot_head(email_data['content'].tolist())

#### comparing the results

In [90]:
print(email_data['content'][6])

Please cc the following distribution list with updates:

Phillip Allen (pallen@enron.com)
Mike Grigsby (mike.grigsby@enron.com)
Keith Holst (kholst@enron.com)
Monique Sanchez
Frank Ermis
John Lavorato


Thank you for your help

Phillip Allen



In [89]:
print(cleansed_mail[6])

Please cc the following distribution list with updates:Thank you for your help
