In [2]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams
import string
%matplotlib inline
from nltk.stem.wordnet import WordNetLemmatizer
# Global plotting defaults applied to every chart drawn later in the notebook.
sns.set(
    style='whitegrid',
    palette='muted',
    font_scale=1.5,
)

# Default figure size as a (width, height) pair.
rcParams['figure.figsize'] = (12, 5)

Using TensorFlow backend.


In [3]:
import pandas as pd
import email, re

# Colab-only step: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Load the raw email dump from 'emails.csv' (path is relative to the working directory).
# NOTE(review): error_bad_lines=False makes pandas skip malformed rows instead of
# raising, so rows can be lost silently; newer pandas releases replace this option
# with on_bad_lines — confirm against the installed version.
emails_df = pd.read_csv('emails.csv', error_bad_lines = False)
# NOTE(review): the shape is passed twice, so it is printed twice — looks redundant.
print(emails_df.shape, emails_df.shape)

(517401, 2) (517401, 2)


In [5]:
# Preview the first rows of the freshly loaded frame.
emails_df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [6]:
#Helper functions

def get_email_text(email):
    '''Return the text of a parsed email message.

    Walks every part of the message and concatenates, in walk order and
    with no separator, the payload of each part whose content type is
    'text/plain'. Parts of any other content type are ignored, so a
    message without a text/plain part yields an empty string.

    Note: the parameter name shadows the `email` module inside this function.
    '''
    return ''.join(
        part.get_payload()
        for part in email.walk()
        if part.get_content_type() == 'text/plain'
    )

def split_email_addresses(line):
    '''Split a comma-separated address string into a frozenset of addresses.

    Each piece is stripped of surrounding whitespace; duplicates collapse
    because the result is a set. Returns None when `line` is falsy
    (None or an empty string).
    '''
    if not line:
        return None
    return frozenset(address.strip() for address in line.split(','))

In [7]:
# Parse the raw message strings into a list of email objects
messages = list(map(email.message_from_string, emails_df['message']))
# The raw 'message' column is dropped in place, so this cell cannot be re-run
# without reloading emails_df first.
emails_df.drop('message', axis=1, inplace=True)

# Get fields from parsed email objects.
# The header names come from the first message only; one new column is created per header.
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]

# Parse content from emails (text/plain parts only, see get_email_text)
emails_df['content'] = list(map(get_email_text, messages))


#emails_df['content'].to_csv('content.csv', index = False, header = True)
# Split multiple email addresses into frozensets (None when the header is empty)
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# Set index and drop columns with too few distinct values
emails_df = emails_df.set_index('Message-ID')\
    .drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)

# Parse the 'Date' header strings into datetimes
emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
print(emails_df.shape)
emails_df.head()

(517401, 12)


Unnamed: 0_level_0,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 16:39:00-07:00,(phillip.allen@enron.com),(tim.belden@enron.com),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n
<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 13:51:00-07:00,(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...
<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 03:00:00-07:00,(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!
<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 06:13:00-07:00,(phillip.allen@enron.com),(randall.gay@enron.com),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s..."
<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 05:07:00-07:00,(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.


In [8]:
# Helper function - strips delimiter-enclosed sections and forwarded header lines
# from an email body, keeps ASCII letters only, and lemmatizes every word as a verb.
# No stop-word removal is performed.
# Requires the NLTK WordNet corpus, e.g. nltk.download('wordnet').
def clean(text):
    '''Normalize an email body into a single line of space-separated lemmas.

    Steps, in order:
      1. Strip trailing whitespace.
      2. Replace with a single space every span that starts at a run of two
         or more dashes, asterisks, underscores (preceded by a space) or
         equals signs and extends to the LAST such run in the text. The
         patterns are greedy and use DOTALL, so one match can swallow
         many lines.
      3. Delete whole lines beginning with 'To:', 'cc:' or 'Subject:'
         (case-sensitive).
      4. Replace every character that is not an ASCII letter with a space.
      5. Lemmatize each whitespace-separated token as a verb (pos='v') and
         join the tokens with single spaces.

    Example: 'Here is our forecast' -> 'Here be our forecast'.

    Parameters
    ----------
    text : str
        Raw email body.

    Returns
    -------
    str
        The normalized text; an empty string if no letters remain.
    '''
    # Raw string literals so that backslashes reach the regex engine unchanged.
    dash_pat = re.compile(r"--+.+--+", flags=re.DOTALL)
    star_pat = re.compile(r'\*\*+.+\*\*+', flags=re.DOTALL)
    uscore_pat = re.compile(r" __+.+__+", flags=re.DOTALL)
    equals_pat = re.compile(r"==+.+==+", flags=re.DOTALL)
    to_pat = re.compile(r'^To:.*\n?', flags=re.MULTILINE)
    cc_pat = re.compile(r'^cc:.*\n?', flags=re.MULTILINE)
    subject_pat = re.compile(r'^Subject:.*\n?', flags=re.MULTILINE)
    nochar_pat = re.compile(r'[^a-zA-Z]')
    lemma = WordNetLemmatizer()

    text = text.rstrip()

    # Delimiter-enclosed sections are replaced by a space so the words on
    # either side of the removed span do not get glued together.
    for section_pat in (dash_pat, star_pat, uscore_pat, equals_pat):
        text = section_pat.sub(' ', text)

    # Header lines are removed entirely, including their line break.
    for header_pat in (to_pat, cc_pat, subject_pat):
        text = header_pat.sub('', text)

    # Digits, punctuation and line breaks all become token separators.
    text = nochar_pat.sub(' ', text)

    return " ".join(lemma.lemmatize(word, pos='v') for word in text.split())

In [9]:
# Add a normalized copy of every email body next to the original text.
emails_df["clean_content"] = emails_df["content"].apply(clean)

In [10]:
# Preview the frame again to compare 'content' with the new 'clean_content' column.
emails_df.head()

Unnamed: 0_level_0,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,clean_content
Message-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 16:39:00-07:00,(phillip.allen@enron.com),(tim.belden@enron.com),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,Here be our forecast
<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 13:51:00-07:00,(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,Traveling to have a business meet take the fun...
<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 03:00:00-07:00,(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,test successful way to go
<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 06:13:00-07:00,(phillip.allen@enron.com),(randall.gay@enron.com),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",Randy Can you send me a schedule of the salary...
<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 05:07:00-07:00,(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,Let s shoot for Tuesday at


In [11]:
# Keep only the cleaned text, as an independent copy of that single column.
# NOTE(review): clean() returns a string, so dropna() presumably removes nothing here
# (empty strings are not NaN) — confirm whether empty rows should be filtered instead.
analysis_df=emails_df[['clean_content']].dropna().copy()

# Colab-only helper used below to download the exported file.
from google.colab import files

# Written as comma-separated text with a header row (pandas defaults), despite the .txt name.
analysis_df.to_csv('for_training.txt', index = False)
files.download('for_training.txt')

In [12]:
# Preview the single-column frame that gets exported.
analysis_df.head()

Unnamed: 0_level_0,clean_content
Message-ID,Unnamed: 1_level_1
<18782981.1075855378110.JavaMail.evans@thyme>,Here be our forecast
<15464986.1075855378456.JavaMail.evans@thyme>,Traveling to have a business meet take the fun...
<24216240.1075855687451.JavaMail.evans@thyme>,test successful way to go
<13505866.1075863688222.JavaMail.evans@thyme>,Randy Can you send me a schedule of the salary...
<30922949.1075863688243.JavaMail.evans@thyme>,Let s shoot for Tuesday at


In [14]:
# Export one cleaned email per row, without header or index (presumably the training corpus).
# NOTE(review): with sep=' ' and pandas' default quoting, any value containing a space
# is wrapped in double quotes in the output file — confirm the consumer expects that.
analysis_df.to_csv('train_set.txt', index = False,sep=' ', header=False)