# 01 Importing required libraries

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

## 02 Sampling the dataset

In [2]:
# emails_dataframe = pd.read_csv('emails.csv')
# emails_dataframe.shape

In [3]:
# emails_sample = emails_dataframe.sample(n = 10000)
# emails_sample.to_csv('emails_10000.csv', index=False)

In [4]:
# emails_sample = emails_dataframe.sample(n = 1000)
# emails_sample.to_csv('emails_1000.csv', index=False)

In [5]:
# emails_sample = emails_dataframe.sample(n = 3000)
# emails_sample.to_csv('emails_3000.csv', index=False)

In [6]:
# emails_sample = emails_dataframe.sample(n = 5000)
# emails_sample.to_csv('emails_5000.csv', index=False)

## 03 Choosing a dataset to work with

In [60]:
emails = pd.read_csv('emails_3000.csv')
print(emails.shape)
emails.head()

(3000, 2)


Unnamed: 0,file,message
0,presto-k/deleted_items/72.,Message-ID: <1899108.1075841428486.JavaMail.ev...
1,kean-s/sent/1612.,Message-ID: <10787322.1075848258321.JavaMail.e...
2,hayslett-r/notes_inbox/93.,Message-ID: <22047465.1075844303899.JavaMail.e...
3,dasovich-j/deleted_items/46.,Message-ID: <11706130.1075851586002.JavaMail.e...
4,mcconnell-m/_sent_mail/734.,Message-ID: <19460780.1075843995466.JavaMail.e...


## 04 Preprocessing

In [61]:
sample_email = emails['message'][0]
print(sample_email)

Message-ID: <1899108.1075841428486.JavaMail.evans@thyme>
Date: Mon, 4 Feb 2002 08:20:43 -0800 (PST)
From: jennifer.mcquade@enron.com
To: caroline.abramo@enron.com, tim.belden@enron.com, robert.benson@enron.com, 
	f..calger@enron.com, paul.choi@enron.com, chad.clark@enron.com, 
	mike.cowan@enron.com, mike.curry@enron.com, dana.davis@enron.com, 
	chris.dorland@enron.com, dale.furrow@enron.com, n..gilbert@enron.com, 
	fred.lagrasta@enron.com, john.llodra@enron.com, 
	michael.mcdonald@enron.com, h..otto@enron.com, m..presto@enron.com, 
	reagan.rorschach@enron.com, stewart.rosman@enron.com, 
	douglas.smith@enron.com, paul.thomas@enron.com, 
	barry.tycholiz@enron.com, larry.valderrama@enron.com, 
	frank.vickers@enron.com, greg.whalley@enron.com, 
	john.zufferli@enron.com
Subject: More Dial-in Numbers for 10:30 Meeting
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Mcquade, Jennifer </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JMCQUAD2>
X-To: Abramo, 

### 4.0.1 Get entire email thread...
### 4.0.2 Include email subjects...

In [71]:
# email_thread = []

# def parse_raw_message(lines, line_no):
#     email = {}
#     message = ''
#     keys_to_extract = ['from', 'to']
    
#     if line_no >= len(lines):
#         return 
    
#     for line in lines[line_no: ]:
#         if 'Original Message' in line:
#             return email, line_no
            
#         if ':' not in line:
#             message += line.strip()
#             email['body'] = message
#         else:
#             pairs = line.split(':')
#             key = pairs[0].lower()
#             value = pairs[1].strip()
#             if key in keys_to_extract:
#                 email[key] = value
        
#         line_no += 1
#     return email, line_no

In [72]:
# lines = sample_email.split('\n')

# email, next_line = parse_raw_message(lines, 0)
# email, next_line = parse_raw_message(lines, next_line + 1)
# email, next_line = parse_raw_message(lines, next_line + 1)

### 4.1 Work with the most recent email

In [73]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    
    for line in lines:
        if 'Original Message' in line:
            break
        
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pair = line.split(':')
            key = pair[0].lower()
            value = pair[1].strip()
            if key in keys_to_extract:
                email[key] = value
    return email

def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body'),
        'from': map_to_list(emails, 'from'),
        'to': map_to_list(emails, 'to')
    } 

In [74]:
email_data = pd.DataFrame(parse_into_emails(emails['message']))
print(email_data.shape)
email_data.head()

(3000, 3)


Unnamed: 0,body,from,to
0,"f..calger@enron.com, paul.choi@enron.com, chad...",jennifer.mcquade@enron.com,"caroline.abramo@enron.com, tim.belden@enron.co..."
1,"Decade=09""SCIENTECH IssueAlert"" <IssueAlert@sc...",steven.kean@enron.com,bernadette.hawkins@enron.com
2,Kilmer stopped in to discuss where we are on t...,david.rosenberg@enron.com,"rod.hayslett@enron.com, robert.kilmer@enron.com"
3,"harry.kingerski@enron.com, leslie.lawner@enron...",jeff.dasovich@enron.com,"d..steffes@enron.com, paul.kaufman@enron.com, ..."
4,"Richard,Life does move on! What is Malcomb t...",mike.mcconnell@enron.com,richard.harper@enron.com


In [75]:
email_data.isnull().sum()

body    0
from    0
to      0
dtype: int64

In [77]:
# X_dense = X.todense()
# coords = PCA(n_components=2).fit_transform(X_dense)
# plt.scatter(coords[:,0],coords[:,1], c='r')
# plt.show()

In [97]:
def top_features_tfidf(row, features, top = 20):
    top_ids = np.argsort(row)[::-1][:top]
    top_features = [(features[i], row[i]) for i in top_ids]
    return pd.DataFrame(top_features, columns=['features', 'score'])

def top_features_in_document(X, features, row_id, top = 25):
    row = np.squeeze(X[row_id].toarray())
    return top_features_tfidf(row, features, top)

In [116]:
stop_words = ENGLISH_STOP_WORDS.union(['com','enron',''])
vect = TfidfVectorizer(stop_words=stop_words, analyzer='word')
X = vect.fit_transform(email_data['body'])

In [118]:
features = vect.get_feature_names_out()
top_features_in_document(X, features, 1, 25)

Unnamed: 0,features,score
0,constellation,0.526459
1,plant,0.351942
2,nuclear,0.2504
3,scientech,0.162599
4,power,0.159789
5,energy,0.152585
6,california,0.145619
7,percent,0.122461
8,issuealert,0.110766
9,state,0.110657


0      (0, 11794)\t0.05666768844755233\n  (0, 3893)...
Name: 0, dtype: object
