In [1]:
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import numpy as np
import re
import seaborn as sns
import matplotlib.patches as mpatches
import spacy 
nlp = spacy.load('en_core_web_sm')

In [2]:
# Utility functions

def filenames_from_cat(top, second):
    """Return the base names of the ``.cats`` files that list a category pair.

    Every file matched by ``./*/*/*.cats`` (relative to the current working
    directory) is scanned line by line. The first whitespace-delimited token
    of each line is split on commas, and its first two fields are compared,
    as integers, with ``top`` and ``second``.

    Blank lines and lines with fewer than two comma-separated fields are
    skipped instead of raising ``IndexError``.

    Args:
        top: Integer compared with the first field of each line.
        second: Integer compared with the second field of each line.

    Returns:
        list of str: the file name without directory or extension, appended
        once per matching line (a file with several matching lines therefore
        appears several times).

    Raises:
        ValueError: if a compared field is not a valid integer.
    """
    matches = []
    for cats_path in glob.glob('./*/*/*.cats'):
        # The stem does not depend on the line being read, so compute it once.
        stem = os.path.splitext(os.path.basename(cats_path))[0]
        with open(cats_path) as file:
            for line in file:
                tokens = line.split()
                if not tokens:
                    # Blank (or whitespace-only) line: nothing to parse.
                    continue
                fields = tokens[0].split(",")
                if len(fields) < 2:
                    # Not enough fields to hold both category numbers.
                    continue
                if int(fields[0]) == top and int(fields[1]) == second:
                    matches.append(stem)

    return matches

def print_email_from_filename(filename):
    """Print the email whose base name is ``filename``, one line at a time.

    The first path matching ``./*/*/<filename>.txt`` is opened. Each line
    keeps its own trailing newline and ``print`` appends another, so the
    output is double-spaced.

    Raises:
        IndexError: if no file matches the pattern.
    """
    pattern = './*/*/' + str(filename) + '.txt'
    email_path = glob.glob(pattern)[0]
    with open(email_path) as handle:
        for text_line in handle.readlines():
            print(text_line)

def save_email_from_filename(filename):
    """Return the complete text of the email whose base name is ``filename``.

    The first path matching ``./*/*/<filename>.txt`` (relative to the current
    working directory) is read with the default text encoding.

    Raises:
        IndexError: if no file matches the pattern.
    """
    candidates = glob.glob('./*/*/' + str(filename) + '.txt')
    with open(candidates[0]) as handle:
        # Reading in one go yields the same text as joining the lines.
        return handle.read()

def save_all_from_cat(primary, secondary):
    """Concatenate the text of every email in one category into a single string.

    The emails are those returned by ``filenames_from_cat(primary, secondary)``.
    Each email's text is followed by a divider: a newline, a row of 100
    asterisks, and two newlines.

    Returns:
        str: the combined text, or ``""`` when the category has no emails.
    """
    divider = "\n" + ("*" * 100) + "\n\n"
    return "".join(
        save_email_from_filename(name) + divider
        for name in filenames_from_cat(primary, secondary)
    )

def list_all_filenames():
    """Return the base name (no directory, no extension) of every ``.cats`` file.

    Files are located with the glob ``./*/*/*.cats`` relative to the current
    working directory; the result follows the order ``glob`` returns them in.
    """
    return [
        os.path.splitext(os.path.basename(cats_path))[0]
        for cats_path in glob.glob('./*/*/*.cats')
    ]

def length_of_email_from_filename(filename):
    """Return the number of tokens ``word_tokenize`` produces for an email.

    The whole file text (as returned by ``save_email_from_filename``) is
    tokenised, not only the message body.
    """
    return len(word_tokenize(save_email_from_filename(filename)))

def number_of_recipients(filename):
    """Count the e-mail addresses listed in the ``To:`` lines of an email.

    A long ``To:`` header may be folded across several physical lines, with
    each continuation line starting with a space or tab. Those continuation
    lines are scanned as well, so recipients beyond the first physical line
    are counted.

    As before, every line starting with ``To:`` is considered wherever it
    occurs in the file, and only lower-case addresses match the pattern.

    Args:
        filename: Base name of the email, passed to ``save_email_from_filename``.

    Returns:
        int: total number of address matches (duplicates are counted each time).

    Raises:
        IndexError: if ``save_email_from_filename`` finds no matching file.
    """
    address_pattern = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+")
    count = 0
    in_to_header = False
    for line in save_email_from_filename(filename).split("\n"):
        if line.startswith("To:"):
            in_to_header = True
        elif in_to_header and line[:1] in (" ", "\t") and line.strip():
            # Folded continuation of the To: header: keep scanning it.
            pass
        else:
            # Any other line (next header, blank line, body text) ends the header.
            in_to_header = False

        if in_to_header:
            count += len(address_pattern.findall(line))

    return count

def get_sender_from_filename(filename):
    """Return the first e-mail address found on a line starting with ``From:``.

    Lines are examined in file order, and only lower-case addresses match the
    pattern.

    Raises:
        IndexError: if no ``From:`` line contains a matching address (or if
            ``save_email_from_filename`` finds no matching file).
    """
    address_pattern = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
    senders = [
        address
        for header in save_email_from_filename(filename).split("\n")
        if header.startswith("From:")
        for address in re.findall(address_pattern, header)
    ]
    # Plain indexing keeps the IndexError when nothing was found.
    return senders[0]

In [3]:
# Partition emails into sensitive and non-sensitive
# Assume 'Purely personal' and 'Personal but in a professional context' are the sensitive categories
sensitive_filenames = filenames_from_cat(1, 2) + filenames_from_cat(1, 3)
# Remove duplicates - 3 emails are counted in both categories.
# dict.fromkeys keeps the first occurrence of each name, in order.
sensitive_filenames = list(dict.fromkeys(sensitive_filenames))

# Everything that is not sensitive is treated as non-sensitive
non_sensitive_filenames = [
    name for name in list_all_filenames() if name not in sensitive_filenames
]

In [4]:
def show_ents(doc):
    """Return the named entities of ``doc`` as ``[text, label]`` pairs.

    ``doc`` must expose an ``ents`` sequence whose items have ``text`` and
    ``label_`` attributes. When ``doc.ents`` is empty, a notice is printed
    and an empty list is returned.
    """
    if not doc.ents:
        print('No named entities found.')
        return []

    return [[ent.text, ent.label_] for ent in doc.ents]

In [5]:
def get_entities(filenames):
    """Run the spaCy pipeline over each email and collect its named entities.

    Newlines in the email text are replaced with spaces before the text is
    passed to ``nlp``.

    Returns:
        list: one entry per filename, in input order, each being the
        ``[text, label]`` list produced by ``show_ents``.
    """
    return [
        show_ents(nlp(save_email_from_filename(filename).replace('\n', ' ')))
        for filename in filenames
    ]

In [6]:
# Extract named entities for both partitions. get_entities runs the spaCy
# pipeline once per email and returns one list per email, each holding
# [entity text, entity label] pairs.
sensitive_entities = get_entities(sensitive_filenames)
non_sensitive_entities = get_entities(non_sensitive_filenames)

In [17]:
# Tally how often each PERSON entity text occurs across the sensitive emails
named_people = {}

for email in sensitive_entities:
    for entity in email:
        ent_text, ent_label = entity
        if ent_label != "PERSON":
            continue
        named_people[ent_text] = named_people.get(ent_text, 0) + 1

# Most frequent first; dicts keep insertion order, so the printout is ranked
print(dict(sorted(named_people.items(), key=lambda item: item[1], reverse=True)))

{'Davis': 114, 'Vince J\\Sent Items X-Origin': 87, 'Steven J Kean X-To': 82, 'Williams': 63, 'Steven J Kean/NA': 52, 'Vince': 49, 'Miller': 45, 'j.kaminski@enron.com To': 44, 'RMR': 42, 'Gray Davis': 37, 'SoCal Ed': 33, 'Steven J Kean/HOU': 26, 'Richard Shapiro': 24, 'Hebert': 24, 'Reliant': 24, 'Crenshaw': 22, 'Jeff Dasovich': 19, 'Williams Energy': 18, 'Jeff': 16, 'Massey': 16, 'Edison': 15, 'Steve': 14, 'Shirley': 14, 'William': 13, 'Ehud I. Ronn': 11, 'Bush': 10, 'Leno': 10, 'Sarah Novosel': 10, 'Kean-S X-FileName': 9, 'Hudler': 9, 'Jessica Berthold': 9, 'Jason Leopold': 9, 'Debra Bowen': 9, 'Fichera': 9, 'Alamitos': 9, 'Harrigan': 9, 'Reilly': 9, 'Burrito': 8, 'Lynch': 8, 'Edward Krapels': 8, 'Lieberman': 7, 'Kathy Wedig': 7, 'Steve & Melissa Kean': 7, 'Rick': 7, 'Pozdrawiam': 7, 'Linda Robertson': 7, 'Austin': 7, 'TX': 7, 'Mark': 7, 'Steve Kean': 7, 'Ken': 7, 'Joe Lieberman': 6, 'Phil Kean': 6, 'Philippe': 6, 'Steven J.\\Sent Items X-Origin': 6, 'Sarah-Joy': 6, 'Dale Clark': 6, '

In [18]:
# Tally how often each PERSON entity text occurs across the non-sensitive emails.
# Note: this rebinds named_people, replacing the counts from the sensitive emails.
named_people = {}

for email in non_sensitive_entities:
    for entity in email:
        if entity[1] == "PERSON":
            named_people.setdefault(entity[0], 0)
            named_people[entity[0]] += 1

# Most frequent first; dicts keep insertion order, so the printout is ranked
print(dict(sorted(named_people.items(), key=lambda item: item[1], reverse=True)))

{'Davis': 2995, 'Bush': 1181, 'Steven J Kean X-To': 905, 'Gray Davis': 763, 'Steven J Kean/NA': 735, 'the=20': 442, 'Williams': 395, 'Richard Shapiro': 346, 'Jeff Dasovich': 309, 'Steven J Kean/HOU': 244, 'James D Steffes': 244, 'Jeff': 214, 'Ken': 146, 'Reliant': 139, 't= o=20': 135, 'y=20': 132, 'l=20': 129, 'Steve': 128, 't=': 127, 'Ken Lay': 124, 'Linda': 121, 'Steve Maviglio': 120, 'James D.': 117, 'Maureen McVicker X-cc': 114, 'Richard': 111, 'Mark Palmer': 110, 'Linda Robertson': 109, 'Abraham': 108, 'Clinton': 108, 'Lockyer': 107, 'Massey': 103, 'Sarah Novosel': 102, 'to=20': 102, 'Sarah': 102, 'Paul Kaufman': 100, 'Lynch': 99, 'Cheney': 98, 'Robertson': 98, 'Frank Wolak': 97, 'Alan Comnes': 97, 'Dianne Feinstein': 97, 'Susan J Mara': 95, 's=20': 95, 'Lay': 95, 'Shapiro': 95, 'Mark Palmer/Corp/Enron@ENRON': 93, 'Steven J Kean': 93, 'Elizabeth Linnell': 92, 'Karen Denne': 90, 'Wood': 88, 'Gary Ackerman': 81, 'Cal-ISO': 80, 'Watson': 80, 'Joe Hartsoe': 77, 'Edison': 77, 'Ray Alva