# Enron Emails dev challenge project

Public dataset Enron Emails, https://www.cs.cmu.edu/~./enron/.
Dataset version May 7, 2015.

By Anette Karhu

## Task outline
### 1) Calculate how many emails were sent from each sender address to each recipient.
The result should be a CSV file that contains three columns (with header row included):

sender: the sending email address,
recipient: the recipient email address
count: number of emails sent from sender to recipient
If an email has multiple recipients, CCs or BCCs, count the email as if it had been sent to each recipient individually.

### 2) Calculate the average number of emails received per day per employee per day of week (monday, tuesday, etc.).
An employee is defined here as a person whose shortened name appears as a folder name in maildir, for example taylor-m.

The result should be a CSV file that contains three columns (with header row included):

employee: the shortname of the employee
day_of_week: day of week is a number 0-6, where 0 is monday, 1 tuesday etc
avg_count: average number of emails received on the corresponding day of week by the corresponding employee

In [6]:
from email.parser import BytesParser, Parser, BytesHeaderParser
from email.policy import default
from email.utils import getaddresses, parseaddr
import pandas as pd
from email.message import EmailMessage
import os
from functools import partial
import csv
import re

In [12]:
class Enron_emails:
    '''
    Helper for extracting sender/receiver header pairs from a directory tree
    of Enron email files and saving them as CSV.
    '''
    # Not referenced by any method of this class; kept so existing users of
    # the attribute keep working.
    CHUNK_SIZE = 4096

    def __init__(self, root_directory):
        '''
        Initialize with root directory of where the enron emails are located.
        '''
        self.root_directory = root_directory

    def users_directories_files(self):
        '''
        Return a list of paths to all files found below ``root_directory``.

        The tree is walked recursively with ``os.walk``, so files of all users
        and all of their folders are included.
        '''
        all_dirs = [
            os.path.join(root, file)
            for root, dirs, files in os.walk(self.root_directory)
            for file in files
        ]
        return all_dirs

    def load_parse_and_save(self, csv_file_path, max_files=None):
        '''
        Parse the headers of every email file and write sender/receiver rows
        to a CSV file.

        Each file is opened in binary mode and parsed as a MIME message with
        the ``email`` library. For every To, Cc and Bcc header that is present
        one row ``(sender, header value)`` is written. A header listing
        several addresses is written as a single string; splitting it into
        individual recipients is left to the consumer of the CSV.
        If the From header is missing the sender column contains ``'None'``.

        Parameters:
            csv_file_path: path of the CSV file to create (overwritten if it
                exists). No header row is written.
            max_files: optional non-negative number of files to process, for
                quick experiments on a subset. ``None`` (default) processes
                every file.

        Returns:
            None

        TODO: write in chunks? Running on the complete dataset is slow.
        '''
        mail_paths = self.users_directories_files()
        if max_files is not None:
            mail_paths = mail_paths[:max_files]

        sender_receiver_list = []
        parser = BytesParser(policy=default)
        for mail in mail_paths:
            with open(mail, 'rb') as fp:
                # Only header fields are used below, so skip body parsing.
                email = parser.parse(fp, headersonly=True)
            sender = str(email['from'])
            for header_name in ('to', 'cc', 'bcc'):
                value = email[header_name]
                if value is not None:
                    # Store a plain string; wrapping the value in a tuple
                    # would put "('addr',)" style text into the CSV.
                    sender_receiver_list.append((sender, str(value)))

        # The context manager guarantees the rows are flushed and the file
        # handle is closed.
        with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
            csv.writer(csv_file).writerows(sender_receiver_list)
        return
    

In [13]:
# Helper bound to the complete maildir tree (all users, all folders).
enron_all_mails = Enron_emails(
    r'C:\Users\Anette\Documents\Enron_Emails_project\enron_emails\maildir'
)

# Parse the emails and write the sender/receiver rows to sender_receiver.csv.
# The method ends with a bare `return`, so the print shows "None".
print(
    enron_all_mails.load_parse_and_save(
        r'C:\Users\Anette\Documents\enron_emails\sender_receiver.csv'
    )
)



None


In [None]:
# The full dataset is over 2GB, so a single user's folder is kept around for
# quick experiments next to the complete tree.

# Root of the complete maildir tree (all users).
maindir = r'C:\Users\Anette\Documents\Enron_Emails_project\enron_emails\maildir'
# One user's folder, used for testing on a small amount of data.
rootdir = maindir + r'\allen-p'


In [None]:
# Testing: collect the path of every file below maindir by walking the tree
# recursively and joining each folder with the file names it contains.
all_dirs = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(maindir)
    for name in names
]

In [None]:
# Sanity checks on the collected file list.
# NOTE(review): in a notebook cell only the value of the last expression is
# displayed, so the len() result is presumably discarded here — confirm.
len(all_dirs)
# Spot-check one path; raises IndexError if fewer than 8401 files were found.
all_dirs[8400]

In [None]:
# Folder that holds arnold-j's sent emails.
arnolds_sent_mail_root = r'C:\Users\Anette\Documents\Enron_Emails_project\enron_emails\maildir\arnold-j\_sent_mail'

# Entries of that folder ordered by the length of their names.
file_names = sorted(os.listdir(arnolds_sent_mail_root), key=len)

# Full path of every entry, built with an explicit backslash separator.
sent_mails_dirs = ['\\'.join((arnolds_sent_mail_root, entry)) for entry in file_names]
print(sent_mails_dirs)

In [None]:
# Loop over one sender's (arnold-j's) sent emails listed in sent_mails_dirs and
# collect (sender, receiver header) pairs from the headers: from, to, bcc, cc.
# TODO: build a nested dictionary in the loop, with one row of data per email.
# TODO: add email only if it does not exist in dictionary, or delete duplicates
# TODO: change dictionary into pd.dataframe that can be transformed easily into csv format.

# Each pair is created at the moment a receiver header is found. Collecting
# senders (one per email) and receivers (zero to three per email) in separate
# lists and zipping them afterwards would pair them up by position only, which
# misaligns senders and receivers and drops the surplus entries.
tuples = []
# Opens MIME formatted emails and parses data with email library.
# Leaves out missing to, bcc and cc fields.
for mail in sent_mails_dirs:
    with open(mail, 'rb') as fp:
        headers = BytesParser(policy=default).parse(fp)
    sender = format(headers['from'])
    for header_name in ('to', 'bcc', 'cc'):
        receiver = format(headers[header_name])
        # format() turns a missing header (None) into the string 'None'.
        if receiver != 'None':
            tuples.append((sender, receiver))

# Parallel views of the pairs; both lists always have the same length.
sender_list = [pair[0] for pair in tuples]
receiver_list = [pair[1] for pair in tuples]

print(len(receiver_list), len(sender_list))

# Open question: do we want all emails that are in the data, or only the
# emails sent by the user?

# mutta, halutaanko kaikki sähköpostit mitä datassa on, vai vaan käyttäjän lähettämät
# sähköpostiviestit??

In [None]:
# Two-column frame built from the (sender, receiver) pairs; the trailing bare
# name displays it in the notebook.
dataframe = pd.DataFrame(data=tuples, columns=['sender', 'receiver'])
dataframe


In [None]:
# Split rows that list multiple receivers ("a, b, c") into one row per
# receiver. The receiver string is split on ', ' into a list and explode()
# repeats the sender for every list element, keeping the sender,receiver
# column order. Unlike concatenating one Series per row, this also works when
# the dataframe has no rows. The index is renumbered from 0.
splitted_receivers_df = (
    dataframe
    .assign(receiver=dataframe['receiver'].str.split(', '))
    .explode('receiver')
    .reset_index(drop=True)
)
splitted_receivers_df

In [None]:
# Number of rows per (sender, receiver) combination, i.e. how many times each
# address pair occurs in the list of sent emails. The size result is wrapped
# in a frame and its single column 0 is named 'count'.
counted_data = pd.DataFrame(
    splitted_receivers_df.pivot_table(index=['sender', 'receiver'], aggfunc='size')
).rename(columns={0: 'count'})
counted_data

In [None]:
# Directory the result CSV files are written to.
root_file_for_csv = r'C:\Users\Anette\Documents\enron_emails'

# Write the counts to emails_sent_totals.csv inside that directory.
csv_file_root = os.path.join(root_file_for_csv, 'emails_sent_totals.csv')
# The task description asks for the header row sender,recipient,count, so the
# two index levels (sender, receiver) are labelled explicitly on export.
counted_data.to_csv(csv_file_root, index_label=['sender', 'recipient'])