## Enron Email Dev Challenge project
Public dataset Enron Emails, https://www.cs.cmu.edu/~./enron/. Dataset version May 7, 2015.

By Anette Karhu

### Task 2 outline

Calculate the average number of emails received per day per employee per day of week (monday, tuesday, etc.).
An employee is here defined as a person whose shortened name appears as a folder name in maildir, for example taylor-m.

The result should be a CSV file that contains three columns (with header row included):

employee: the shortname of the employee
day_of_week: day of week as a number 0-6, where 0 is Monday, 1 is Tuesday, etc.
avg_count: average number of emails received on the corresponding day of week by the corresponding employee

## Program running instructions

To run the code, run all cells in ipython.
Give the path to the Enron emails in the following way: Enron_emails_average('path\to\all\enron\mails\root\maildir').

To calculate the average number of mails per user, give the path of the folder where you want to save the csv file (emails_sent_average_per_weekday.csv):
Enron_emails_average.calculate_emails_average_per_day('path\to\csv\folder')

The program prints None once it has finished running and has created the csv file in the given location.

The program mostly uses Python's built-in libraries, which are imported first. Only one library, pandas, needs to be installed.

In [4]:
from email.parser import BytesParser, Parser, BytesHeaderParser
from email.policy import default
import pandas as pd
import os
import csv
from pathlib import Path
import time
import itertools
from email.utils import parsedate_to_datetime
import datetime

In [5]:
class Enron_emails_average:
    '''
    Compute, per employee and day of week, the average number of emails per day.

    An employee is identified by the name of the folder directly below the
    maildir root (for example taylor-m). The result is a CSV file with the
    columns employee, day_of_week and avg_count.
    '''

    # Name of the CSV file. It first holds the raw per-email rows and is then
    # overwritten with the aggregated averages.
    CSV_FILE_NAME = 'emails_sent_average_per_weekday.csv'

    def __init__(self, root_directory):
        '''
        Initialize with root directory of where the enron emails are located.
        '''
        self.root_directory = root_directory

    def all_files(self):
        '''
        Return a list of paths to every file below the root directory,
        covering all users and all of their folders.
        '''
        return [
            os.path.join(folder, file_name)
            for folder, _subfolders, file_names in os.walk(self.root_directory)
            for file_name in file_names
        ]

    def dates_parsing(self, file_path):
        '''
        Write one (employee, day_of_week, date) row per email into a CSV file.

        The employee is the first path component below the root directory, so
        a user name that is a substring of another user name (or of the root
        path) can never be matched by accident. The day of week is a number
        0-6 (0=Monday, 6=Sunday) taken from the parsed date, which keeps it
        independent of the locale and of the header carrying a day name.

        Files lying directly in the root directory, and emails whose Date
        header is missing or cannot be parsed, are skipped.

        file_path: folder in which the CSV file is created.
        Returns the path of the CSV file that was written.
        '''
        root = Path(self.root_directory)
        rows = []
        for mail in self.all_files():
            relative_parts = Path(mail).relative_to(root).parts
            if len(relative_parts) < 2:
                # Not inside any employee folder.
                continue
            employee = relative_parts[0]
            with open(mail, 'rb') as fp:
                date_header = BytesHeaderParser().parse(fp)['Date']
            if date_header is None:
                continue
            try:
                received = parsedate_to_datetime(str(date_header)).date()
            except (TypeError, ValueError):
                # Depending on the Python version an invalid date raises
                # either TypeError or ValueError.
                continue
            rows.append((employee, received.weekday(), received))

        csv_file_path = os.path.join(file_path, self.CSV_FILE_NAME)
        # The context manager guarantees the rows are flushed and the handle
        # is closed before the file is read back.
        with open(csv_file_path, 'w', newline='', encoding="utf-8") as csv_file:
            csv.writer(csv_file).writerows(rows)
        return csv_file_path

    def calculate_emails_average_per_day(self, filepath):
        '''
        Calculate the average amount of emails per day per user and weekday.

        Emails are first counted per (employee, day_of_week, date); the
        average of those daily counts is then taken per (employee,
        day_of_week). Only dates on which at least one email exists take part
        in the average.

        filepath: folder in which the CSV file is created.
        Returns None; the result is written to the CSV file in that folder.
        '''
        csv_path = self.dates_parsing(filepath)
        raw_columns = ['employee', 'day_of_week', 'date']
        try:
            # The raw file has no header row; without header=None the first
            # email would be consumed as column names and lost.
            emails = pd.read_csv(csv_path, header=None, names=raw_columns)
        except pd.errors.EmptyDataError:
            emails = pd.DataFrame(columns=raw_columns)

        if emails.empty:
            # No usable emails: still produce a well-formed, header-only file.
            empty_result = pd.DataFrame(
                columns=['employee', 'day_of_week', 'avg_count'])
            empty_result.to_csv(csv_path, index=False)
            return

        emails_per_date = emails.groupby(raw_columns).size()
        averages = (
            emails_per_date
            .groupby(level=['employee', 'day_of_week'])
            .mean()
            .reset_index(name='avg_count')
        )
        averages.to_csv(csv_path, index=False)


In [6]:
def main():
    '''
    Run the weekday-average calculation for the hard-coded maildir and
    output folder, then print whatever the calculation returns.
    '''
    maildir = r'C:\Users\Anette\Documents\Enron_Emails_project\enron_emails\maildir'
    output_folder = r'C:\Users\Anette\Documents\enron_emails'
    averager = Enron_emails_average(maildir)
    outcome = averager.calculate_emails_average_per_day(output_folder)
    print(outcome)


if __name__ == "__main__":
    main()


None
