In [1]:
import re
import pandas as pd
import os
import numpy as np
from datetime import datetime

In [2]:
def _extract_email_metadata(filename):
    meta_output = {
        'Message-ID':None,
        'Date':None,
        'From':None,
        'To':None,
        'Cc':None,
        'Bcc':None,
        'Subject':None,
        'Mime-Version':None,
        'Content-Type':None,
        'Content-Transfer-Encoding':None,
        'X-From':None,
        'X-To':None,
        'X-cc':None,
        'X-bcc':None,
        'X-Folder':None,
        'X-Origin':None,
        'X-FileName':None
    }
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()
        _message_id = re.search(r"^Message-ID:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _date =  re.search(r"^Date:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _from = re.search(r"^From:[^\S\r\n]+(.*?)(?=[\r\n]^To|[\r\n]^Subject|[\r\n]^Cc|[\r\n]^Mime-Version)", content, flags=re.MULTILINE | re.DOTALL)
        _to = re.search(r"^From:[^\S\r\n]+\S*?[\r\n]{1}^To:[^\S\r\n]+(.*?)(?=[\r\n]^Subject|[\r\n]^Cc|[\r\n]^Mime-Version)", content, flags=re.MULTILINE | re.DOTALL)
        _subject = re.search(r"^Subject:[^\S\r\n]+(.*?)(?=[\r\n]^Cc|[\r\n]^Mime-Version)", content, flags=re.MULTILINE | re.DOTALL)
        _cc = re.search(r"^Cc:[^\S\r\n]+(.*?)(?=[\r\n]^Mime-Version)", content, flags=re.MULTILINE | re.DOTALL)
        _mime_version = re.search(r"^Mime-Version:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _content_type = re.search(r"^Content-Type:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _content_transfer_encoding = re.search(r"^Content-Transfer-Encoding:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _bcc = re.search(r"^Bcc:[^\S\r\n]+(.*?)(?=[\r\n]^X-From)", content, flags=re.MULTILINE | re.DOTALL)
        _x_from = re.search(r"^X-From:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_to = re.search(r"^X-To:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_cc = re.search(r"^X-cc:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_bcc = re.search(r"^X-bcc:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_folder = re.search(r"^X-Folder:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_origin = re.search(r"^X-Origin:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
        _x_filename = re.search(r"^X-FileName:[^\S\r\n]+(.*)$", content, flags=re.MULTILINE)
    
    meta_output['Message-ID'] = _message_id.group(1) if _message_id else None
    meta_output['Date'] =  _date.group(1) if _date else None
    meta_output['From'] = _from.group(1) if _from else None
    meta_output['To'] = _to.group(1) if _to else None
    meta_output['Cc'] = _cc.group(1) if _cc else None
    meta_output['Bcc'] = _bcc.group(1) if _bcc else None
    meta_output['Subject'] = _subject.group(1) if _subject else None
    meta_output['Mime-Version'] = _mime_version.group(1) if _mime_version else None
    meta_output['Content-Type'] = _content_type.group(1) if _content_type else None
    meta_output['Content-Transfer-Encoding'] = _content_transfer_encoding.group(1) if _content_transfer_encoding else None
    meta_output['X-From'] = _x_from.group(1) if _x_from else None
    meta_output['X-To'] = _x_to.group(1) if _x_to else None
    meta_output['X-cc'] = _x_cc.group(1) if _x_cc else None
    meta_output['X-bcc'] = _x_bcc.group(1) if _x_bcc else None
    meta_output['X-Folder'] = _x_folder.group(1) if _x_folder else None
    meta_output['X-Origin'] = _x_origin.group(1) if _x_origin else None
    meta_output['X-FileName'] = _x_filename.group(1) if _x_filename else None

    return meta_output
    

In [3]:
def _get_path_after_target(path, target="maildir"):
    # Split the path into its individual components
    parts = path.split(os.sep)

    # Check if the target exists in the parts
    if target in parts:
        # Get the index of the target
        target_index = parts.index(target)

        # Return the path one level after target if it exists
        if target_index + 1 < len(parts):
            return os.path.join(*[parts[i] for i in range(target_index + 2, len(parts))])

    return np.nan

In [4]:
def _get_folder_after_target(path, target="maildir"):
    # Split the path into its individual components
    parts = path.split(os.sep)

    # Check if the target exists in the parts
    if target in parts:
        # Get the index of the target
        target_index = parts.index(target)

        # Return the next folder if it exists
        if target_index + 1 < len(parts):
            return parts[target_index + 1]

    return np.nan

In [5]:
# Define the path to the input and output directories and files
INPUT = 'input'
MAILDIR = 'maildir'
OUTPUT = 'output'
STRUCTURED_DATA = 'structured_data_'
PROFILING_OUTPUT = 'profiling_output_'
TRANSFORMATION_OUTPUT = 'transformation_output_'
IMPORT_SUCCESS = 'SUCCESS'

# Parent of the current working directory; input/ and output/ live below it.
BASE_DIR = os.path.abspath('..')

# Taken once so that all output files of this run carry the same timestamp
# (separate datetime.now() calls can straddle a second boundary).
RUN_TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")

input_dir = os.path.join(BASE_DIR, INPUT, MAILDIR)
structured_data_output_path = os.path.join(
    BASE_DIR,
    OUTPUT,
    STRUCTURED_DATA + RUN_TIMESTAMP + '.csv'
)
profiling_output_path = os.path.join(
    BASE_DIR,
    OUTPUT,
    PROFILING_OUTPUT + RUN_TIMESTAMP + '.csv'
)
transformation_output_path = os.path.join(
    BASE_DIR,
    OUTPUT,
    TRANSFORMATION_OUTPUT + RUN_TIMESTAMP + '.csv'
)


In [9]:
# Column order of the structured output: maildir location fields first, then
# the parsed header fields, then the per-file import status.
STRUCTURED_COLUMNS = [
    'maildir_user',
    'maildir_folder',
    'maildir_file_name',
    'maildir_path',
    'Message-ID',
    'Date',
    'From',
    'To',
    'Cc',
    'Bcc',
    'Subject',
    'Mime-Version',
    'Content-Type',
    'Content-Transfer-Encoding',
    'X-From',
    'X-To',
    'X-cc',
    'X-bcc',
    'X-Folder',
    'X-Origin',
    'X-FileName',
    'import_result'
]

# Stop walking once more than this many files have been collected. The check
# runs after each directory, so the final count can exceed the cap by the
# size of the last directory. Set to None to import everything.
MAX_IMPORTED_FILES = 5000

# Walk input_dir and build one record per file: the component right after
# "maildir" is stored as the user, the remaining sub-path as the folder, plus
# the file name, the full path and the parsed header fields.
# Records are collected in a list and turned into a DataFrame once, which
# avoids growing the frame cell by cell.
records = []
for root, dirs, files in os.walk(input_dir):
    # Sorting in place makes the traversal order (and therefore the capped
    # sample) reproducible across runs and file systems.
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(root, file)
        record = {'maildir_file_name': file, 'maildir_path': file_path}
        # Errors are handled per file so that one unreadable file neither
        # skips the rest of its directory nor loses its own error row.
        try:
            record['maildir_user'] = _get_folder_after_target(root)
            record['maildir_folder'] = _get_path_after_target(root)
            record.update(_extract_email_metadata(file_path))
            record['import_result'] = IMPORT_SUCCESS
        except Exception as e:
            record['import_result'] = f"ERROR:{e} | ROOT:{root} | FILE:{file}"
        records.append(record)
    if MAX_IMPORTED_FILES is not None and len(records) > MAX_IMPORTED_FILES:
        break

df = pd.DataFrame(records, columns=STRUCTURED_COLUMNS)
# Kept for any later cell that still refers to the row counter.
loc_i = len(df)

# Always persist the result, not only when the cap was reached.
os.makedirs(os.path.dirname(structured_data_output_path), exist_ok=True)
df.to_csv(structured_data_output_path, index=False)

In [None]:
# This block is only needed if we'd like to read the df from an existing
# data file saved in csv format: replace the placeholder below with the name
# of a CSV file in the output directory.
load_from = os.path.join(
    os.path.abspath('..'),
    OUTPUT,
    'PUT IN THE SAVED CSV NAME HERE'
)
# Guarded so that an unedited placeholder (or a missing file) does not abort
# a full "Restart & Run All"; in that case df is left as it is.
if os.path.isfile(load_from):
    df = pd.read_csv(load_from)
else:
    print(f"Skipping CSV reload, file not found: {load_from}")

In [None]:
# convert the comma delimited emails in the From, To, Cc and Bcc fields into lists
# so they can harness the methods of lists
def _split_addresses(value):
    """Turn one comma-delimited address field into a list of addresses.

    Strings are split on ',' and each piece is stripped of surrounding
    whitespace (including the line breaks of values spanning several lines).
    A value that already is a list is returned unchanged, so re-running the
    cell does not wipe the result. Anything else (None, NaN, ...) becomes an
    empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [address.strip() for address in value.split(',')]
    return []


for address_column in ('From', 'To', 'Cc', 'Bcc'):
    df[address_column] = df[address_column].apply(_split_addresses)