In [1]:
import pandas as pd
import os
import re
file_path = '../data/raw/emails.csv'

# Load the raw email CSV. Every handler prints guidance and then re-raises:
# nothing below can run without `email_data`, so stopping here with the real
# error is clearer than a NameError on the next statement.
try:
    email_data = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found")
    print("Please download the Enron email dataset from Kaggle: https://www.kaggle.com/datasets/wcukierski/enron-email-dataset")
    print(f"After downloading, place 'emails.csv' in the directory: '{os.path.dirname(file_path)}'")
    raise
except pd.errors.EmptyDataError:
    print("Error: The file is empty or corrupt")
    raise
except Exception as e:
    print(f"An error occurred while reading the file: {e}")
    raise

# Work on a small subset: the first 5000 records.
part_records = email_data.head(5000)
# Display the second record to check the available columns.
part_records.iloc[1]


file                                  allen-p/_sent_mail/10.
message    Message-ID: <15464986.1075855378456.JavaMail.e...
Name: 1, dtype: object

# See the mailbox structure: [link](http://www.enron-mail.com/email/allen-p/)

In [2]:
# Print both columns (mailbox file path, then raw message text) of record 3025.
for column_name in ('file', 'message'):
    print(email_data.loc[3025, column_name])

allen-p/sent/99.
Message-ID: <27210125.1075855681929.JavaMail.evans@thyme>
Date: Fri, 8 Sep 2000 05:29:00 -0700 (PDT)
From: phillip.allen@enron.com
To: pallen70@hotmail.com
Subject: Westgate Proforma-Phillip Allen.xls
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: pallen70@hotmail.com
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\Sent
X-Origin: Allen-P
X-FileName: pallen.nsf

---------------------- Forwarded by Phillip K Allen/HOU/ECT on 09/08/2000 
12:28 PM ---------------------------


"George Richards" <cbpres@austin.rr.com> on 09/08/2000 05:21:49 AM
Please respond to <cbpres@austin.rr.com>
To: "Phillip Allen" <pallen@enron.com>
cc: "Larry Lewter" <retwell@mail.sanmarcos.net> 
Subject: Westgate Proforma-Phillip Allen.xls


Enclosed is the preliminary proforma for the Westgate property is Austin
that we told you about.  As you can tell from the proforma this project
should produce a truly exce

# Let's get every email sent from phillip.allen@enron.com, using part_records

In [8]:
# Filter emails sent from phillip.allen@enron.com.
# Compare against the first line that starts with "From:" instead of searching
# the whole message: a plain substring search also matches emails that merely
# mention the address (To:, cc:, quoted text), and an unescaped '.' in a regex
# matches any character. "X-From:" lines are skipped by the line-start anchor.
# (Filtering on the `file` path, e.g. 'allen-p/_sent_mail/', is an alternative
# that selects by folder rather than by sender.)
sender_address = part_records['message'].str.extract(
    r'^From:[ \t]*(.*)$', flags=re.MULTILINE, expand=False
).str.strip()
phillip_allen_send_emails = part_records[sender_address.eq('phillip.allen@enron.com')]

# Display the filtered emails
phillip_allen_send_emails

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
3962,arnold-j/all_documents/149.,Message-ID: <20807917.1075857570536.JavaMail.e...
4023,arnold-j/all_documents/203.,Message-ID: <6837511.1075857571754.JavaMail.ev...
4288,arnold-j/all_documents/442.,Message-ID: <10476134.1075857577015.JavaMail.e...
4555,arnold-j/all_documents/685.,Message-ID: <20061364.1075857605486.JavaMail.e...


# Train with a small dataset

In [None]:
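# Raw CSV layout of emails.csv: header row, then two sample rows (each message is a quoted multi-line field)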
# file,message
# allen-p/_sent_mail/1.,"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
# Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
# From: phillip.allen@enron.com
# To: tim.belden@enron.com
# Subject: 
# Mime-Version: 1.0
# Content-Type: text/plain; charset=us-ascii
# Content-Transfer-Encoding: 7bit
# X-From: Phillip K Allen
# X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
# X-cc: 
# X-bcc: 
# X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
# X-Origin: Allen-P
# X-FileName: pallen (Non-Privileged).pst

# Here is our forecast

#  "
# allen-p/_sent_mail/10.,"Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>
# Date: Fri, 4 May 2001 13:51:00 -0700 (PDT)
# From: phillip.allen@enron.com
# To: john.lavorato@enron.com
# Subject: Re:
# Mime-Version: 1.0
# Content-Type: text/plain; charset=us-ascii
# Content-Transfer-Encoding: 7bit
# X-From: Phillip K Allen
# X-To: John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>
# X-cc: 
# X-bcc: 
# X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
# X-Origin: Allen-P
# X-FileName: pallen (Non-Privileged).pst

# Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

# As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

# My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.
# "

In [11]:
import pandas as pd
import re

def format_email(email: pd.DataFrame) -> pd.DataFrame:
    """Convert raw messages into a two-column ``subject`` / ``body`` frame.

    Args:
        email: Frame with a string ``message`` column holding the raw message
            text (header lines, a blank line, then the body).

    Returns:
        A new frame with the same index as ``email`` and two columns:
        ``subject`` (value of the ``Subject:`` header, NaN when absent) and
        ``body`` (stripped message body followed by appended ``From:``,
        ``To:`` and ``Date:`` lines; missing headers are appended as empty
        values). The input frame is not modified.
    """
    formatted_email = email.copy()
    message = formatted_email['message']

    # Header block = everything before the first run of blank lines. Header
    # lookups are limited to it so "To:"/"From:" lines quoted inside the body
    # (forwarded mail) are never mistaken for the real headers. A message with
    # no blank line is treated as all header.
    header_block = message.str.extract(
        r'\A(.*?)(?:\r?\n){2,}', flags=re.DOTALL, expand=False
    ).fillna(message)

    def header_field(name: str) -> pd.Series:
        # Anchor at line start so "To:" cannot match inside "X-To:" (and
        # likewise for the other X-* headers); strip drops a trailing "\r".
        return header_block.str.extract(
            rf'^{name}:[ \t]*(.*)$', flags=re.MULTILINE, expand=False
        ).str.strip()

    formatted_email['subject'] = header_field('Subject')

    body_text = message.str.extract(
        r'(?:\r?\n){2,}(.*)', flags=re.DOTALL, expand=False
    ).str.strip()

    # Append the extra header information to the body.
    formatted_email['body'] = body_text.fillna('') + \
        '\n\nFrom: ' + header_field('From').fillna('') + \
        '\nTo: ' + header_field('To').fillna('') + \
        '\nDate: ' + header_field('Date').fillna('')

    return formatted_email[['subject', 'body']]



In [12]:
# Reduce the sender-filtered records to the subject/body training format.
phillip_allen_emails = format_email(email=phillip_allen_send_emails)

In [13]:
# Format every record whose file path contains 'allen-p'.
in_allen_mailbox = part_records['file'].str.contains('allen-p', na=False)
phillip_allen_all_emails = format_email(part_records[in_allen_mailbox])

# Show the first formatted body of the sender-filtered set.
phillip_allen_emails['body'].iloc[0]

'Here is our forecast\n\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)'

In [6]:
# Keep only the two training columns (guards the column set and order).
phillip_allen_emails = phillip_allen_emails[['subject', 'body']]

# Save the first 500 filtered emails to a new CSV file. The target directory
# is created first because to_csv does not create missing directories.
train_csv_path = '../data/train_dataset/phillip_allen_emails.csv'
os.makedirs(os.path.dirname(train_csv_path), exist_ok=True)
phillip_allen_emails[:500].to_csv(train_csv_path, index=False)

Get the full conversation with a specific correspondent: the user is phillip.allen@enron.com and the other party is pallen70@hotmail.com. Sorting in ascending (asc) order may be useful for learning more about the project topics discussed between them.

In [16]:

def extract_email_info(format_email: pd.DataFrame, who) -> pd.DataFrame:
    """Return the rows whose ``body`` contains ``who`` as a literal substring.

    Args:
        format_email: Frame with a string ``body`` column. Note that this
            parameter name shadows the ``format_email`` function inside this
            body; it is kept so keyword callers continue to work.
        who: Text to look for, e.g. an email address.

    Returns:
        The matching rows of ``format_email``; rows with a missing body are
        excluded.
    """
    # regex=False: `who` is plain text. Interpreted as a regex, the '.' in an
    # address matches any character and characters such as '+' change meaning.
    mentions_who = format_email['body'].str.contains(who, na=False, regex=False)
    return format_email[mentions_who]

In [17]:


# Make sure the output directory exists; to_csv does not create it.
os.makedirs('../data/train_dataset', exist_ok=True)

# Save the first 500 formatted emails selected by the 'allen-p' file-path filter.
phillip_allen_all_emails[:500].to_csv('../data/train_dataset/phillip_allen_all_emails.csv', index=False)

# Emails whose formatted body mentions stagecoachmama@hotmail.com
stagecoachmama_emails = extract_email_info(phillip_allen_all_emails, 'stagecoachmama@hotmail.com')
stagecoachmama_emails.to_csv('../data/train_dataset/stagecoachmama_emails.csv', index=False)