In [1]:
import pandas as pd
import os
file_path = '../data/raw/emails.csv'

try:
    email_data = pd.read_csv(file_path)
    # Continue with your data processing...
    
except FileNotFoundError:
    print(f"Error: File '{file_path}' not found")
    print("Please download the Enron email dataset from Kaggle: https://www.kaggle.com/datasets/wcukierski/enron-email-dataset")
    print(f"After downloading, place 'emails.csv' in the directory: '{os.path.dirname(file_path)}'")
except pd.errors.EmptyDataError:
    print("Error: The file is empty or corrupt")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")

# Get the first 5000 records
part_records = email_data.head(5000)
part_records.iloc[1]  


file                                  allen-p/_sent_mail/10.
message    Message-ID: <15464986.1075855378456.JavaMail.e...
Name: 1, dtype: object

In [2]:
# file / message 
print(email_data.loc[3025, 'file']) 
print(email_data.loc[3025, 'message']) 

allen-p/sent/99.
Message-ID: <27210125.1075855681929.JavaMail.evans@thyme>
Date: Fri, 8 Sep 2000 05:29:00 -0700 (PDT)
From: phillip.allen@enron.com
To: pallen70@hotmail.com
Subject: Westgate Proforma-Phillip Allen.xls
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: pallen70@hotmail.com
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\Sent
X-Origin: Allen-P
X-FileName: pallen.nsf

---------------------- Forwarded by Phillip K Allen/HOU/ECT on 09/08/2000 
12:28 PM ---------------------------


"George Richards" <cbpres@austin.rr.com> on 09/08/2000 05:21:49 AM
Please respond to <cbpres@austin.rr.com>
To: "Phillip Allen" <pallen@enron.com>
cc: "Larry Lewter" <retwell@mail.sanmarcos.net> 
Subject: Westgate Proforma-Phillip Allen.xls


Enclosed is the preliminary proforma for the Westgate property is Austin
that we told you about.  As you can tell from the proforma this project
should produce a truly exce

 # lets get every email send From: phillip.allen@enron.com use part_records

In [3]:
# Filter emails sent from phillip.allen@enron.com
phillip_allen_emails = part_records[part_records['message'].str.contains('From: phillip.allen@enron.com', na=False)]
# phillip_allen_emails = part_records[part_records['file'].str.contains('allen-p/_sent_mail/', na=False)]

# Display the filtered emails
phillip_allen_emails

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
...,...,...
3021,allen-p/sent/95.,Message-ID: <32465930.1075855681841.JavaMail.e...
3022,allen-p/sent/96.,Message-ID: <26280667.1075855681863.JavaMail.e...
3023,allen-p/sent/97.,Message-ID: <22194926.1075855681884.JavaMail.e...
3024,allen-p/sent/98.,Message-ID: <5509840.1075855681905.JavaMail.ev...


In [4]:
phillip_allen_receive_emails = part_records[part_records['message'].str.contains('To: phillip.allen@enron.com', na=False)]
# phillip_allen_emails = part_records[part_records['file'].str.contains('allen-p/_sent_mail/', na=False)]

# Display the filtered emails
phillip_allen_receive_emails

Unnamed: 0,file,message
667,allen-p/all_documents/16.,Message-ID: <8051748.1075855665834.JavaMail.ev...
689,allen-p/all_documents/18.,Message-ID: <31871172.1075855665878.JavaMail.e...
832,allen-p/all_documents/31.,Message-ID: <4090398.1075855666178.JavaMail.ev...
875,allen-p/all_documents/35.,Message-ID: <15067558.1075855666263.JavaMail.e...
889,allen-p/all_documents/362.,Message-ID: <7614929.1075855693773.JavaMail.ev...
908,allen-p/all_documents/38.,Message-ID: <4386611.1075855666328.JavaMail.ev...
975,allen-p/all_documents/44.,Message-ID: <30539371.1075855666459.JavaMail.e...
992,allen-p/all_documents/455.,Message-ID: <14346188.1075855696029.JavaMail.e...
1042,allen-p/all_documents/50.,Message-ID: <32727179.1075855666592.JavaMail.e...
1694,allen-p/discussion_threads/190.,Message-ID: <22187423.1075855677436.JavaMail.e...


# train with small dataset

In [5]:
phillip_allen_emails[:500].to_csv('phillip_allen_emails.csv', index=False)
# Save the filtered emails to a new CSV file

get a full conversation with somebody， user is phillip.allen@enron.com ，other is pallen70@hotmail.com ， asc  maybe useful to know more about project topic between

In [7]:
import re
def get_email_summary_text(df):

    entries = []
    for idx, row in df.iterrows():
        msg = row.get('message', '')
        # use re
        from_ = re.search(r'From:\s*(.*)', msg)
        to = re.search(r'To:\s*(.*)', msg)
        cc = re.search(r'X-cc:\s*(.*)', msg)
        bcc = re.search(r'X-bcc:\s*(.*)', msg)
        subject = re.search(r'Subject:\s*(.*)', msg)
        date = re.search(r'Date:\s*(.*)', msg)

        
        # body
        body_start = msg.find('\n\n')
        body = msg[body_start+2:] if body_start != -1 else ''
        from_ = from_.group(1).strip() if from_ else ''
        to = to.group(1).strip() if to else ''
        cc = cc.group(1).strip() if cc else ''
        bcc = bcc.group(1).strip() if bcc else ''
        subject = subject.group(1).strip() if subject else ''
        entries.append(
            f"""From: {from_}
To: {to}
CC: {cc}
BCC: {bcc}
Subject: {subject}
Body: {body.strip()}
-------------------------------"""
        )
    return "\n".join(entries)

In [9]:
# Extract the date from the 'message' column
part_records['date'] = part_records['message'].str.extract(r'Date:\s*(.*)')

# Convert the extracted date to a datetime object for proper sorting
part_records['date'] = pd.to_datetime(part_records['date'], errors='coerce')

# Filter messages involving both email addresses
conversation = part_records[
    part_records['message'].str.contains('phillip.allen@enron.com', na=False) &
    part_records['message'].str.contains('pallen70@hotmail.com', na=False)
]

# Sort the conversation in ascending order by date
conversation_sorted = conversation.sort_values(by='date', ascending=True)

# Display the sorted conversation
conversation_sorted
reindex_conversation_sorted = conversation_sorted.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_records['date'] = part_records['message'].str.extract(r'Date:\s*(.*)')
  part_records['date'] = pd.to_datetime(part_records['date'], errors='coerce')
  part_records['date'] = pd.to_datetime(part_records['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_records['date'] = pd.to_datetime(part_records['date'], errors='coerce')


In [None]:
print(reindex_conversation_sorted['message'][0])
print(reindex_conversation_sorted['message'][1])
print(reindex_conversation_sorted['message'][2])
# what we can du with rerererereply
# token not enough

Message-ID: <2559262.1075855673271.JavaMail.evans@thyme>
Date: Sat, 11 Dec 1999 06:39:00 -0800 (PST)
From: phillip.allen@enron.com
To: pallen70@hotmail.com
Subject: Stick it in your Shockmachine!
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: pallen70@hotmail.com
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Dec2000\Notes Folders\Discussion threads
X-Origin: Allen-P
X-FileName: pallen.nsf

---------------------- Forwarded by Phillip K Allen/HOU/ECT on 12/11/99 02:39 
PM ---------------------------


"the shockwave.com team" <shockwave.com@shockwave.m0.net> on 11/05/99 
02:49:43 AM
Please respond to shockwave.com@shockwave.m0.net
To: Phillip K Allen/HOU/ECT@ECT
cc:  
Subject: Stick it in your Shockmachine!



First one's free. So are the next thousand.

You know it's true: Video games are addictive. Sure, we could
trap you with a free game of Centipede, then kick up the price
after you're hooked. But that's not how sh