In [1]:
import zipfile
import os

# Path to your dataset zip
zip_path = "../data/archive.zip"
extract_dir = "../data/enron_extracted"

# Step 1: Extract if not already extracted
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("✅ Extracted successfully!")
else:
    print("✅ Already extracted!")

# Step 2: Check what files are inside
files = os.listdir(extract_dir)
print("Files inside extracted folder:", files)


✅ Extracted successfully!
Files inside extracted folder: ['emails.csv']


In [3]:
import pandas as pd

file_path = "../data/enron_extracted/emails.csv"

# Load only first 5000 rows to test (since file is huge)
df_sample = pd.read_csv(file_path, nrows=5000)

print("Shape of sample:", df_sample.shape)
print("\nColumn names:\n", df_sample.columns)
print("\nFirst few rows:")
print(df_sample.head())


Shape of sample: (5000, 2)

Column names:
 Index(['file', 'message'], dtype='object')

First few rows:
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
import pandas as pd

file_path = "../data/enron_extracted/emails.csv"

# Load only first 5000 rows to test
df_sample = pd.read_csv(file_path, nrows=5000)

print("Shape of sample:", df_sample.shape)
print("\nColumn names:\n", df_sample.columns)
print("\nFirst few rows:")
print(df_sample.head())


Shape of sample: (5000, 2)

Column names:
 Index(['file', 'message'], dtype='object')

First few rows:
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [17]:
def parse_email(text):
    """
    Parse raw email text into structured fields.
    """
    headers, _, body = text.partition("\n\n")  # split headers from body

    parsed = {
        "Message-ID": None,
        "Date": None,
        "From": None,
        "To": None,
        "Subject": None,
        "Body": body.strip()
    }

    for line in headers.split("\n"):
        if line.startswith("Message-ID:"):
            parsed["Message-ID"] = line.replace("Message-ID:", "").strip()
        elif line.startswith("Date:"):
            parsed["Date"] = line.replace("Date:", "").strip()
        elif line.startswith("From:"):
            parsed["From"] = line.replace("From:", "").strip()
        elif line.startswith("To:"):
            parsed["To"] = line.replace("To:", "").strip()
        elif line.startswith("Subject:"):
            parsed["Subject"] = line.replace("Subject:", "").strip()

    return parsed


In [19]:
# from src.preprocessing import parse_email

sample_text = df_sample.loc[0, "message"]
parsed = parse_email(sample_text)
print(parsed)


{'Message-ID': '<18782981.1075855378110.JavaMail.evans@thyme>', 'Date': 'Mon, 14 May 2001 16:39:00 -0700 (PDT)', 'From': 'phillip.allen@enron.com', 'To': 'tim.belden@enron.com', 'Subject': '', 'Body': 'Here is our forecast'}


In [20]:
parsed_rows = df_sample["message"].apply(parse_email)
df_parsed = pd.DataFrame(parsed_rows.tolist())

print("Parsed DataFrame shape:", df_parsed.shape)
print("\nColumns:", df_parsed.columns)
print("\nSample parsed rows:")
print(df_parsed.head())


Parsed DataFrame shape: (5000, 6)

Columns: Index(['Message-ID', 'Date', 'From', 'To', 'Subject', 'Body'], dtype='object')

Sample parsed rows:
                                      Message-ID  \
0  <18782981.1075855378110.JavaMail.evans@thyme>   
1  <15464986.1075855378456.JavaMail.evans@thyme>   
2  <24216240.1075855687451.JavaMail.evans@thyme>   
3  <13505866.1075863688222.JavaMail.evans@thyme>   
4  <30922949.1075863688243.JavaMail.evans@thyme>   

                                    Date                     From  \
0  Mon, 14 May 2001 16:39:00 -0700 (PDT)  phillip.allen@enron.com   
1   Fri, 4 May 2001 13:51:00 -0700 (PDT)  phillip.allen@enron.com   
2  Wed, 18 Oct 2000 03:00:00 -0700 (PDT)  phillip.allen@enron.com   
3  Mon, 23 Oct 2000 06:13:00 -0700 (PDT)  phillip.allen@enron.com   
4  Thu, 31 Aug 2000 05:07:00 -0700 (PDT)  phillip.allen@enron.com   

                        To    Subject  \
0     tim.belden@enron.com              
1  john.lavorato@enron.com        Re:   
2   l

In [28]:
# from src.preprocessing import clean_text

df_parsed["clean_body"] = df_parsed["Body"].apply(clean_text)
print(df_parsed[["Body", "clean_body"]].head(5))


                                                Body  \
0                               Here is our forecast   
1  Traveling to have a business meeting takes the...   
2                     test successful.  way to go!!!   
3  Randy,\n\n Can you send me a schedule of the s...   
4                  Let's shoot for Tuesday at 11:45.   

                                          clean_body  
0                                           forecast  
1  travel business meeting take fun trip especial...  
2                                test successful way  
3  randy \n\n  send schedule salary level schedul...  
4                           let shoot tuesday         


In [26]:
import re
import spacy

# Load spaCy small English model (download first time: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # only tokenizer + tagger + lemmatizer

def clean_text(text):
    """
    Clean email body text for NLP tasks.
    Steps: lowercase, remove non-alphabetic, remove stopwords, lemmatize
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # keep only letters

    doc = nlp(text)
    tokens = [
        token.lemma_ for token in doc
        if not token.is_stop and len(token) > 2
    ]

    return " ".join(tokens)


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # limit to top 5000 terms for speed

# Fit and transform on clean_body
X_tfidf = vectorizer.fit_transform(df_parsed["clean_body"])

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (5000, 5000)


In [31]:
from sklearn.cluster import KMeans

# Define number of clusters (you can tune this, e.g., 5, 10, 20)
num_clusters = 10  
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)

# Fit the model
kmeans.fit(X_tfidf)

# Assign cluster labels to each email
df_parsed["cluster"] = kmeans.labels_

print(df_parsed[["clean_body", "cluster"]].head(10))


                                          clean_body  cluster
0                                           forecast        5
1  travel business meeting take fun trip especial...        5
2                                test successful way        5
3  randy \n\n  send schedule salary level schedul...        5
4                           let shoot tuesday               5
5                greg \n\n  tuesday thursday phillip        5
6  follow distribution list update phillip allen ...        3
7                                  morning                  5
8      login pallen davis \n\n  don think require...        5
9                          forward phillip allen ...        4


In [32]:
import numpy as np

# Get feature names (words)
terms = vectorizer.get_feature_names_out()

# For each cluster, get top n terms
def get_top_terms_per_cluster(kmeans, terms, n=10):
    top_terms = {}
    for i, center in enumerate(kmeans.cluster_centers_):
        top_idx = center.argsort()[::-1][:n]
        top_terms[i] = [terms[j] for j in top_idx]
    return top_terms

top_terms = get_top_terms_per_cluster(kmeans, terms, n=10)

# Print top words per cluster
for cluster, words in top_terms.items():
    print(f"Cluster {cluster}: {', '.join(words)}")



Cluster 0: request, resource, approval, com, itcapps, enron, srrs, auth, emaillink, act
Cluster 1: ect, john, arnold, enron, hou, subject, com, fraser, jennifer, margaret
Cluster 2: ect, hou, enron, allen, phillip, forward, corp, subject, ee, pdx
Cluster 3: com, phillip, allen, enron, austin, pallen, forward, ect, subject, loan
Cluster 4: enron, need, gas, price, com, know, phillip, let, number, year
Cluster 5: phillip, thank, john, work, west, new, send, email, today, desk
Cluster 6: image, click, iwon, receive, email, mail, unsubscribe, com, amazon, online
Cluster 7: message, gas, file, daily, com, phillip, accenture, recipient, doc, mail
Cluster 8: lucy, rent, rentroll, pay, file, deposit, phillip, miss, week, tenant
Cluster 9: com, http, www, carrfut, zdnet, zdnetonebox, free, pdf, research, soblander
