<a href="https://colab.research.google.com/github/Hands-On-Fraud-Analytics/Chapter-26---Text-Analysis-and-Topic-Modeling/blob/main/Enron_Fraud_Topic_Modeling_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📊 Enron Fraud Detection via Topic Modeling (LDA)
### Author: ABDELRAHEEM AL AQQAD
### Dataset: Enron Email Corpus via Kaggle
### Objective: Identify suspicious messages using unsupervised topic modeling and flag those matching fraud-related topics


**Step 1: Install Required Packages**

In [7]:
!pip install kagglehub swifter gensim pyLDAvis



**Step 2: Import Libraries and Download NLTK Data**

In [8]:
import pandas as pd
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Step 3: Download and Load Enron Dataset

In [9]:
import kagglehub

# Download the Enron email dataset
path = kagglehub.dataset_download("wcukierski/enron-email-dataset")

# Load the CSV file
df = pd.read_csv(path + "/emails.csv")
df = df.dropna(subset=['message'])

**Step 4: Prepare the Cleaning Function**

In [10]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
porter = PorterStemmer()

def clean(text):
    if isinstance(text, str):
        text = text.lower().strip()
        words = nltk.word_tokenize(text)
        stop_free = [w for w in words if w not in stop and not w.isdigit()]
        punc_free = [w for w in stop_free if w not in exclude]
        lemmatized = [lemma.lemmatize(w) for w in punc_free]
        stemmed = [porter.stem(w) for w in lemmatized]
        return stemmed
    return []


**Step 5: Sample 1,000 Emails and Clean in Parallel**

In [16]:
import swifter

df_sample = df.sample(10000, random_state=42).reset_index(drop=True)
df_sample['clean_content'] = df_sample['message'].swifter.apply(clean)

Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

**Step 6: Build Dictionary and Corpus**

In [17]:
from gensim import corpora

text_clean = df_sample['clean_content']
dictionary = corpora.Dictionary(text_clean)
dictionary.filter_extremes(no_below=5, keep_n=50000)
corpus = [dictionary.doc2bow(text) for text in text_clean]

**Step 7: Train the LDA Model**

In [18]:
from gensim.models.ldamodel import LdaModel

ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.023*"http" + 0.010*"imag" + 0.008*"--" + 0.006*"pleas" + 0.005*"click"')
(1, '0.031*"\'\'" + 0.017*"``" + 0.015*"\'s" + 0.012*"=20" + 0.011*"enron"')
(2, '0.009*".." + 0.007*"jeff" + 0.007*"john" + 0.006*"j" + 0.006*"mark"')
(3, '0.282*"--" + 0.030*"ect" + 0.021*"enron" + 0.013*"pm" + 0.008*"forward"')
(4, '0.011*"\'\'" + 0.010*"``" + 0.010*"enron" + 0.007*"thank" + 0.007*"e-mail"')


**Step 8: Assign Topics and Create Flag**

In [19]:
def get_topic_details(ldamodel, corpus):
    topic_details_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topic_details_df = pd.concat(
                    [topic_details_df, pd.DataFrame([[int(topic_num), round(prop_topic, 4), topic_keywords]])],
                    ignore_index=True)
    topic_details_df.columns = ['Dominant_Topic', '% Score', 'Topic_Keywords']
    return topic_details_df

# Merge with original text
topic_data = get_topic_details(ldamodel, corpus)
topic_data['Original_Text'] = df_sample['message']
topic_data['flag'] = np.where(topic_data['Dominant_Topic'] == 3, 1, 0)

topic_data.head()

Unnamed: 0,Dominant_Topic,% Score,Topic_Keywords,Original_Text,flag
0,4,0.6225,"'', ``, enron, thank, e-mail, pleas, messag, s...",Message-ID: <21013688.1075844564560.JavaMail.e...,0
1,4,0.9797,"'', ``, enron, thank, e-mail, pleas, messag, s...",Message-ID: <22688499.1075854130303.JavaMail.e...,0
2,4,0.4782,"'', ``, enron, thank, e-mail, pleas, messag, s...",Message-ID: <27817771.1075841359502.JavaMail.e...,0
3,4,0.3951,"'', ``, enron, thank, e-mail, pleas, messag, s...",Message-ID: <10695160.1075858510449.JavaMail.e...,0
4,4,0.9798,"'', ``, enron, thank, e-mail, pleas, messag, s...",Message-ID: <27819143.1075853689038.JavaMail.e...,0


**Step 9 (Optional): Visualize Topics with pyLDAvis**

In [20]:
import pyLDAvis.gensim_models
import pyLDAvis

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)

In [30]:
import os

for filename in os.listdir('/content'):
    if filename.endswith('.ipynb'):
        print(filename)

Enron_Fraud_Topic_Modeling_LDA.ipynb


In [31]:
import nbformat

notebook_path = '/content/Enron_Fraud_Topic_Modeling_LDA.ipynb'

with open(notebook_path) as f:
    nb = nbformat.read(f, as_version=4)

# Remove widget metadata if it exists
if 'widgets' in nb['metadata']:
    del nb['metadata']['widgets']

# Save cleaned version
with open(notebook_path, 'w') as f:
    nbformat.write(nb, f)

print("✅ Notebook cleaned and saved. Ready to upload to GitHub.")


✅ Notebook cleaned and saved. Ready to upload to GitHub.
