In [121]:
import pandas as pd
import re


df = pd.read_csv("../datasets/datasets/enron_spam_dataset.csv")

In [122]:
df.head(20)

Unnamed: 0,email_subject,email_body,is_spam
0,ena sales on hpl,just to update you on this project ' s status ...,0
1,98 - 6736 & 98 - 9638 for 1997 ( ua 4 issues ),the above referenced meters need to be placed ...,0
2,"hpl nominations for december 28 , 1999",( see attached file : hpll 228 . xls )\n- hpll...,0
3,revised nom - kcs resources,"daren ,\nit ' s in .\nbob\n- - - - - - - - - -...",0
4,new production - sitara deals needed,"daren ,\nfyi .\nbob\n- - - - - - - - - - - - -...",0
5,re : another hesco issue,help . steve mauch at hesco is wanting an answ...,0
6,"enron / hpl actuals for august 16 , 2000",teco tap 120 . 000 / hpl iferc ; 20 . 000 / en...,0
7,meter 1031 baytown exxon,daren - the valve for meter 1031 was not shut ...,0
8,ces deal clean - up,i will need to make these changes in sitara . ...,0
9,meter 5097,"daren ,\ndo you know if there should be a deal...",0


In [123]:
len(df)

33715

In [124]:
df.duplicated().sum()


np.int64(3222)

In [125]:
df.duplicated(subset=['email_body']).sum()


np.int64(3936)

In [126]:
df.isnull().sum()

email_subject    289
email_body       371
is_spam            0
dtype: int64

## Inital cleaning

In [None]:

def clean_text(s):
    s = str(s).lower()
    
    # Remove HTML tags first
    s = re.sub(r"<.*?>", " ", s)
    
    # Replace URLs, emails, and numbers with placeholders
    s = re.sub(r'http\S+', '<url>', s)
    s = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '<email>', s) # better matches standard email addresses
    s = re.sub(r'\d+', '<num>', s)
    
    # Remove other punctuation but keep placeholders
    s = re.sub(r'[^a-zA-Z\s<>]', ' ', s)
    
    # Normalize spaces
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

# Apply to your DataFrame
df['clean'] = df['email_body'].apply(clean_text)


Checking to ensure placeholders remain

In [128]:
# Count how many emails contain <URL>
num_with_url = df['clean'].str.contains(r'<url>').sum()

# Count how many contain <EMAIL>
num_with_email = df['clean'].str.contains(r'<email>').sum()

# Count how many contain <NUM>
num_with_num = df['clean'].str.contains(r'<num>').sum()

print("Emails with <url>:", num_with_url)
print("Emails with <email>:", num_with_email)
print("Emails with <num>:", num_with_num)


Emails with <url>: 0
Emails with <email>: 0
Emails with <num>: 0


Placeholders:
Preserve meaningful patterns without overfitting, reduce the noise, and makes it easier to understand what the model is learning

In [129]:
df.sample(10)

Unnamed: 0,email_subject,email_body,is_spam,clean
12186,bc bios,attached are the bios for the following :\nlar...,0,attached are the bios for the following larry ...
12344,edison letter to dwr,- - - - - forwarded by jeff dasovich / na / en...,0,forwarded by jeff dasovich na enron on <NUM> <...
19354,got a special night in mind ? have your viiagr...,ch?apest v?agr? ?nline !\nt?p 5 reasons :\nc?s...,1,ch apest v agr nline t p <NUM> reasons c sts <...
24023,lng update,we are continuing to work with the enron lng t...,0,we are continuing to work with the enron lng t...
8685,re : monday presentation,i have made one correction to guadalupe ' s de...,0,i have made one correction to guadalupe s degr...
31193,here you are,"increase your sperm volum , and orgzm duration...",1,increase your sperm volum and orgzm duration m...
5179,re : new color printer,"sorry ,\ndon ' t we need to know the cost , as...",0,sorry don t we need to know the cost as well f...
18754,,holiday specials on : tramadol - pain relief\n...,1,holiday specials on tramadol pain relief getti...
15818,fw : season ' s special,_ 80 % le . ss for . me ds !\ncopy and paste l...,1,<NUM> le ss for me ds copy and paste link belo...
8366,re : meeting re : wharton strategy,i am sorry . . . . . . as per message below we...,0,i am sorry as per message below we are changin...


In [130]:
phish_df = df[df['is_spam'] == 1]
safe_df  = df[df['is_spam'] == 0]

phish_df.sample(20)

Unnamed: 0,email_subject,email_body,is_spam,clean
29852,re : loonger,"hello , do you want to spend iess on your drru...",1,hello do you want to spend iess on your drrugt...
32459,contact me at mrs _ helenal @ hotmail . com,"dear friend ,\ni am mrs helena guei , widow to...",1,dear friend i am mrs helena guei widow to gene...
20968,"wow , its hard to get ahold of you . . .","if all pulled in one direction , the world wou...",1,if all pulled in one direction the world would...
29303,press release new years eve @ central electric...,os italianos marco carola e gaetano parisio en...,1,os italianos marco carola e gaetano parisio en...
16288,guia abierta = 7 . 200 . 000 datos completos ! !,responda a : guia _ telefonica 9 @ fullzero . ...,1,responda a guia telefonica <NUM> fullzero com ...
31054,,vicodin and other inexpnsive medlcations - no ...,1,vicodin and other inexpnsive medlcations no pr...
21149,get a free 6 . 3 megapixel canon digital camera !,>,1,>
4749,,to _ cc _ default _ handler\nsubject : [ spam ...,1,to cc default handler subject spam no prescrip...
10881,"no pills , no pumps - its the patch",penis growth extreme\nhttp : / / www . xunepa ...,1,penis growth extreme http www xunepa com ss de...
20651,windows xp + office xp for $ 80 . dispersal am...,"may lift , sky . sign , from grow . paint clos...",1,may lift sky sign from grow paint close long a...


In [131]:
df['is_spam'].value_counts(normalize=True)


is_spam
1    0.509269
0    0.490731
Name: proportion, dtype: float64

In [132]:
df.sample(10)


Unnamed: 0,email_subject,email_body,is_spam,clean
18869,you have to be kidding me .,this is a special advertisement .\n437 e 1000 ...,1,this is a special advertisement <NUM> e <NUM> ...
24900,very urgent,"attn / buyers ,\nmy name is mr fred . kamah ,\...",1,attn buyers my name is mr fred kamah i am a re...
21576,meet real people with horny desires,"thanks to cannylinguist , i read a poem ( in t...",1,thanks to cannylinguist i read a poem in trans...
26832,[ ilug ] the truth of internet marketing - - a...,we offer some of the best bulk e - mail prices...,1,we offer some of the best bulk e mail prices o...
20264,challenge expressions of hopelessness,"- - - - 03198118194477765\nhi varou ,\ni used ...",1,<NUM> hi varou i used it with food both times ...
22625,contractors that have worked 9 + months at enron,your hr generalist will be contacting you in t...,0,your hr generalist will be contacting you in t...
14907,the sale of the 2000 peakers to allegheny is c...,i am pleased to announce that yesterday ( may ...,0,i am pleased to announce that yesterday may <N...
32136,re : fda pharmacy tip . do you want the sickness,assist your daughter with her emotional issues...,1,assist your daughter with her emotional issues...
26464,great idea for you byrdshot,mortgage rates are about to rise\ncash in now ...,1,mortgage rates are about to rise cash in now o...
1164,entex delivery off of oasis,just a quick reminder that we need to cover th...,0,just a quick reminder that we need to cover th...


In [133]:
df.columns

Index(['email_subject', 'email_body', 'is_spam', 'clean'], dtype='object')

In [134]:
df = df.drop("email_body", axis=1)

In [135]:
df = df.drop("email_subject", axis=1)

In [136]:
df.sample(20)


Unnamed: 0,is_spam,clean
30955,1,microsoft windows xp professional <NUM> <NUM> ...
17666,0,the following name overlay was done today comi...
3431,0,see attached file hplno <NUM> xls hplno <NUM> xls
17020,0,start date <NUM> <NUM> <NUM> hourahead hour <N...
4704,1,e n o u g h ion marketing limitedd <NUM> <NUM>...
26142,1,microcap journal in july s issue we are going ...
31999,1,sundown cranky papa epitaxial guitar proof wor...
16273,1,re finance now even with bad credit best re fi...
12370,0,original message from taylor charles a hr sent...
4468,1,


In [137]:
len(df)

33715

In [138]:
df.duplicated(subset=['clean']).sum()

np.int64(5665)

In [139]:
df = df.drop_duplicates(subset=['clean'])

delete dupes (rows with same body text)

In [140]:
len(df)

28050

In [141]:
df.isnull().sum()

is_spam    0
clean      0
dtype: int64

In [142]:
df.sample(20)

Unnamed: 0,is_spam,clean
1181,0,i will have the ken seaman file completed by t...
26261,1,ebay auction news recommended resource special...
13210,0,so has the restraining order been lifted i hea...
3126,0,it s that time again crawfish are in season an...
730,0,daren just following up on my email and call l...
17909,0,i will forward you the appropriate operational...
7616,0,forwarded by stinson gibner hou ect on <NUM> <...
11564,0,stuart per our discussion of earlier this afte...
21905,1,microsoft windows xp professional <NUM> <NUM> ...
28654,0,february <NUM> <NUM> nesa members food for tho...


### Initial cleaning (clean column):
- Numbers, URLs, and emails are replaced with placeholders
- Lowercased text
- Removed HTML
- Removed punctuation
- Removed duplicate/nulls
- Normalized whitespace

In [143]:
df['is_spam'].value_counts(normalize=True)

is_spam
0    0.508734
1    0.491266
Name: proportion, dtype: float64

## Finalizing DS cleaning 

In [144]:
# Check missing values
print("Missing values per column:")
print(df.isnull().sum())

# Quick sanity check of text content
print("\nExample cleaned text:")
print(df['clean'].head(5))

# Check label balance
print("\nLabel distribution:")
print(df['is_spam'].value_counts(normalize=True))


Missing values per column:
is_spam    0
clean      0
dtype: int64

Example cleaned text:
0    just to update you on this project s status ba...
1    the above referenced meters need to be placed ...
2      see attached file hpll <NUM> xls hpll <NUM> xls
3    daren it s in bob forwarded by robert cotten h...
4    daren fyi bob forwarded by robert cotten hou e...
Name: clean, dtype: object

Label distribution:
is_spam
0    0.508734
1    0.491266
Name: proportion, dtype: float64


In [145]:
import nltk
from nltk.corpus import stopwords,  words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# download necessary data once
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed for lemmatizer
nltk.download('words')    # for english_vocab


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lle34\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lle34\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lle34\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lle34\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\lle34\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [146]:
import os

# Explicitly set NLTK's data directory to the one you downloaded to
nltk_data_dir = r"C:\Users\lle34\AppData\Roaming\nltk_data"
os.environ["NLTK_DATA"] = nltk_data_dir

# Add custom path to the search location
nltk.data.path.append(nltk_data_dir)

# Confirm
print("NLTK data paths:")
for p in nltk.data.path:
    print("  -", p)


NLTK data paths:
  - C:\Users\lle34/nltk_data
  - c:\Users\lle34\AppData\Local\Programs\Python\Python312\nltk_data
  - c:\Users\lle34\AppData\Local\Programs\Python\Python312\share\nltk_data
  - c:\Users\lle34\AppData\Local\Programs\Python\Python312\lib\nltk_data
  - C:\Users\lle34\AppData\Roaming\nltk_data
  - C:\nltk_data
  - D:\nltk_data
  - E:\nltk_data
  - C:\Users\lle34\AppData\Roaming\nltk_data
  - C:\Users\lle34\AppData\Roaming\nltk_data
  - C:\Users\lle34\AppData\Roaming\nltk_data
  - C:\Users\lle34\AppData\Roaming\nltk_data


In [147]:
nltk.download('punkt_tab', download_dir='C:/Users/lle34/AppData/Roaming/nltk_data')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:/Users/lle34/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [148]:
print("stopwords loaded:", len(stopwords.words('english')))
print("tokenizer test:", word_tokenize("hello world!"))

stopwords loaded: 198
tokenizer test: ['hello', 'world', '!']


In [149]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
placeholders = ['<URL>', '<EMAIL>', '<NUM>']

def preprocess_text_2(text, min_words=1):
    # Protect placeholders
    placeholder_map = {}
    for i, ph in enumerate(placeholders):
        temp_token = f'XPLACEHOLDERX{i}X'
        placeholder_map[temp_token.lower()] = ph.lower()
        text = re.sub(re.escape(ph), temp_token, text, flags=re.IGNORECASE)

    # Lowercase
    text = text.lower()

    # Tokenize
    tokens = word_tokenize(text)

    processed_tokens = []
    for word in tokens:
        # Restore placeholders
        if word in placeholder_map:
            processed_tokens.append(placeholder_map[word])
        elif word.isalpha() and word not in stop_words:
            processed_tokens.append(lemmatizer.lemmatize(word))

    if len(processed_tokens) < min_words:
        return None

    return " ".join(processed_tokens)


In [150]:
# Test examples
sample_texts = [
    "Check out <URL> or email me at <EMAIL>.",
    "The quick brown fox jumps over the lazy dogs.",
    "Claim your prize now! <NUM> clicks only."
]

for text in sample_texts:
    processed = preprocess_text_2(text)
    print("Processed:", processed or "Skipped garbled text")

Processed: check <url> email <email>
Processed: quick brown fox jump lazy dog
Processed: claim prize <num> click


### Preprocessing_text_2/Final Cleaning
- Tokenizes text: Splits the text into individual words (tokens) for processing.
- Removes stopwords
- Lemmatizes words: Reduces words to their base forms
- Protects placeholders from being removed

Produces..
- A cleaned, tokenized, lemmatized string of text
- Keeps placeholders for feature recognition
- Removes irrelevant or noisy words (stopwords, very short tokens)

In [151]:
df.sample(20)

Unnamed: 0,is_spam,clean
9341,0,u urgent steve are you interested in speaking ...
11667,0,results recommendations when tuesday november ...
16886,0,dammit jim original message from porter david ...
11881,0,please find the following action items for the...
27506,1,useful for your individual and business invest...
28695,0,attached is the august <NUM> billing for park ...
14503,0,louise i am trying to prepare a set of contrac...
7188,0,dennis thanks for you message i shall send you...
11914,0,enron s sports marketing group is reviewing pr...
12773,0,attached is the revised <NUM> netco plan along...


The function takes a raw, cleaned email and produces a normalized version suitable for ML:

Lowercase, tokenized
Stopwords removed
Words lemmatized
Ready as one string per email

In [152]:
# Apply preprocessing to the 'clean' column
df['final_text'] = df['clean'].apply(preprocess_text_2)

# Quick check
df.head()


Unnamed: 0,is_spam,clean,final_text
0,0,just to update you on this project s status ba...,update project status based new report scott m...
1,0,the above referenced meters need to be placed ...,referenced meter need placed k please note inf...
2,0,see attached file hpll <NUM> xls hpll <NUM> xls,see attached file hpll <num> xl hpll <num> xl
3,0,daren it s in bob forwarded by robert cotten h...,daren bob forwarded robert cotten hou ect <num...
4,0,daren fyi bob forwarded by robert cotten hou e...,daren fyi bob forwarded robert cotten hou ect ...


In [153]:
df.sample(20)

Unnamed: 0,is_spam,clean,final_text
15343,1,market watch news flash we are fo owing the st...,market watch news flash fo owing strength last...
6067,0,dear dr kaminski this is quentin kerr from aus...,dear dr kaminski quentin kerr australia came b...
8017,0,john vince i really enjoyed the meeting the ot...,john vince really enjoyed meeting day broad cr...
29101,0,transwestern s average deliveries to californi...,transwestern average delivery california <num>...
4664,1,genuric ciilis apyalis at rock bottom prgces m...,genuric ciilis apyalis rock bottom prgces plac...
15305,1,hi regalis also known as superviagra or cialis...,hi regalis also known superviagra cialis half ...
16116,1,sister hat boys heaven draw broken begin ciali...,sister hat boy heaven draw broken begin cialii...
16103,1,dear customer thank you for your order your cr...,dear customer thank order credit card charged ...
11033,0,fyi forwarded by jean mrha na enron on <NUM> <...,fyi forwarded jean mrha na enron <num> <num> <...
2072,0,july <NUM> final nom <NUM> inlet hpl eastrans ...,july <num> final nom <num> inlet hpl eastrans ...


In [154]:
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
is_spam       0
clean         0
final_text    5
dtype: int64


In [155]:
print("NaNs:", df['final_text'].isnull().sum())
print("Empty strings:", (df['final_text'].str.strip() == "").sum())


NaNs: 5
Empty strings: 0


Need to remove empty strings, also read as NaN

In [156]:
# Remove empty strings/whitespace-only rows
df = df[df['final_text'].str.strip() != ""].reset_index(drop=True)

# Remove NaN in final_text col
df = df.dropna(subset=['final_text'])



In [157]:
print(df.isnull().sum())


is_spam       0
clean         0
final_text    0
dtype: int64


In [158]:
df = df[['is_spam', 'final_text']]

In [159]:
df.to_csv("enron_1.csv", index=False)

Download cleaned dataset