In [1]:
import pandas as pd

df = pd.read_csv("../datasets/datasets/enron_spam_dataset.csv")

In [2]:
df.head(20)

Unnamed: 0,email_subject,email_body,is_spam
0,ena sales on hpl,just to update you on this project ' s status ...,0
1,98 - 6736 & 98 - 9638 for 1997 ( ua 4 issues ),the above referenced meters need to be placed ...,0
2,"hpl nominations for december 28 , 1999",( see attached file : hpll 228 . xls )\n- hpll...,0
3,revised nom - kcs resources,"daren ,\nit ' s in .\nbob\n- - - - - - - - - -...",0
4,new production - sitara deals needed,"daren ,\nfyi .\nbob\n- - - - - - - - - - - - -...",0
5,re : another hesco issue,help . steve mauch at hesco is wanting an answ...,0
6,"enron / hpl actuals for august 16 , 2000",teco tap 120 . 000 / hpl iferc ; 20 . 000 / en...,0
7,meter 1031 baytown exxon,daren - the valve for meter 1031 was not shut ...,0
8,ces deal clean - up,i will need to make these changes in sitara . ...,0
9,meter 5097,"daren ,\ndo you know if there should be a deal...",0


In [3]:
len(df)

33715

In [4]:
df.duplicated().sum()


np.int64(3222)

In [5]:
df.duplicated(subset=['email_body']).sum()


np.int64(3936)

In [6]:
df.isnull().sum()

email_subject    289
email_body       371
is_spam            0
dtype: int64

In [7]:
import re

def clean_text(s):
    s = str(s).lower()
    s = re.sub(r'http\S+', ' ', s)       # remove URLs
    s = re.sub(r'[^a-z\s]', ' ', s)      # remove punctuation/numbers
    s = re.sub(r"<.*?>", "", s)              # remove HTML tags
    s = re.sub(r'\s+', ' ', s).strip()   # normalize spaces
    return s

df['clean'] = df['email_body'].apply(clean_text)


In [8]:
phish_df = df[df['is_spam'] == 1]
safe_df  = df[df['is_spam'] == 0]

df.head()

Unnamed: 0,email_subject,email_body,is_spam,clean
0,ena sales on hpl,just to update you on this project ' s status ...,0,just to update you on this project s status ba...
1,98 - 6736 & 98 - 9638 for 1997 ( ua 4 issues ),the above referenced meters need to be placed ...,0,the above referenced meters need to be placed ...
2,"hpl nominations for december 28 , 1999",( see attached file : hpll 228 . xls )\n- hpll...,0,see attached file hpll xls hpll xls
3,revised nom - kcs resources,"daren ,\nit ' s in .\nbob\n- - - - - - - - - -...",0,daren it s in bob forwarded by robert cotten h...
4,new production - sitara deals needed,"daren ,\nfyi .\nbob\n- - - - - - - - - - - - -...",0,daren fyi bob forwarded by robert cotten hou e...


In [9]:
phishing_terms = [
     # Urgency / threat
    'urgent', 'immediately', 'suspend', 'expire', 'limited', 'attention', 'action required', 'unauthorized',

    # Account / login / credentials
    'verify account', 'reset password', 'login', 'confirm identity', 'unlock', 'access', 'account verification', 'credentials',

    # Call to action
    'click link', 'update information', 'download', 'submit', 'authorize', 'validate', 'approve', 'review',

    # Financial / sensitive
    'payment', 'invoice', 'billing', 'transaction', 'bank', 'credit card', 'security alert', 'personal information',

    # Phishing tactics / social engineering
    'customer service', 'support', 'notification', 'request', 'account alert', 'confirm details', 'verify identity'
]

In [10]:
phish_text = " ".join(df.loc[df['is_spam'] == 1, 'clean'])
safe_text  = " ".join(df.loc[df['is_spam'] == 0, 'clean'])


In [11]:
# Count total keyword occurrences across all phishing terms
phish_total = sum(phish_text.count(term) for term in phishing_terms)
safe_total  = sum(safe_text.count(term) for term in phishing_terms)

# Print the quick comparison
print(f"Total phishing keyword hits in PHISHING emails: {phish_total}")
print(f"Total phishing keyword hits in SAFE emails:     {safe_total}")

if safe_total > 0:
    ratio = phish_total / safe_total
    print(f"Phishing emails contain ~{ratio:.1f}× more phishing keywords.")
else:
    print("No phishing keywords found in safe emails.")

Total phishing keyword hits in PHISHING emails: 19780
Total phishing keyword hits in SAFE emails:     26963
Phishing emails contain ~0.7× more phishing keywords.


In [12]:
df['is_spam'].value_counts(normalize=True)


is_spam
1    0.509269
0    0.490731
Name: proportion, dtype: float64

In [13]:
df.sample(10)


Unnamed: 0,email_subject,email_body,is_spam,clean
27935,capacity requests for tw,"effective immediately , the contracts group wi...",0,effective immediately the contracts group will...
12172,key issues,"pat ,\nattached is a summary of key issues aff...",0,pat attached is a summary of key issues affect...
12960,enron mentions,"blowback\nforbes , 11 - 12 - 01\nenron shares ...",0,blowback forbes enron shares rise rebounding f...
890,follow - up,just following up to my week ago note .\nrest ...,0,just following up to my week ago note rest ass...
24931,buy viagra online ! it ' s your best way to bu...,get yourself back on track today !\nshe ' s th...,1,get yourself back on track today she s the onl...
17095,fw : epe lending / cali short for mon .,day - ahead spl 5 on peak price for monday is ...,0,day ahead spl on peak price for monday is orig...
29576,fw : proceed with your ordination,fw : be ordained now !\nbecome a\nlegally orda...,1,fw be ordained now become a legally ordained m...
7352,here ' s your chance,what do we need to know to make ebs a successf...,0,what do we need to know to make ebs a successf...
26008,justt try lt,"hello , welcome to pharmon emphases line sh pu...",1,hello welcome to pharmon emphases line sh pude...
21373,windows + offlce at half prlce,new windows software s @ le\nwe have al | your...,1,new windows software s le we have al your favo...


In [14]:
print(df.columns.tolist())


['email_subject', 'email_body', 'is_spam', 'clean']


In [15]:
df = df.drop("email_body", axis=1)

In [16]:
df = df.drop("email_subject", axis=1)

In [17]:
df.sample(20)


Unnamed: 0,is_spam,clean
10122,1,this message was created automatically by mail...
32737,1,mnei the best sma cap stock in oo just keep re...
28534,0,listed below are the items cambridge energy di...
11647,0,acctg told me this am that they already booked...
24014,0,hello thank you in advance for taking the time...
14435,0,there was a discussion on this morning s call ...
32070,1,ozone athenian asperity lineal inculpable are ...
26650,1,pop media corp popt a company which has positi...
5126,1,our us licensed doctors will prescribes your m...
22277,1,


In [18]:
len(df)

33715

In [19]:
df.duplicated(subset=['clean']).sum()

np.int64(5699)

In [20]:
df = df.drop_duplicates(subset=['clean'])

delete dupes (rows with same body text)

In [21]:
len(df)

28016

In [22]:
df.isnull().sum()

is_spam    0
clean      0
dtype: int64

In [23]:
df.sample(20)

Unnamed: 0,is_spam,clean
6351,0,vince many thanks for the invitation i m leavi...
9844,1,lt is really hard to recollect a company the m...
32187,1,sa ve on r x medicat ion today orde r all your...
28334,0,hub forwarded by lorna brennan et s enron on p...
8008,0,hi vince i just wanted to thank you for the op...
10294,1,copy dvd movies yes copy and burn your own dvd...
7567,0,praveen mellacheruvu enron com you have receiv...
12178,0,please find the attached file for the roll for...
20926,1,seiko smart pad ii seiko smart pad ii part sp ...
26524,1,in a monastery in thailand was being relocated...


In [24]:
df['is_spam'].value_counts(normalize=True)

is_spam
0    0.508602
1    0.491398
Name: proportion, dtype: float64

In [None]:
len(df)