In [1]:
# 01-data-exploration.ipynb
# Data Exploration: PhishTank & Enron

"""
## 01 - Data Exploration

This notebook performs initial exploration of the raw datasets:

1. Load PhishTank phishing URL dump.
2. Load Enron email dataset (parsed CSV).
3. Inspect basic stats (counts, missing values).
4. Quick EDA on URL length distribution and sender-domain frequency (a creation-date distribution is listed as a goal but is not implemented in the cells below).
5. Save summary CSVs for feature development.
"""

#%%
# 1. Imports and Configuration
# Ensure required plotting package is available in the notebook environment
%pip install --quiet matplotlib

import os
import pandas as pd
import matplotlib.pyplot as plt

# Relative locations of the raw input files.
# NOTE: the constant is spelled PHISHANK_CSV (sic); the load cell refers to it by that name.
RAW_DATA_DIR = os.path.join('..', 'data', 'raw')
PHISHANK_CSV = os.path.join(RAW_DATA_DIR, 'phishtank_urls.csv')
ENRON_CSV = os.path.join(RAW_DATA_DIR, 'enron_emails.csv')  # parsed CSV, available after parse_enron

#%%
# 2. Load Data
# Read both raw CSVs in order: the PhishTank dump first, then the Enron emails.
phish_df, enron_df = (
    pd.read_csv(csv_path) for csv_path in (PHISHANK_CSV, ENRON_CSV)
)

# Report the row count of each dataset
print(f"Phishing URLs: {len(phish_df)} rows")
print(f"Enron emails:   {len(enron_df)} rows")

#%%
# 3. Basic Stats
# Per-column count of missing values, printed for each dataset in turn
for heading, frame in (
    ("PhishTank Missing: \n", phish_df),
    ("Enron Missing: \n", enron_df),
):
    print(heading, frame.isna().sum())

# Number of distinct values in the Enron 'sender' column
n_unique_senders = enron_df['sender'].nunique()
print("Unique senders:", n_unique_senders)

#%%
# 4. EDA - URL length distribution
# Character count of every URL, stored on phish_df as the 'url_length' column
url_lengths = phish_df['url'].str.len()
phish_df['url_length'] = url_lengths

# 50-bin histogram of those lengths, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(url_lengths, bins=50)
ax.set_title('Distribution of Phishing URL Lengths')
ax.set_xlabel('Length')
ax.set_ylabel('Count')
plt.show()

#%%
# 5. EDA - Sender domain frequency
# Extract the domain from 'sender': the run of characters after the first '@'
# up to the next '@', whitespace or angle bracket.
# - expand=False makes str.extract return a Series for the single capture
#   group instead of a one-column DataFrame.
# - Stopping at whitespace / '<' / '>' means a value such as
#   "Name <user@host.com>" yields "host.com" rather than "host.com>".
# - Domain names are case-insensitive, so they are lower-cased to keep
#   e.g. "Enron.com" and "enron.com" in the same bucket.
# Rows whose 'sender' has no '@' get NaN, which value_counts() skips.
enron_df['sender_domain'] = (
    enron_df['sender']
    .str.extract(r"@([^@\s<>]+)", expand=False)
    .str.lower()
)

# Twenty most frequent domains, most common first
top_senders = enron_df['sender_domain'].value_counts().head(20)
top_senders.plot(kind='barh')
plt.title('Top 20 Sender Domains in Enron Corpus')
plt.xlabel('Email Count')
plt.show()

#%%
# 6. Save summaries for feature engineering
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
PROCESSED_DIR = os.path.join('..','data','processed')
os.makedirs(PROCESSED_DIR, exist_ok=True)

# URL and its length for every phishing entry
phish_df[['url','url_length']].to_csv(os.path.join(PROCESSED_DIR,'phish_url_summary.csv'), index=False)
# Sender and its extracted domain for every Enron email
enron_df[['sender','sender_domain']].to_csv(os.path.join(PROCESSED_DIR,'enron_senders.csv'), index=False)


Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'pandas'