In [1]:
# Setup and imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
import warnings
warnings.filterwarnings('ignore')

# Text preprocessing
import nltk
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
from collections import Counter

# Plot settings
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

print("Setup complete")

Setup complete


### Step 1 : Load and Explore Dataset

In [2]:
# Load dataset
print("Loading spam dataset...")
df = pd.read_csv("Datasets/Spam_Ham_Dataset.csv")

# Clean data
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
df.columns = ['text', 'label']

# Remove empty emails
df = df[df['text'].str.len() > 10]
df = df[df['text'] != 'empty']
df = df.dropna()

print(f"Dataset loaded: {len(df):,} emails")
print(f"Spam: {sum(df['label']==1):,} ({sum(df['label']==1)/len(df)*100:.1f}%)")
print(f"Ham: {sum(df['label']==0):,} ({sum(df['label']==0)/len(df)*100:.1f}%)")

# Show basic info
df.head()

Loading spam dataset...
Dataset loaded: 5,508 emails
Spam: 1,556 (28.2%)
Ham: 3,952 (71.8%)


Unnamed: 0,text,label
0,\r\nSave up to 70% on Life Insurance.\r\nWhy S...,1
1,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,1
2,1) Fight The Risk of Cancer!\r\nhttp://www.adc...,1
3,##############################################...,1
4,I thought you might like these:\r\n1) Slim Dow...,1


In [3]:
# Show sample emails
print("Sample Emails:")
print("-" * 50)

print("\nSPAM Examples:")
spam_samples = df[df['label'] == 1]['text'].head(3)
for i, email in enumerate(spam_samples, 1):
    print(f"{i}. {email[:100]}...")

print("\nHAM Examples:")
ham_samples = df[df['label'] == 0]['text'].head(3)
for i, email in enumerate(ham_samples, 1):
    print(f"{i}. {email[:100]}...")

Sample Emails:
--------------------------------------------------

SPAM Examples:
1. 
Save up to 70% on Life Insurance.
Why Spend More Than You Have To?Life Quote Savings
Ensuring yo...
2. 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk0072) Slim Down - Guaranteed to ...
3. 1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk0072) Slim Down - Guaranteed to ...

HAM Examples:
1.     Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues 
    Message-ID:...
2. Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limesto...
3. Man Threatens Explosion In Moscow Thursday August 22, 2002 1:40 PM
MOSCOW (AP) - Security officers ...


### Step 2: Text Preprocessing

In [4]:
# Text preprocessing function
def preprocess_text(text):
    """
    Clean email text for analysis.
    """
    text = str(text).lower()
    
    # Remove URLs and email addresses
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove numbers and punctuation
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words and len(word) > 2]
    
    return ' '.join(words)

# Preprocess all emails
print("Preprocessing text...")
df['clean_text'] = df['text'].apply(preprocess_text)

# Show preprocessing example
sample_email = df['text'].iloc[0]
cleaned_email = df['clean_text'].iloc[0]

print("\nPreprocessing Example:")
print(f"Original: {sample_email[:200]}...")
print(f"Cleaned:  {cleaned_email[:200]}...")

Preprocessing text...

Preprocessing Example:
Original: 
Save up to 70% on Life Insurance.
Why Spend More Than You Have To?Life Quote Savings
Ensuring your 
      family's financial security is very important. Life Quote Savings makes 
      buying li...
Cleaned:  save life insurance spend tolife quote savings ensuring familys financial security important life quote savings makes buying life insurance simple affordable provide free access best companies lowest ...


### Step 3: Feature Extraction and Data Splitting

In [5]:
# Use subset for faster processing
sample_size = 5000
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)
print(f"Using {len(df_sample):,} emails for training")

# Extract TF-IDF features
print("Extracting TF-IDF features...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

X = vectorizer.fit_transform(df_sample['clean_text'])
y = df_sample['label'].values

print(f"Feature matrix: {X.shape}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training: {X_train.shape[0]:,} emails")
print(f"Testing: {X_test.shape[0]:,} emails")

Using 5,000 emails for training
Extracting TF-IDF features...
Feature matrix: (5000, 5000)
Training: 4,000 emails
Testing: 1,000 emails
