# Prepare.ipynb

In [5]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import random
random.seed(2023)
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline

nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the Data and Initial Exploration

In [6]:
# Read the raw e-mail corpus; the CSV is looked up relative to the working directory.
df = pd.read_csv("emails.csv")

print("First 5 rows of the dataset:")
display(df.head())

print("\nColumns in the dataset:", list(df.columns))
print("Data Shape:", df.shape)

print("\nSample distribution of the 'spam' column:")
print(df['spam'].value_counts())

# Class balance: rows labelled 1 are counted as spam, rows labelled 0 as ham.
spam_labels = df['spam']
num_spam = int((spam_labels == 1).sum())
num_ham  = int((spam_labels == 0).sum())
labelled_total = num_spam + num_ham
pct_spam = (num_spam / labelled_total) * 100
pct_ham  = (num_ham / labelled_total) * 100

print(f"\nPercentage of spam emails = {pct_spam:.2f}%")
print(f"Percentage of ham emails  = {pct_ham:.2f}%")

print("\nStatistical description of 'spam' column:")
display(df[['spam']].describe())

First 5 rows of the dataset:


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1



Columns in the dataset: ['text', 'spam']
Data Shape: (5728, 2)

Sample distribution of the 'spam' column:
spam
0    4360
1    1368
Name: count, dtype: int64

Percentage of spam emails = 23.88%
Percentage of ham emails  = 76.12%

Statistical description of 'spam' column:


Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


# Check the Structure of Messages

In [7]:
print("Verifying if each message starts with 'Subject: '")

# Anchored at the start of the string: the literal header followed by one whitespace character.
pattern = r"^Subject:\s"

# Count the entries of the first column (the raw message text) that carry the prefix.
count_subject_form = sum(
    1 for message in df.iloc[:, 0] if re.search(pattern, message)
)

every_row_prefixed = count_subject_form == df.shape[0]
print(
    "All entries begin with 'Subject: '"
    if every_row_prefixed
    else "Some entries do NOT begin with 'Subject: '"
)

Verifying if each message starts with 'Subject: '
All entries begin with 'Subject: '


# Text Preprocessing

In [8]:
# Lazily-filled cache so the NLTK stopword corpus is read at most once per kernel session.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_email_content(mail_text, stop_words=None):
    """
    Normalise one raw e-mail into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. drop a leading 'subject:' header (and the single whitespace after it);
      3. delete every character that is not a-z, whitespace or a period;
      4. split on whitespace and keep only purely alphabetic tokens that are
         not stopwords.

    Note that periods survive step 3 only so that step 4 can reject them:
    any token still containing a period (a lone '.', 'end.', 'www.site.com')
    is discarded as a whole rather than stripped.

    Args:
        mail_text (str): Raw message text, optionally starting with 'Subject: '.
        stop_words: Optional container of lowercase words to exclude
            (anything supporting ``in``; a set is fastest). When None, NLTK's
            English stopword list is used, which requires the 'stopwords'
            corpus to be downloaded.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        # Resolved once per call (and cached across calls) instead of once per
        # token, and looked up in a set instead of scanning a list.
        stop_words = _english_stopwords()

    # Convert text to lowercase
    mail_text = mail_text.lower()

    # Remove the 'subject: ' prefix if present. Substituting the prefix away
    # (rather than capturing '.*' after it) keeps the whole body even when the
    # message spans several lines, since '.' does not match a newline.
    mail_text = re.sub(r"^subject:\s", "", mail_text)

    # Retain only letters (a-z), periods and whitespace. Tabs and newlines are
    # kept so that words on either side of them are not fused together.
    mail_text = re.sub(r"[^a-z\s.]", "", mail_text)

    # Tokenize on any whitespace, then filter out stopwords and non-alpha tokens
    tokens = [tok for tok in mail_text.split()
              if tok.isalpha() and tok not in stop_words]

    # Join back into a single string
    return " ".join(tokens)

print("\nBeginning the preprocessing step:\n")

PROGRESS_EVERY = 500   # emit a progress line every N rows
BAR_WIDTH = 40         # characters in the progress bar (4 per 10%)

total_rows = len(df)
cleaned_rows = []

# Iterate over the column values by position instead of addressing rows with
# df.at[i, ...]: a positional counter used as an index label is only correct
# for a default RangeIndex, and on any other index it would raise or append
# new rows to the frame.
for position, raw_text in enumerate(df["text"]):
    # Print progress updates
    if position % PROGRESS_EVERY == 0 and position != 0:
        percent_done = round(position / total_rows * 100)
        # The bar advances in whole 10% steps, so e.g. 70% and 79% look alike.
        blocks_filled = (percent_done // 10) * (BAR_WIDTH // 10)
        blocks_empty = BAR_WIDTH - blocks_filled
        print("+"*blocks_filled + "-"*blocks_empty + f" : {percent_done}% processed")

    cleaned_rows.append(clean_email_content(raw_text))

# Assign the whole column at once; a list is aligned by position, not by label.
df["cleaned_text"] = cleaned_rows

# Final completion message
print("+"*BAR_WIDTH + " : 100% processed\n")
print("Text cleaning finished.\n")


Beginning the preprocessing step:

---------------------------------------- : 9% processed
++++------------------------------------ : 17% processed
++++++++-------------------------------- : 26% processed
++++++++++++---------------------------- : 35% processed
++++++++++++++++------------------------ : 44% processed
++++++++++++++++++++-------------------- : 52% processed
++++++++++++++++++++++++---------------- : 61% processed
++++++++++++++++++++++++++++------------ : 70% processed
++++++++++++++++++++++++++++------------ : 79% processed
++++++++++++++++++++++++++++++++-------- : 87% processed
++++++++++++++++++++++++++++++++++++---- : 96% processed
++++++++++++++++++++++++++++++++++++++++ : 100% processed

Text cleaning finished.



# Quick Glance at the Processed Data

In [9]:
# Show the first ten rows so the raw `text` column can be compared with `cleaned_text`.
display(df.head(10))

Unnamed: 0,text,spam,cleaned_text
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity lt r...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy im wanting sh...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
5,"Subject: great nnews hello , welcome to medzo...",1,great nnews hello welcome medzonline sh ground...
6,Subject: here ' s a hot play in motion homela...,1,hot play motion homeland security investments ...
7,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...
8,Subject: undeliverable : home based business f...,1,undeliverable home based business grownups mes...
9,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...


# Splitting into Train/Val/Test Sets

In [10]:
# 70% -> train, 15% -> validation, 15% -> test
# The labels are imbalanced (roughly 24% spam / 76% ham), so both splits are
# stratified on 'spam' to keep that ratio in every subset. Without this, the
# smaller validation/test sets can drift from the overall class balance.
# NOTE: stratifying changes which rows land in which split compared with an
# unstratified run using the same random_state; the split sizes are unchanged.
train_data, remaining_data = train_test_split(
    df[["cleaned_text", "spam"]],
    test_size=0.30,
    random_state=2023,
    stratify=df["spam"],
)

# Halve the held-out 30% into validation and test, again preserving the ratio.
val_data, test_data = train_test_split(
    remaining_data,
    test_size=0.50,
    random_state=2023,
    stratify=remaining_data["spam"],
)

# Every row must end up in exactly one of the three splits.
assert len(train_data) + len(val_data) + len(test_data) == len(df)

print("Training set size   :", train_data.shape)
print("Validation set size :", val_data.shape)
print("Test set size       :", test_data.shape)

Training set size   : (4009, 2)
Validation set size : (859, 2)
Test set size       : (860, 2)


# Save the Splits as CSV Files

In [11]:
# Target file name -> split to persist; insertion order drives both the writes
# and the summary printed afterwards.
split_outputs = {
    "train_rephrased.csv": train_data,
    "val_rephrased.csv": val_data,
    "test_rephrased.csv": test_data,
}

# Write each split without the DataFrame index column.
for csv_name, split_frame in split_outputs.items():
    split_frame.to_csv(csv_name, index=False)

print("\nDatasets saved as:")
for csv_name in split_outputs:
    print(f"  - {csv_name}")


Datasets saved as:
  - train_rephrased.csv
  - val_rephrased.csv
  - test_rephrased.csv
