In [1]:
# This script is for loading the CEAS_08 dataset into a pandas DataFrame.

# import necessary libraries
import pandas as pd

# Load the CEAS_08 dataset
file_path = "../data/raw/CEAS_08.csv"  # Adjust if needed depending on where your notebook is

# Read the CSV file
df = pd.read_csv(file_path)

# Show the basic shape of the data (rows, columns)
print(f"Dataset shape: {df.shape}")

# Show the first 5 rows to preview the structure
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/CEAS_08.csv'

In [None]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Display the missing values
print("Missing values per column:")
print(missing_values)

Missing values per column:
sender        0
receiver    462
date          0
subject      28
body          0
label         0
urls          0
dtype: int64


In [None]:
# Fill missing subject lines with a placeholder
df['subject'].fillna('[No Subject]', inplace=True)

# For now, leave receiver as it is (optional: we might drop receiver column later if not useful)

# Confirm no missing values remain (except receiver)
print("Missing values after cleaning:")
print(df.isnull().sum())

Missing values after cleaning:
sender        0
receiver    462
date          0
subject       0
body          0
label         0
urls          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['subject'].fillna('[No Subject]', inplace=True)


In [None]:
# Check unique values in label column
print("Unique values in 'label':")
print(df['label'].value_counts())

# Check basic statistics of the URLs column
print("\nURLs column statistics:")
print(df['urls'].describe())

Unique values in 'label':
label
1    21842
0    17312
Name: count, dtype: int64

URLs column statistics:
count    39154.00000
mean         0.66997
std          0.47023
min          0.00000
25%          0.00000
50%          1.00000
75%          1.00000
max          1.00000
Name: urls, dtype: float64


In [None]:
# Check if there are any missing values in the date column
missing_dates = df['date'].isnull().sum()
print(f"Missing date values: {missing_dates}")

# Show a few sample dates to manually inspect their formatting
print("\nSample dates:")
print(df['date'].sample(5))

# Try converting the 'date' column to datetime format to see if all dates are parsable
# This will help us catch badly formatted dates
df['parsed_date'] = pd.to_datetime(df['date'], errors='coerce')

# Now check how many parsing errors happened (they will be NaT)
bad_dates = df['parsed_date'].isnull().sum()
print(f"\nUnparsable date entries: {bad_dates}")

Missing date values: 0

Sample dates:
28320    Thu, 07 Aug 2008 12:22:15 +0600
14853    Wed, 06 Aug 2008 12:10:10 +0500
23240    Thu, 07 Aug 2008 07:04:19 -0300
28008    Thu, 07 Aug 2008 12:04:12 +0600
8886     Wed, 06 Aug 2008 02:23:41 -0400
Name: date, dtype: object

Unparsable date entries: 15


  df['parsed_date'] = pd.to_datetime(df['date'], errors='coerce')


In [None]:
# Drop rows where parsed_date could not be parsed
df = df.dropna(subset=['parsed_date'])

# Confirm new shape
print(f"Dataset shape after dropping bad dates: {df.shape}")

Dataset shape after dropping bad dates: (39139, 8)


In [None]:
# Lowercase the text fields
df['subject'] = df['subject'].str.lower()
df['body'] = df['body'].str.lower()

# Quick check
print(df[['subject', 'body']].head())

                                             subject  \
0                          never agree to be a loser   
1                             befriend jenna jameson   
2                               cnn.com daily top 10   
3  re: svn commit: r619753 - in /spamassassin/tru...   
4                         specialpricespharmmoreinfo   

                                                body  
0  buck up, your troubles caused by small dimensi...  
1  \nupgrade your sex and pleasures with these te...  
2  >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...  
3  would anyone object to removing .so from this ...  
4  \nwelcomefastshippingcustomersupport\nhttp://7...  


In [None]:
# Save the cleaned data to a new CSV
df.to_csv("../data/processed/CEAS_08_cleaned.csv", index=False)