In [64]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


In [65]:
# Read the CSV that sits one directory above the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="../enron_1.csv")

Clean Data and Split into Train/Test

In [67]:
# Show how many missing values each column contains.
print(df.isna().sum())


is_spam       0
final_text    1
dtype: int64


In [75]:
# Remove rows without usable text in a single chain:
#   1. keep rows whose 'final_text' is not NaN,
#   2. keep rows whose text is non-empty once all whitespace is removed,
#   3. renumber the remaining rows from 0.
df = (
    df.loc[df['final_text'].notna()]
    .loc[lambda frame: frame['final_text'].str.replace(r'\s+', '', regex=True).ne('')]
    .reset_index(drop=True)
)


In [76]:
# Re-count missing values per column to confirm the cleaning step removed them.
print(df.isna().sum())


is_spam       0
final_text    0
dtype: int64


In [77]:
# Keep only rows with actual text.
# Missing values are excluded explicitly: `NaN != ""` evaluates to True, so a
# plain inequality test would let NaN rows through if this cell is ever run
# without the earlier dropna step. When NaNs are already gone the result is
# unchanged.
df = df[df['final_text'].notna() & df['final_text'].str.strip().ne("")]
df = df.reset_index(drop=True)  # renumber rows 0..n-1 after filtering


In [78]:
# Features and target for the classifier:
#   X -> preprocessed email text
#   y -> labels (0 = ham, 1 = spam)
X, y = df['final_text'], df['is_spam']


In [70]:
# Hold out 20% of the rows for testing. The split is stratified on the label so
# both partitions keep the same spam/ham ratio, and the seed is fixed so the
# partition is reproducible.
# NOTE(review): this cell's recorded execution count (70) is lower than the
# cleaning cells above it (75-78) — re-run it after cleaning so X/y are current.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)


In [79]:
# Report partition sizes, then the relative label frequencies of each split.
# Each header and its distribution are printed together, separated by a newline.
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
print("\nTrain label distribution:", y_train.value_counts(normalize=True), sep="\n")
print("\nTest label distribution:", y_test.value_counts(normalize=True), sep="\n")


Train size: 22408, Test size: 5603

Train label distribution:
is_spam
0    0.508613
1    0.491387
Name: proportion, dtype: float64

Test label distribution:
is_spam
0    0.508656
1    0.491344
Name: proportion, dtype: float64
