In [22]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack


In [23]:
df = pd.read_csv("../enron_1.csv")

In [24]:
# 1. Check for NaNs in the processed text
num_nans = df['final_text'].isnull().sum()
print("NaNs in final_text:", num_nans)

# 2. Check for empty strings (after stripping whitespace)
num_empty = (df['final_text'].str.strip() == '').sum()
print("Empty strings in final_text:", num_empty)

# 3. Check for placeholder presence
placeholders = ['<url>', '<email>', '<num>']
placeholder_counts = {ph: df['final_text'].str.contains(ph, case=False).sum() for ph in placeholders}
print("Placeholder counts:", placeholder_counts)


NaNs in final_text: 1
Empty strings in final_text: 0
Placeholder counts: {'<url>': 49, '<email>': 0, '<num>': 22882}


### Split Data into Train/Test

In [25]:
print(df.isnull().sum())


is_spam       0
final_text    1
dtype: int64


In [26]:
df = df.dropna(subset=['final_text']) # Drop rows that are NaN

df = df[~df['final_text'].str.replace(r'\s+', '', regex=True).eq('')] # Drop rows that are empty or just whitespace

df = df.reset_index(drop=True)


In [27]:
print(df.isnull().sum())


is_spam       0
final_text    0
dtype: int64


In [28]:
# Keep only rows with actual text
df = df[df['final_text'].str.strip() != ""]  # remove empty strings
df = df.reset_index(drop=True)               # optional, reset row numbers


In [29]:
X = df['final_text']  # preprocessed email text
y = df['is_spam']     # labels: 0 = ham, 1 = spam


In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,       # 20% of data for testing
    random_state=42,     # ensures reproducibility
    stratify=y           # preserves the spam/ham ratio in train/test
)


In [31]:
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
print("\nTrain label distribution:")
print(y_train.value_counts(normalize=True))
print("\nTest label distribution:")
print(y_test.value_counts(normalize=True))


Train size: 22435, Test size: 5609

Train label distribution:
is_spam
0    0.508759
1    0.491241
Name: proportion, dtype: float64

Test label distribution:
is_spam
0    0.508825
1    0.491175
Name: proportion, dtype: float64


## Data Engineering/Extraction

### TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,    # limit vocabulary size, 5000 features (these features are the most “important” words or n-grams from your corpus)
    ngram_range=(1, 2),   # unigrams + bigrams
    stop_words='english'  # skip common words
)

X_tfidf = tfidf.fit_transform(df['final_text'].dropna())


TF-IDF extracts each unique word (or n-gram), builds a vocabulary

In [33]:
print(tfidf.get_feature_names_out())


['aa' 'ab' 'ability' ... 'zone' 'zone www' 'zonedubai']


builds/returns a vocabulary of all unique words (or n-grams) it learned from your corpus.

above shows only the words that actually appear in the first email, along with their importance scores.

In [None]:
print(X_tfidf.shape)

(28044, 5000)


### Inspect / Analyze TF-IDF

In [46]:
vocab = tfidf.get_feature_names_out()
print(len(vocab))        # should be ≤ max_features
print(vocab[:50])        # preview first 20 words

5000
['aa' 'ab' 'ability' 'able' 'abroad' 'absence' 'absolutely' 'abuse' 'ac'
 'academic' 'accept' 'acceptable' 'acceptance' 'acceptance term'
 'accepted' 'access' 'accessory' 'accomplish' 'accordance' 'according'
 'accordingly' 'account' 'accountant' 'accounting' 'accounting investment'
 'accuracy' 'accuracy completeness' 'accurate' 'achieve' 'achievement'
 'acquire' 'acquired' 'acquiring' 'acquisition' 'acre' 'acrobat'
 'acrobat num' 'act' 'act num' 'action' 'action identified' 'action occur'
 'active' 'activity' 'actua' 'actual' 'actual result' 'actually' 'acy'
 'acy ne']


Above checks TF-IDF vocabulary

In [71]:
i = 0  # first email
df_single = pd.DataFrame(X_tfidf[i].T.todense(), index=tfidf.get_feature_names_out(), columns=["tfidf"])
print(df_single[df_single.tfidf > 0]) #Filter to only non-zero TF-IDF scores


               tfidf
accomplish  0.058466
activity    0.042010
advance     0.048357
agreement   0.037281
allow       0.042633
...              ...
utility     0.091012
volume      0.037022
waiting     0.046907
wondering   0.058271
working     0.034776

[140 rows x 1 columns]


In [47]:
sparsity = 1.0 - (X_tfidf.count_nonzero() / (X_tfidf.shape[0] * X_tfidf.shape[1]))
print(f"Sparsity: {sparsity:.2%}")


Sparsity: 98.62%


Out of all the entries in your TF-IDF matrix, 98.62% are zeros.
Only 1.38% of the cells have a non-zero TF-IDF value.