In [28]:
import csv
import re
from collections import Counter
import pandas as pd

In [29]:
def load_and_split_csv(filename):
  ham_data = []
  spam_data = []

  with open(filename, newline = '', encoding = 'latin1') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)

    for row in reader:
      label, text = row[0].strip().lower(), row[1].strip()
      if 'ham' in label:
        ham_data.append(text)
      elif 'spam' in label:
        spam_data.append(text)

  return ham_data, spam_data

In [30]:
def parse_word_frequencies(text_list):
    word_counter = Counter()

    for text in text_list:
        words = re.findall(r"\b\w+\b", text.lower())
        word_counter.update(words)

    return dict(word_counter)

In [31]:
##### Main #####

# Loads the csv and splits it into a ham dataset and a spam dataset.
ham, spam = load_and_split_csv("spam.csv")

In [32]:
# Get the number of ham and the number of spam messages so we can adjust for frequency.
ham_size = len(ham)
spam_size = len(spam)

# Counts the frequency of each word in ham and spam datasets. The first column is the word, and the second is the count.
# Converts everything to lower case so capitalization does not matter.
ham_parsed = parse_word_frequencies(ham)
spam_parsed = parse_word_frequencies(spam)

In [33]:
# Combine all unique words from both sets
all_words = set(ham_parsed.keys()) | set(spam_parsed.keys())

# Build dataset rows
rows = []
for word in all_words:
    ham_count = ham_parsed.get(word, 0)
    spam_count = spam_parsed.get(word, 0)

    ham_freq = ham_count / ham_size if ham_size > 0 else 0
    spam_freq = spam_count / spam_size if spam_size > 0 else 0

    # A simple spam-likelihood score (higher â†’ more spammy)
    spam_likelihood = (spam_freq + 1e-9) / (ham_freq + spam_freq + 1e-9)

    rows.append([word, ham_count, spam_count, ham_freq, spam_freq, spam_likelihood])

word_dataset = pd.DataFrame(
    rows,
    columns=["word", "ham_count", "spam_count", "ham_freq", "spam_freq", "spam_likelihood"]
)

# Sort by how strongly a word is associated with spam
word_dataset_sorted = word_dataset.sort_values("spam_likelihood", ascending=False)

word_dataset_sorted.head(20)

Unnamed: 0,word,ham_count,spam_count,ham_freq,spam_freq,spam_likelihood
8710,w14rg,0,1,0.0,0.001339,1.0
8707,winnersclub,0,1,0.0,0.001339,1.0
8688,tkls,0,1,0.0,0.001339,1.0
8683,09061743811,0,1,0.0,0.001339,1.0
8682,09050000460,0,1,0.0,0.001339,1.0
8679,674,0,2,0.0,0.002677,1.0
8678,800,0,22,0.0,0.029451,1.0
33,mobilesdirect,0,3,0.0,0.004016,1.0
26,peak,0,2,0.0,0.002677,1.0
24,toppoly,0,1,0.0,0.001339,1.0
