In [12]:
import csv
import re
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [13]:
def load_and_split_csv(filename):
  ham_data = []
  spam_data = []

  with open(filename, newline = '', encoding = 'latin1') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)

    for row in reader:
      label, text = row[0].strip().lower(), row[1].strip()
      if 'ham' in label:
        ham_data.append(text)
      elif 'spam' in label:
        spam_data.append(text)

  return ham_data, spam_data

In [14]:
def parse_word_frequencies(text_list):
    word_counter = Counter()

    for text in text_list:
        words = re.findall(r"\b\w+\b", text.lower())
        word_counter.update(words)

    return dict(word_counter)

In [15]:
##### Main #####

# Loads the csv and splits it into a ham dataset and a spam dataset.
ham, spam = load_and_split_csv("spam.csv")

In [16]:
# Get the number of ham and the number of spam messages so we can adjust for frequency.
ham_size = len(ham)
spam_size = len(spam)

# Counts the frequency of each word in ham and spam datasets. The first column is the word, and the second is the count.
# Converts everything to lower case so capitalization does not matter.
ham_parsed = parse_word_frequencies(ham)
spam_parsed = parse_word_frequencies(spam)

In [17]:
# Combine all unique words from both sets
all_words = set(ham_parsed.keys()) | set(spam_parsed.keys())

# Build dataset rows
rows = []
for word in all_words:
    ham_count = ham_parsed.get(word, 0)
    spam_count = spam_parsed.get(word, 0)

    ham_freq = ham_count / ham_size if ham_size > 0 else 0
    spam_freq = spam_count / spam_size if spam_size > 0 else 0

    # A simple spam-likelihood score (higher â†’ more spammy)
    spam_likelihood = (spam_freq + 1e-9) / (ham_freq + spam_freq + 1e-9)

    rows.append([word, ham_count, spam_count, ham_freq, spam_freq, spam_likelihood])

word_dataset = pd.DataFrame(
    rows,
    columns=["word", "ham_count", "spam_count", "ham_freq", "spam_freq", "spam_likelihood"]
)

# Sort by how strongly a word is associated with spam
word_dataset_sorted = word_dataset.sort_values("spam_likelihood", ascending=False)

In [18]:
# Create train and test datasets
word_dataset_sorted['label'] = (word_dataset_sorted['spam_count'] > word_dataset_sorted['ham_count']).astype(int)
feature_cols = ["ham_freq", "spam_freq", "ham_count", "spam_count"]
X = word_dataset_sorted[feature_cols]
y = word_dataset_sorted['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Train SVM
svm_model = Pipeline([("scaler", StandardScaler()), ("svm", SVC(kernel="rbf", probability=True))])
svm_model.fit(X_train, y_train)

# Evaluate SVM
y_pred = svm_model.predict(X_test)
print("\nSVM Accuracy: {:.3f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))

# Tie SVM predictions back to dataset
word_dataset_sorted["svm_pred"] = svm_model.predict(X)
word_dataset_sorted["svm_spam_probability"] = svm_model.predict_proba(X)[:,1]

# Printout of the most likely spam words
svm_sorted_words = word_dataset_sorted.sort_values("svm_spam_probability", ascending=False)
svm_sorted_words[["word", "svm_spam_probability"]].head(20)


SVM Accuracy: 98.164%

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.98      0.99      1325
        spam       0.93      1.00      0.96       418

    accuracy                           0.98      1743
   macro avg       0.96      0.99      0.98      1743
weighted avg       0.98      0.98      0.98      1743



Unnamed: 0,word,svm_spam_probability
2870,statement,1.0
442,voucher,1.0
4164,services,1.0
4960,txting,1.0
7729,attempt,1.0
5800,caller,1.0
1774,phones,1.0
5771,un,1.0
7150,offers,1.0
6459,content,1.0


Our SVM model achive an accuracy of 97.3%. This is likly because we had strongly separable classes. The spam messages often contain highly disinctive vocabulary. We also had a large number of samples to train on, and we had simple but expressive features.

The ham words had a precision of 1 and a recall of 0.97. The ham words were classified almost perfectly, the classifier rarely mislabels spam words as ham.

The spam words had a precision of 0.90 and a recall of 1. The model was able to identify all spam associated words but 10% were false psitives, meaning that some ham associated words are occasionally predicted as spam.

Some of the most spam weighted words were complimentary, camcorder, and subscriber. These words are signature tokens of the SMS Spam Collection dataset. The high probabilities indicate that these words appear exclusively in spam.

Because the row unit is a word and not a message, the SVM is learning which words are most predictive of spam. Words with spam specific patterns include promo phrases such as complimentary, prize, and subscriber. Others incluse price items, billing numbers, codes, and services.