## Data Collection

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict


### Load Data

In [15]:
# Load the dataset
data = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
data.columns = ['label', 'message']
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

## Data Preprocessing

In [16]:
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
print(data.head())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


## Train Test Split

In [18]:
# Split dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Preprocessing
def preprocess_text(text):
    return text.lower().split()

train_data['processed'] = train_data['message'].apply(preprocess_text)
test_data['processed'] = test_data['message'].apply(preprocess_text)

In [19]:
train_data['processed']

1978    [reply, to, win, £100, weekly!, where, will, t...
3989    [hello., sort, of, out, in, town, already., th...
3935    [how, come, guoyang, go, n, tell, her?, then, ...
4078    [hey, sathya, till, now, we, dint, meet, not, ...
4086    [orange, brings, you, ringtones, from, all, ti...
                              ...                        
3772    [hi,, wlcome, back,, did, wonder, if, you, got...
5191                          [sorry,, i'll, call, later]
5226    [prabha..i'm, soryda..realy..frm, heart, i'm, ...
5390                     [nt, joking, seriously, i, told]
860      [did, he, just, say, somebody, is, named, tampa]
Name: processed, Length: 4457, dtype: object

In [20]:
test_data['processed']

3245    [squeeeeeze!!, this, is, christmas, hug.., if,...
944     [and, also, i've, sorta, blown, him, off, a, c...
1044    [mmm, thats, better, now, i, got, a, roast, do...
2484    [mm, have, some, kanji, dont, eat, anything, h...
812     [so, there's, a, ring, that, comes, with, the,...
                              ...                        
4264    [den, only, weekdays, got, special, price..., ...
2439    [i, not, busy, juz, dun, wan, 2, go, so, early...
5556    [yes, i, have., so, that's, why, u, texted., p...
4205    [how, are, you, enjoying, this, semester?, tak...
4293                                              [g.w.r]
Name: processed, Length: 1115, dtype: object

In [21]:
# Vocabulary and word counts
spam_word_counts = defaultdict(int)
ham_word_counts = defaultdict(int)
spam_total = ham_total = 0

for _, row in train_data.iterrows():
    for word in row['processed']:
        if row['label'] == 1:  # Spam
            spam_word_counts[word] += 1
            spam_total += 1
        else:  # Ham
            ham_word_counts[word] += 1
            ham_total += 1

# Prior probabilities
P_spam = len(train_data[train_data['label'] == 1]) / len(train_data)
P_ham = 1 - P_spam


In [22]:
# Prior probabilities
P_spam = len(train_data[train_data['label'] == 1]) / len(train_data)
P_ham = 1 - P_spam
print(f"Priors:\nP(spam) = {P_spam:.4f}\nP(ham) = {P_ham:.4f}\n")

Priors:
P(spam) = 0.1342
P(ham) = 0.8658



In [23]:
# Laplace smoothing
vocab = set(spam_word_counts.keys()).union(set(ham_word_counts.keys()))
vocab_size = len(vocab)

def calculate_word_probability(word, label):
    if label == 1:  # Spam
        likelihood = (spam_word_counts[word] + 1) / (spam_total + vocab_size)
    else:  # Ham
        likelihood = (ham_word_counts[word] + 1) / (ham_total + vocab_size)
    print(f"Likelihood of '{word}' given {'spam' if label == 1 else 'ham'}: {likelihood:.6f}")
    return likelihood

In [24]:
# Predict
def predict(message):
    words = preprocess_text(message)
    spam_prob = np.log(P_spam)
    ham_prob = np.log(P_ham)
    
    for word in words:
        spam_prob += np.log(calculate_word_probability(word, 1))
        ham_prob += np.log(calculate_word_probability(word, 0))
    
    posterior_spam = np.exp(spam_prob) / (np.exp(spam_prob) + np.exp(ham_prob))
    posterior_ham = np.exp(ham_prob) / (np.exp(spam_prob) + np.exp(ham_prob))
    
    print(f"Posterior probabilities for message '{message}':")
    print(f"P(spam|message) = {posterior_spam:.6f}")
    print(f"P(ham|message) = {posterior_ham:.6f}\n")
    
    return 1 if spam_prob > ham_prob else 0


In [25]:
# Evaluate
test_data['predicted'] = test_data['message'].apply(predict)

# Calculate Metrics
accuracy = (test_data['label'] == test_data['predicted']).mean()
precision = precision_score(test_data['label'], test_data['predicted'])
recall = recall_score(test_data['label'], test_data['predicted'])
f1 = f1_score(test_data['label'], test_data['predicted'])

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Likelihood of 'squeeeeeze!!' given spam: 0.000038
Likelihood of 'squeeeeeze!!' given ham: 0.000015
Likelihood of 'this' given spam: 0.002528
Likelihood of 'this' given ham: 0.002744
Likelihood of 'is' given spam: 0.004367
Likelihood of 'is' given ham: 0.008308
Likelihood of 'christmas' given spam: 0.000077
Likelihood of 'christmas' given ham: 0.000075
Likelihood of 'hug..' given spam: 0.000038
Likelihood of 'hug..' given ham: 0.000015
Likelihood of 'if' given spam: 0.001149
Likelihood of 'if' given ham: 0.004131
Likelihood of 'u' given spam: 0.003716
Likelihood of 'u' given ham: 0.010500
Likelihood of 'lik' given spam: 0.000038
Likelihood of 'lik' given ham: 0.000075
Likelihood of 'my' given spam: 0.000345
Likelihood of 'my' given ham: 0.008785
Likelihood of 'frndshp' given spam: 0.000038
Likelihood of 'frndshp' given ham: 0.000015
Likelihood of 'den' given spam: 0.000038
Likelihood of 'den' given ham: 0.000358
Likelihood of 'hug' given spam: 0.000038
Likelihood of 'hug' given ham: 0.0

  posterior_spam = np.exp(spam_prob) / (np.exp(spam_prob) + np.exp(ham_prob))
  posterior_ham = np.exp(ham_prob) / (np.exp(spam_prob) + np.exp(ham_prob))
