# Recap: Bayesian Statistics
Bayesian statistics is an approach to statistical inference that applies Bayes' Theorem to update the probability estimate for a hypothesis as more evidence or data becomes available. Unlike frequentist statistics, which only considers the data at hand, Bayesian statistics incorporates prior knowledge or beliefs about the world and updates these beliefs as new data is observed.

In [None]:

import os
import math
import pandas as pd
from collections import defaultdict

import numpy as np
import string
import requests
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

In [2]:
# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kaleb/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Download Dataset and unzip folder

In [23]:
def download_dataset(url, download_path):
    
    response = requests.get(url)

    if response.status_code == 200:
        with open(download_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded the file to {download_path}")
    else:
        print("Failed to download the file")

    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall("../data")
        print("Unzipped the dataset to '../data' folder")

    extracted_files = os.listdir("../data")
    print("Extracted files:", extracted_files)

In [24]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
download_path = "../data/smsspamcollection.zip"

download_dataset(url, download_path)

Downloaded the file to ../data/smsspamcollection.zip
Unzipped the dataset to '../data' folder
Extracted files: ['smsspamcollection.zip', 'readme', 'SMSSpamCollection']


## Load the Dataset

In [28]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
df = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

In [29]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Data Preprocessing

In [31]:
# Convert labels to binary values: ham = 0, spam = 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [32]:
# Clean the text (remove punctuation, convert to lowercase, remove stopwords)
def clean_text(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['message'] = df['message'].apply(clean_text)

## Split dataset to training and test

In [33]:
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Convert text to numerical features using CountVectorizer


In [34]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [35]:

# Step 4: Tokenize the messages and create vocabulary
def tokenize(messages):
    word_counts = defaultdict(int)
    for message in messages:
        for word in message.split():
            word_counts[word] += 1
    return word_counts


# Step 5: Calculate Prior Probabilities P(spam) and P(ham)
def calculate_priors(y_train):
    spam_count = sum(y_train)
    ham_count = len(y_train) - spam_count
    total_count = len(y_train)
    p_spam = spam_count / total_count
    p_ham = ham_count / total_count
    return p_spam, p_ham

# Step 6: Calculate Likelihoods P(word|spam) and P(word|ham)
def calculate_likelihoods(X_train, y_train, vocab):
    spam_word_counts = defaultdict(int)
    ham_word_counts = defaultdict(int)
    spam_count = 0
    ham_count = 0

    for message, label in zip(X_train, y_train):
        words = message.split()
        if label == 1:  # Spam
            spam_count += 1
            for word in words:
                spam_word_counts[word] += 1
        else:  # Ham
            ham_count += 1
            for word in words:
                ham_word_counts[word] += 1

    # Apply Laplace smoothing
    vocab_size = len(vocab)
    p_word_given_spam = {}
    p_word_given_ham = {}

    for word in vocab:
        p_word_given_spam[word] = (spam_word_counts[word] + 1) / (spam_count + vocab_size)
        p_word_given_ham[word] = (ham_word_counts[word] + 1) / (ham_count + vocab_size)

    return p_word_given_spam, p_word_given_ham

# Step 7: Predict the class for a new message
def predict(message, p_spam, p_ham, p_word_given_spam, p_word_given_ham, vocab):
    p_spam_given_message = math.log(p_spam)
    p_ham_given_message = math.log(p_ham)

    words = message.split()
    for word in words:
        if word in vocab:
            p_spam_given_message += math.log(p_word_given_spam.get(word, 1 / (len(vocab) + 1)))
            p_ham_given_message += math.log(p_word_given_ham.get(word, 1 / (len(vocab) + 1)))

    if p_spam_given_message > p_ham_given_message:
        return 1  # Spam
    else:
        return 0  # Ham

# Step 8: Evaluate the model
def evaluate(X_test, y_test, p_spam, p_ham, p_word_given_spam, p_word_given_ham, vocab):
    correct = 0
    for message, true_label in zip(X_test, y_test):
        predicted_label = predict(message, p_spam, p_ham, p_word_given_spam, p_word_given_ham, vocab)
        if predicted_label == true_label:
            correct += 1
    accuracy = correct / len(y_test)
    return accuracy

In [36]:
# Tokenize the training messages and create a vocabulary
vocab = tokenize(X_train)

# Calculate prior probabilities
p_spam, p_ham = calculate_priors(y_train)

# Calculate likelihoods for each word in the vocabulary
p_word_given_spam, p_word_given_ham = calculate_likelihoods(X_train, y_train, vocab)

# Evaluate the model
accuracy = evaluate(X_test, y_test, p_spam, p_ham, p_word_given_spam, p_word_given_ham, vocab)

print(f"Accuracy: {accuracy:.4f}")



Accuracy: 0.9821
