# 1.5 Assignment — Bag of Words

# Import the required libraries

In [66]:
import re
import string
from pathlib import Path

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Used to load the dataset

In [69]:
# Candidate locations for the dataset, in priority order.
# The relative path keeps the notebook portable; the absolute path is a
# machine-specific fallback and only works on the original author's computer.
relative_path = Path("data") / "ham-spam.csv"
absolute_path = Path(r"C:\Users\MasterDanteDev86\Downloads\CAP355-O Natural Language Processing - Online\W1\NLP\data\ham-spam.csv")

candidate_paths = [relative_path, absolute_path]

# Use the first candidate that exists; fail early with every path that was tried
# instead of letting read_csv complain about the fallback path only.
data_path = next((p for p in candidate_paths if p.exists()), None)
if data_path is None:
    tried = "\n  ".join(str(p) for p in candidate_paths)
    raise FileNotFoundError(f"ham-spam.csv not found. Looked in:\n  {tried}")

df = pd.read_csv(data_path)
print("Loaded from:", data_path)
print("Shape:", df.shape)
df.head()

Loaded from: C:\Users\MasterDanteDev86\Downloads\CAP355-O Natural Language Processing - Online\W1\NLP\data\ham-spam.csv
Shape: (5572, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Will inspect dataset columns

In [72]:
print("Columns:", list(df.columns))

# Column names commonly used for the class label and for the message body
label_names = {"label", "category", "type", "class"}
text_names = {"message", "text", "sms", "body", "content"}

# First column (left to right) whose lowercased name is a known alias, else None.
# str(c) keeps this safe when a column name is not a string (e.g. an integer).
label_col = next((c for c in df.columns if str(c).lower() in label_names), None)
text_col = next((c for c in df.columns if str(c).lower() in text_names), None)

# Not found: fall back to the first two columns
if label_col is None or text_col is None:
    if df.shape[1] < 2:
        # Without this check the selection below fails with an unhelpful KeyError
        raise ValueError(
            "Could not identify label/text columns and the dataset has fewer "
            f"than two columns to fall back on: {list(df.columns)}"
        )
    label_col, text_col = df.columns[0], df.columns[1]

# Drop rows with a missing label or message before the string conversion below;
# otherwise astype(str) would turn the missing value into the literal text "nan".
df = df[[label_col, text_col]].dropna().copy()
df.columns = ["label", "text"]

# Normalize labels so that e.g. " Ham" and "ham" are the same class
df["label"] = df["label"].astype(str).str.lower().str.strip()
df["text"] = df["text"].astype(str)

df.head()

Columns: ['label', 'text']


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# The class distribution and examples

In [75]:
# Number of messages per class
label_counts = df["label"].value_counts()

print("Class counts:")
print(label_counts)

# Fixed seed so the same example rows are shown on every run
print("\nA few example rows:")
df.sample(n=5, random_state=42)

Class counts:
label
ham     4825
spam     747
Name: count, dtype: int64

A few example rows:


Unnamed: 0,label,text
3245,ham,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
944,ham,I sent my scores to sophas and i had to do sec...
1044,spam,We know someone who you know that fancies you....
2484,ham,Only if you promise your getting out as SOON a...
812,spam,Congratulations ur awarded either å£500 of CD ...


# Used to create text cleaning function

The function:

1. Will convert text to lowercase

2. Will remove punctuation

3. Will remove stopwords

The function will be applied to every message

In [78]:
# Mutable copy of scikit-learn's built-in English stopword list
stop_words = {*ENGLISH_STOP_WORDS}

# Built once instead of on every call: deletes each character listed in
# string.punctuation (ASCII punctuation only).
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(message, stopwords=None):
    """Lowercase a message, delete ASCII punctuation, and drop stopwords.

    Parameters
    ----------
    message : str
        The raw message text.
    stopwords : collection of str, optional
        Words to remove after lowercasing and punctuation removal.
        Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing is left).
    """
    if stopwords is None:
        stopwords = stop_words

    # Lowercase, then delete punctuation characters.
    # NOTE(review): punctuation is deleted, not replaced by a space, so words
    # separated only by punctuation are fused ("auction:)punj" -> "auctionpunj").
    # Changing this would alter the vocabulary and every recorded result below.
    msg = message.lower().translate(_PUNCT_TABLE)

    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate normalization step is needed.
    kept = [word for word in msg.split() if word not in stopwords]

    return " ".join(kept)

# Will apply cleaning to dataset

In [81]:
# Clean every message and store the result next to the original text
df["clean_text"] = [clean_text(raw) for raw in df["text"]]

# Before/after comparison on a small, reproducible sample of rows
preview_cols = ["label", "text", "clean_text"]
mini = df.sample(n=5, random_state=7)[preview_cols]
mini

Unnamed: 0,label,text,clean_text
83,ham,You will be in the place of that man,place man
2235,ham,\Si.como no?!listened2the plaid album-quite gd...,sicomo nolistened2the plaid albumquite gdthe n...
2746,ham,K da:)how many page you want?,k dahow page want
246,ham,I asked you to call him now ok,asked ok
3120,ham,Small problem in auction:)punj now asking tiwary,small problem auctionpunj asking tiwary


# Convert text to Bag-of-Words features (CountVectorizer)

Use CountVectorizer to turn the cleaned messages into a Bag-of-Words feature matrix (one row per message, one column per word).

In [84]:
# NOTE(review): the vocabulary is learned from ALL messages before the
# train/test split, so words that occur only in test rows are part of the
# feature space. Fitting on the training messages only would avoid this leak.
corpus = df["clean_text"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # sparse matrix of word counts
y = df["label"]

vocab_size = len(vectorizer.vocabulary_)
print("Feature matrix shape:", X.shape)
print("Example vocabulary size:", vocab_size)

Feature matrix shape: (5572, 9229)
Example vocabulary size: 9229


# Train and test split the data

In [87]:
# 80/20 split; stratify keeps the ham/spam ratio the same in both parts,
# and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print("Train size:", n_train)
print("Test size:", n_test)

Train size: 4457
Test size: 1115


# Train (MultinomialNB)

In [90]:
# fit() returns the estimator itself, so construction and training can be chained
nb_model = MultinomialNB().fit(X_train, y_train)

# Score the held-out test messages
nb_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)

print("Naive Bayes Accuracy:", nb_acc)

Naive Bayes Accuracy: 0.9730941704035875


# Evaluation of the trained model (Naive Bayes)

We can see the following:

1. The Accuracy

2. The confusion matrix

3. The classification report

In [93]:
# Compute both summaries first, then print them.
# Confusion matrix layout: rows are the true classes, columns the predicted ones.
nb_confusion = confusion_matrix(y_test, nb_pred)
nb_report = classification_report(y_test, nb_pred)

print("Confusion Matrix (Naive Bayes):")
print(nb_confusion)

print("\nClassification Report (Naive Bayes):")
print(nb_report)

Confusion Matrix (Naive Bayes):
[[946  20]
 [ 10 139]]

Classification Report (Naive Bayes):
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       966
        spam       0.87      0.93      0.90       149

    accuracy                           0.97      1115
   macro avg       0.93      0.96      0.94      1115
weighted avg       0.97      0.97      0.97      1115



# Testing with 3 of my own messages

Will predict my custom messages

In [96]:
my_messages = [
    "Good Evening, are we still meeting for the class lecture tomorrow at 7:15am?",
    "CONGRATS!! You won $1 million in cash with Publishers Clearing House. Click the link to claim your prize now!!!",
    "Reminder: your vehicle appointment is scheduled for Saturday at 7am with Hyundai Service. Reply YES to confirm."
]

# Same pipeline as the training data: clean -> vectorize (already-fitted
# vocabulary, so transform only) -> predict with the Naive Bayes model
my_clean = list(map(clean_text, my_messages))
my_X = vectorizer.transform(my_clean)
preds = nb_model.predict(my_X)

# Per-class probabilities, when the estimator exposes them
probs = nb_model.predict_proba(my_X) if hasattr(nb_model, "predict_proba") else None

for idx, (raw, cleaned, predicted) in enumerate(zip(my_messages, my_clean, preds)):
    print("\nMessage:", raw)
    print("Cleaned :", cleaned)
    print("Prediction:", predicted)

    if probs is None:
        continue

    # Pair each class name with its probability for this message
    prob_map = dict(zip(nb_model.classes_, probs[idx].tolist()))
    print("Probabilities:", prob_map)


Message: Good Evening, are we still meeting for the class lecture tomorrow at 7:15am?
Cleaned : good evening meeting class lecture tomorrow 715am
Prediction: ham
Probabilities: {'ham': 0.9999994471820166, 'spam': 5.528179854261926e-07}

Message: CONGRATS!! You won $1 million in cash with Publishers Clearing House. Click the link to claim your prize now!!!
Cleaned : congrats won 1 million cash publishers clearing house click link claim prize
Prediction: spam
Probabilities: {'ham': 4.4760394749083895e-08, 'spam': 0.999999955239603}

Message: Reminder: your vehicle appointment is scheduled for Saturday at 7am with Hyundai Service. Reply YES to confirm.
Cleaned : reminder vehicle appointment scheduled saturday 7am hyundai service reply yes confirm
Prediction: spam
Probabilities: {'ham': 0.09533544497911167, 'spam': 0.9046645550208934}


# Reflection

In [99]:
# 1) How well did your model perform?
# The model got about 97% accuracy on the test set, which feels pretty good for a first
# spam classifier using "Bag of Words". The classification report also shows strong precision
# and recall, so it does a good job telling ham and spam apart from each other.

# 2) Were any predictions surprising?
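# Yes. My third test message (the vehicle appointment reminder) was predicted as spam with
# about 90% probability, even though it is a normal message. Some words that usually appear
# in spam can also come up in normal messages, which sometimes seems to confuse the model.
# Also, short messages are harder to classify because there is less text to analyze.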

# 3) What might help improve the model in the future?
# - Use TF-IDF(Term Frequency–Inverse Document Frequency: looks at how important that word is!) instead of raw word counts, so very common words matter less.
# - Try n-grams(lets the model look at small word phrases) to capture short phrases instead of just single words.
# - Tuning parameters such as min_df and max_df to reduce noise.
# - Use cross validation, so the score is not based on just one train/test split.