In [1]:
# Email Spam Classification - Week 03 Task

# Section 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

nltk.download('stopwords')

# Section 2: Load Dataset
# Replace with your dataset path
data = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
data.columns = ['label', 'message']

# Section 3: Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    return " ".join(text)

data['clean_message'] = data['message'].apply(clean_text)
data['label_num'] = data['label'].map({'ham': 0, 'spam': 1})

# Section 4: Vectorization
cv = CountVectorizer()
x_count = cv.fit_transform(data['clean_message'])

tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(data['clean_message'])

y = data['label_num']

# Section 5: Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.2, random_state=42)

# Section 6: Model Building & Evaluation
def evaluate_model(model, name):
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("Classification Report:\n", classification_report(y_test, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print("\n")

# Multinomial Naive Bayes
evaluate_model(MultinomialNB(), "Multinomial Naive Bayes")

# Logistic Regression
evaluate_model(LogisticRegression(max_iter=1000), "Logistic Regression")

# Support Vector Machine
evaluate_model(SVC(), "Support Vector Machine")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaifu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--- Multinomial Naive Bayes ---
Accuracy: 0.9659192825112107
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
 [[965   0]
 [ 38 112]]


--- Logistic Regression ---
Accuracy: 0.9551569506726457
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.69      0.81       150

    accuracy                           0.96      1115
   macro avg       0.96      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115

Confusion Matrix:
 [[961   4]
 [ 46 104]]


--- Support Vector Machine ---
Accuracy: 0.9775784753363229
Classification Report:
         