In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [9]:
# Load the dataset
# You may need to download NLTK data first
# nltk.download('stopwords')
# nltk.download('punkt')

# The path is defined outside the try block so the error message below can
# report exactly which location was searched.
file_path = 'data_files/spam.csv'

try:
    df = pd.read_csv(file_path, encoding='latin-1')
except FileNotFoundError:
    # 'df' is deliberately left undefined here: the later cells check
    # `'df' in locals()` and skip their work when loading failed.
    print(f"Error: '{file_path}' not found!!")
    # You could add code here to exit or handle the error


In [11]:
# Data cleaning: keep only the label/text columns and give them readable names
if 'df' in locals():
    if {'v1', 'v2'}.issubset(df.columns):
        # Select and rename in one expression. On a re-run of this cell the
        # columns are already called 'label'/'message', so this branch is
        # skipped instead of failing with a KeyError on 'v1'/'v2'.
        df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
    elif not {'label', 'message'}.issubset(df.columns):
        # Neither the raw nor the cleaned schema is present: fail loudly here
        # rather than with a confusing error further down the notebook.
        raise KeyError(
            "Expected columns 'v1' and 'v2' (raw) or 'label' and 'message' "
            f"(cleaned); found: {list(df.columns)}"
        )

    print(df.head())
    print("\nDataset shape:", df.shape)
    print("\nSpam vs. Ham counts:")
    print(df['label'].value_counts())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Dataset shape: (5572, 2)

Spam vs. Ham counts:
label
ham     4825
spam     747
Name: count, dtype: int64


##### Preprocess the Text

In [14]:
# Initialize stemmer and stop words
stemmer = PorterStemmer()

try:
    # A set gives constant-time membership checks during preprocessing
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The NLTK 'stopwords' corpus is not installed in this environment.
    # Fetch it once and retry so the notebook runs from a fresh kernel.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [16]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call does a single C-level pass over the text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """
    Cleans and preprocesses a single message.

    Steps, in order:
    1. Lowercase
    2. Remove punctuation (every character in ``string.punctuation``)
    3. Split on whitespace and remove stop words
    4. Apply stemming

    Parameters
    ----------
    text : str
        Raw message text. ``None`` and missing values (NaN / ``pd.NA``) are
        treated as an empty message; any other non-string value is converted
        with ``str()`` before processing.

    Returns
    -------
    str
        The remaining stemmed tokens joined by single spaces, or ``''`` when
        nothing is left.

    Notes
    -----
    Relies on the module-level ``stemmer`` and ``stop_words`` objects.
    """
    if not isinstance(text, str):
        # A missing CSV cell arrives as NaN, which has no .lower(); treat it
        # (and None) as an empty message instead of raising AttributeError.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    # 1. Lowercase, 2. Remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # 3. Tokenize on whitespace and drop stop words, 4. stem what remains.
    # NOTE(review): punctuation is stripped before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" (the recorded output
    # shows "dont" surviving). Presumably such contractions therefore never
    # match the stop-word list -- confirm before changing, since reordering
    # would alter the trained vocabulary.
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]

    # Join words back into a single string
    return ' '.join(words)

In [18]:
# Build the 'cleaned_message' column by running every raw message through
# preprocess_text, then show raw and cleaned text side by side.
if 'df' in locals():
    print("\nPreprocessing text... (This may take a moment)")
    cleaned_series = df['message'].map(preprocess_text)
    df['cleaned_message'] = cleaned_series
    print("Preprocessing complete.")

    preview_columns = ['message', 'cleaned_message']
    print(df[preview_columns].head())


Preprocessing text... (This may take a moment)
Preprocessing complete.
                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  


##### Map Labels and Split Data

In [21]:
if 'df' in locals():
    # Encode the text labels as integers: ham -> 0, spam -> 1
    label_to_id = {'ham': 0, 'spam': 1}
    df['label_num'] = df['label'].map(label_to_id)

    # Features (X) are the cleaned messages; the target (y) is the numeric label
    X, y = df['cleaned_message'], df['label_num']

    # Hold out 20% of the rows for testing. The fixed random_state makes the
    # split repeatable, and stratify=y keeps the ham/spam ratio the same in
    # both parts.
    split_options = dict(test_size=0.2, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    print(f"\nTraining data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")


Training data shape: (4457,)
Testing data shape: (1115,)


##### Vectorization (Bag-of-Words vs. TF-IDF)

In [24]:
if 'df' in locals():
    # Bag-of-Words: represent each message by its token counts
    bow_vectorizer = CountVectorizer()

    # The tuple is evaluated left to right: the vocabulary is learned from the
    # training split first, and the test split is then encoded with that same
    # vocabulary (it is never fitted on test data).
    X_train_bow, X_test_bow = (
        bow_vectorizer.fit_transform(X_train),
        bow_vectorizer.transform(X_test),
    )

    print(f"\nBoW Vectorized Training Data Shape: {X_train_bow.shape}")


BoW Vectorized Training Data Shape: (4457, 7135)


In [26]:
if 'df' in locals():
    # TF-IDF: token counts re-weighted by how rare each token is
    tfidf_vectorizer = TfidfVectorizer()

    # The tuple is evaluated left to right: vocabulary and weights are learned
    # from the training split first, and the test split is then encoded with
    # them (it is never fitted on test data).
    X_train_tfidf, X_test_tfidf = (
        tfidf_vectorizer.fit_transform(X_train),
        tfidf_vectorizer.transform(X_test),
    )

    print(f"TF-IDF Vectorized Training Data Shape: {X_train_tfidf.shape}")

TF-IDF Vectorized Training Data Shape: (4457, 7135)


##### Train and Evaluate Models

In [29]:
def train_and_evaluate(X_train, y_train, X_test, y_test, method_name):
    """
    Trains a Multinomial Naive Bayes model and prints its evaluation.

    Parameters
    ----------
    X_train, X_test
        Vectorized training / testing features, passed straight to
        ``MultinomialNB.fit`` and ``MultinomialNB.predict``.
    y_train, y_test
        Numeric class labels, where 0 is ham and 1 is spam.
    method_name : str
        Name of the vectorization method; used only in the printed header.

    Returns
    -------
    MultinomialNB
        The fitted model.

    Prints the accuracy, the confusion matrix and the classification report
    for the test split.
    """
    # Pin the label set explicitly. Without it, classification_report raises
    # ValueError when only one class shows up in y_test/y_pred (two
    # target_names for a single class), and the confusion matrix would change
    # shape. With it, the matrix is always 2x2 with rows/columns ordered
    # ham (0), spam (1).
    class_labels = [0, 1]
    class_names = ['Ham (0)', 'Spam (1)']

    print(f"\n--- Model Evaluation for: {method_name} ---")

    # 1. Train the model
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # 2. Make predictions
    y_pred = model.predict(X_test)

    # 3. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    class_report = classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_names
    )

    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)

    return model

In [31]:
if 'df' in locals():
    # Same classifier, two feature representations: counts first, then TF-IDF.
    # Keyword arguments make it explicit which matrix plays which role.
    model_bow = train_and_evaluate(
        X_train=X_train_bow,
        y_train=y_train,
        X_test=X_test_bow,
        y_test=y_test,
        method_name="Bag-of-Words (BoW)",
    )

    model_tfidf = train_and_evaluate(
        X_train=X_train_tfidf,
        y_train=y_train,
        X_test=X_test_tfidf,
        y_test=y_test,
        method_name="TF-IDF",
    )


--- Model Evaluation for: Bag-of-Words (BoW) ---
Accuracy: 0.9830

Confusion Matrix:
[[962   4]
 [ 15 134]]

Classification Report:
              precision    recall  f1-score   support

     Ham (0)       0.98      1.00      0.99       966
    Spam (1)       0.97      0.90      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


--- Model Evaluation for: TF-IDF ---
Accuracy: 0.9623

Confusion Matrix:
[[965   1]
 [ 41 108]]

Classification Report:
              precision    recall  f1-score   support

     Ham (0)       0.96      1.00      0.98       966
    Spam (1)       0.99      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



Confusion Matrix: Rows are the actual classes (Ham, Spam) and columns are the predicted classes (Predicted Ham, Predicted Spam). Look at the false positives (top-right cell: predicted Spam, but was Ham) and the false negatives (bottom-left cell: predicted Ham, but was Spam).

Precision (Spam): Of all the messages we labeled as spam, what percentage were actually spam? (High precision is good: it means we rarely annoy users by flagging legitimate mail as spam.)

Recall (Spam): Of all the actual spam messages, what percentage did we catch? (High recall is good: it means we protect users from most spam.)

F1-Score: The harmonic mean of Precision and Recall. A good single metric for an imbalanced dataset like this.

##### Test Your Model

In [36]:
def predict_spam(message, vectorizer, model):
    """
    Classify one new message as "Spam" or "Ham".

    The message is cleaned with preprocess_text, turned into a feature
    vector by the given (already fitted) vectorizer, and scored by the
    given (already trained) model.

    Returns the string "Spam" when the predicted label is 1, otherwise "Ham".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = vectorizer.transform([preprocess_text(message)])

    # predict returns one label per input row; only one row was supplied
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Spam"
    return "Ham"

In [38]:
# --- Example Usage (run this after training 'model_tfidf') ---
if 'model_tfidf' in locals():
    test_msg_1 = "Congratulations! You've won a $1000 gift card. Click here to claim: www.fake.com"
    test_msg_2 = "Hey, are you free for dinner tonight at 7?"

    # Classify both samples with the TF-IDF vectorizer/model pair
    prediction_1 = predict_spam(test_msg_1, tfidf_vectorizer, model_tfidf)
    prediction_2 = predict_spam(test_msg_2, tfidf_vectorizer, model_tfidf)

    print(f"\nTesting new messages with TF-IDF model:")
    print(f"Message: '{test_msg_1}' \nPrediction: {prediction_1}")
    print(f"\nMessage: '{test_msg_2}' \nPrediction: {prediction_2}")


Testing new messages with TF-IDF model:
Message: 'Congratulations! You've won a $1000 gift card. Click here to claim: www.fake.com' 
Prediction: Spam

Message: 'Hey, are you free for dinner tonight at 7?' 
Prediction: Ham
