# Assignment 1: Implement a spam email classifier using two variants of Naïve Bayes models: Naïve Bayes (Multinomial Naïve Bayes) and Gaussian Naïve Bayes

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import nltk
import string
# only run once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mcanthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mcanthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mcanthony/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## 2. Data Preprocessing
#### Step 1.1: Load the dataset


In [2]:
spam_df = pd.read_csv('../datasets/spam_ham_dataset.csv')
# drop column unnamed:0
spam_df.drop(columns='Unnamed: 0', inplace=True)
# inspect the first few rows
spam_df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\nth...,0
1,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\nthis deal is to ...,0


#### Step 1.2: Handle missing Values

In [3]:
# check missing values
print(f'There are {spam_df.isnull().sum().sum()} missing values')
# check for duplicates
print(f'There are {spam_df.duplicated().sum()} duplicates')
# remove the duplicates
spam_df.drop_duplicates(keep='first',inplace=True)

There are 0 missing values
There are 178 duplicates


#### Step 1.3: Split the Dataset

In [4]:
X_text = spam_df[['text']]
y_target = spam_df[['label_num']]

#### Step 1.4: Text Preprocessing

In [5]:
# add features
X_text = X_text.copy()
X_text['length'] = X_text['text'].str.len()
X_text['sent_count'] = X_text['text'].apply(sent_tokenize).apply(len)
X_text['word_count'] = X_text['text'].apply(word_tokenize).apply(len)

In [6]:
X_text.head()

Unnamed: 0,text,length,sent_count,word_count
0,Subject: enron methanol ; meter # : 988291\nth...,322,3,68
1,"Subject: hpl nom for january 9 , 2001\n( see a...",95,3,24
2,"Subject: neon retreat\nho ho ho , we ' re arou...",2515,24,551
3,"Subject: photoshop , windows , office . cheap ...",404,3,49
4,Subject: re : indian springs\nthis deal is to ...,332,3,71


In [7]:
# convert text to lower case
X_text['text_pre'] = X_text['text'].str.lower().apply(word_tokenize)
# remove stop words
stop_words = set(stopwords.words('english'))
# function to remove stopwords: common english words such as "is","the"...etc
def remove_stop_words(col_name, stop_words):
    filtered = []
    for word in col_name:
        if word not in stop_words and word not in string.punctuation:
            filtered.append(word)
    return filtered
X_text['text_pre'] = X_text['text_pre'].apply(lambda x: remove_stop_words(x,stop_words))

In [8]:
X_text.head()

Unnamed: 0,text,length,sent_count,word_count,text_pre
0,Subject: enron methanol ; meter # : 988291\nth...,322,3,68,"[subject, enron, methanol, meter, 988291, foll..."
1,"Subject: hpl nom for january 9 , 2001\n( see a...",95,3,24,"[subject, hpl, nom, january, 9, 2001, see, att..."
2,"Subject: neon retreat\nho ho ho , we ' re arou...",2515,24,551,"[subject, neon, retreat, ho, ho, ho, around, w..."
3,"Subject: photoshop , windows , office . cheap ...",404,3,49,"[subject, photoshop, windows, office, cheap, m..."
4,Subject: re : indian springs\nthis deal is to ...,332,3,71,"[subject, indian, springs, deal, book, teco, p..."


In [9]:
# convert text to numerical values, we are using the converted vectors as our feature
vectorizer = TfidfVectorizer(lowercase=True)
# vectorze and convert the sparse matrix to a numpy array
X_vec = vectorizer.fit_transform(X_text['text']).toarray()
y = y_target['label_num']

## 3. Modeling

#### Step 3.1: Split the data into training and testing sets

In [10]:
# Split into Training and Test Data
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=26)

# Print the size of the training and test set
print(f"Training Set Size: {X_train.shape}")
print(f"Test Set Size: {X_test.shape}")

Training Set Size: (3994, 50447)
Test Set Size: (999, 50447)


#### Step 3.2: Train the model using Multinomial Naïve Bayes

In [11]:
# Initialize the Multinomial Naive Bayes Model
nb_model = MultinomialNB()

# Train model
nb_model.fit(X_train, y_train)

#### Step 3.3: Make Predictions

In [12]:
y_pred = nb_model.predict(X_test)

# Print the predicted and actual labels
print("\nPredictions:", y_pred)
print("Actual Labels:", y_test.values)


Predictions: [0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0
 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 

#### Step 3.4: Train the model using Gaussian Naïve Bayes Model

In [13]:
# Initialize the Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model
gnb.fit(X_train, y_train)

# Print the model parameters and the variance
print("Model Parameters (Mean and Variance):")
print(f"Class 0 (Not Spam) - Mean: {gnb.theta_[0]}, Variance: {gnb.var_[0]}")
print(f"Class 1 (Spam) - Mean: {gnb.theta_[1]}, Variance: {gnb.var_[1]}")

Model Parameters (Mean and Variance):
Class 0 (Not Spam) - Mean: [1.85084591e-02 3.05391500e-02 5.72721356e-05 ... 0.00000000e+00
 0.00000000e+00 0.00000000e+00], Variance: [2.80493903e-03 6.59191134e-03 4.53773557e-06 ... 1.79777276e-11
 1.79777276e-11 1.79777276e-11]
Class 1 (Spam) - Mean: [6.88346055e-03 3.08756574e-03 3.48373925e-04 ... 0.00000000e+00
 2.56973370e-04 2.11926561e-05], Variance: [1.19866654e-03 1.70523950e-04 4.46649785e-05 ... 1.79777276e-11
 7.75915104e-05 5.27744169e-07]
