In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
from nltk.stem import PorterStemmer
import re
import time


In [2]:
# Load the data from the CSV file
data = pd.read_csv("spamEmail.csv")
# Print the first few rows of the data
print(data.head())

num_features = data.shape[1]
print(f"Number of features in the dataset: {num_features}")


                                    email_content label
0        Hello, I hope you're having a great day!   ham
1    Win a free iPhone now by clicking this link!  spam
2        Meeting scheduled for tomorrow at 10 AM.   ham
3  Congratulations! You've won a $1000 gift card.  spam
4    Are you available for a call this afternoon?   ham
Number of features in the dataset: 2


In [3]:
print(data.isnull().sum())  # This will print the count of missing values per column

# Pre-processing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[^{string.printable}]", "", text)
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

data["email_content"] = data["email_content"].apply(preprocess_text)

# Check for missing values
print("Missing values summary:")
print(data.isnull().sum())  # This will print the count of missing values per column


email_content    0
label            0
dtype: int64
Missing values summary:
email_content    0
label            0
dtype: int64


In [4]:
# Split data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(data["email_content"], data["label"], test_size=0.3)


In [5]:
# Feature Engineering (TF-IDF Vectorization)
vectorizer = TfidfVectorizer(max_features=2000, sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [7]:
# Feature Selection using SelectKBest with chi2
selector = SelectKBest(chi2, k=150)  
X_train_features = selector.fit_transform(X_train_tfidf, y_train)
X_test_features = selector.transform(X_test_tfidf)

In [19]:
# Train a model (Multinomial Naive Bayes)
start_time = time.time()
model_nb = MultinomialNB()
model_nb.fit(X_train_features, y_train)

# Predict on test data
y_pred_nb = model_nb.predict(X_test_features)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
error_rate_nb = 1 - accuracy_nb
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
classification_report_nb = classification_report(y_test, y_pred_nb)
execution_time_nb = time.time() - start_time

print(f"Multinomial Naive Bayes Accuracy: {accuracy_nb}")
print(f"Multinomial Naive Bayes Error Rate: {error_rate_nb}")
print(f"Multinomial Naive Bayes Execution Time: {execution_time_nb} seconds")
print("Multinomial Naive Bayes Classification Report:")
print(classification_report_nb)
print("Multinomial Naive Bayes Confusion Matrix:")
print(conf_matrix_nb)


Naive Bayes Accuracy: 0.9285714285714286
Naive Bayes Error Rate: 0.0714285714285714
Naive Bayes Execution Time: 0.024150609970092773 seconds
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         ham       0.93      0.93      0.93        15
        spam       0.92      0.92      0.92        13

    accuracy                           0.93        28
   macro avg       0.93      0.93      0.93        28
weighted avg       0.93      0.93      0.93        28

Naive Bayes Confusion Matrix:
[[14  1]
 [ 1 12]]


In [20]:
# Train a model (Gaussian Naive Bayes)
start_time = time.time()
model_gnb = GaussianNB()
model_gnb.fit(X_train_features.toarray(), y_train)

# Predict on test data
y_pred_gnb = model_gnb.predict(X_test_features.toarray())

# Evaluate the model
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
error_rate_gnb = 1 - accuracy_gnb
conf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
classification_report_gnb = classification_report(y_test, y_pred_gnb)
execution_time_gnb = time.time() - start_time

print(f"Gaussian Naive Bayes Accuracy: {accuracy_gnb}")
print(f"Gaussian Naive Bayes Error Rate: {error_rate_gnb}")
print(f"Gaussian Naive Bayes Execution Time: {execution_time_gnb} seconds")
print("Gaussian Naive Bayes Classification Report:")
print(classification_report_gnb)
print("Gaussian Naive Bayes Confusion Matrix:")
print(conf_matrix_gnb)


Gaussian Naive Bayes Accuracy: 0.9642857142857143
Gaussian Naive Bayes Error Rate: 0.0357142857142857
Gaussian Naive Bayes Execution Time: 0.02864980697631836 seconds
Gaussian Naive Bayes Classification Report:
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97        15
        spam       1.00      0.92      0.96        13

    accuracy                           0.96        28
   macro avg       0.97      0.96      0.96        28
weighted avg       0.97      0.96      0.96        28

Gaussian Naive Bayes Confusion Matrix:
[[15  0]
 [ 1 12]]


In [21]:
# Train a model (Decision Tree)
start_time = time.time()
model_j48 = DecisionTreeClassifier(random_state=42)
model_j48.fit(X_train_features, y_train)

# Predict on test data
y_pred_j48 = model_j48.predict(X_test_features)

# Evaluate the model
accuracy_j48 = accuracy_score(y_test, y_pred_j48)
error_rate_j48 = 1 - accuracy_j48
conf_matrix_j48 = confusion_matrix(y_test, y_pred_j48)
classification_report_j48 = classification_report(y_test, y_pred_j48)
execution_time_j48 = time.time() - start_time

print(f"Decision Tree (J48) Accuracy: {accuracy_j48}")
print(f"Decision Tree (J48) Error Rate: {error_rate_j48}")
print(f"Decision Tree (J48) Execution Time: {execution_time_j48} seconds")
print("Decision Tree (J48) Classification Report:")
print(classification_report_j48)
print("Decision Tree (J48) Confusion Matrix:")
print(conf_matrix_j48)


Decision Tree (J48) Accuracy: 0.8928571428571429
Decision Tree (J48) Error Rate: 0.1071428571428571
Decision Tree (J48) Execution Time: 0.02892780303955078 seconds
Decision Tree (J48) Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.80      0.89        15
        spam       0.81      1.00      0.90        13

    accuracy                           0.89        28
   macro avg       0.91      0.90      0.89        28
weighted avg       0.91      0.89      0.89        28

Decision Tree (J48) Confusion Matrix:
[[12  3]
 [ 0 13]]


In [22]:
results = pd.DataFrame({
    "Algorithm": ["Multinomial Naive Bayes", "Gaussian Naive Bayes", "Decision Tree (J48)"],
    "Accuracy": [accuracy_nb, accuracy_gnb, accuracy_j48],
    "Error Rate": [error_rate_nb, error_rate_gnb, error_rate_j48],
    "Execution Time (s)": [execution_time_nb, execution_time_gnb, execution_time_j48]
})

print(results)


              Algorithm  Accuracy  Error Rate  Execution Time (s)
0           Naive Bayes  0.928571    0.071429            0.024151
1  Gaussian Naive Bayes  0.964286    0.035714            0.028650
2   Decision Tree (J48)  0.892857    0.107143            0.028928
