In [1]:
import pandas as pd

# Load the spam email classification dataset.
# 'emails.csv' is resolved relative to the notebook's working directory;
# change the path here if your copy of the dataset lives elsewhere.
df = pd.read_csv('emails.csv')

# Fail fast with a clear message if the expected columns are absent: the
# train/test split further down reads df['text'] (raw email text) and
# df['spam'] (target label).
missing_columns = {'text', 'spam'} - set(df.columns)
assert not missing_columns, f'emails.csv is missing expected columns: {sorted(missing_columns)}'

# Display the first few rows of the dataset
print(df.head())


                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer up front: English stop words are dropped and
# the vocabulary is limited to at most 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Hold out 20% of the emails for evaluation; the fixed random_state keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['spam'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary and IDF weights from the training emails only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# ... then encode the held-out emails with that same fitted vectorizer,
# so nothing from the test set influences the features.
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the Multinomial Naive Bayes classifier and fit it on the TF-IDF
# training matrix in one step (fit() returns the estimator itself).
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out emails
y_pred = clf.predict(X_test_tfidf)

# Headline metric: overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Remaining diagnostics, each printed under its own heading
for heading, metric in (
    ('\nClassification Report:', classification_report),
    ('\nConfusion Matrix:', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       0.99      0.93      0.96       290

    accuracy                           0.98      1146
   macro avg       0.98      0.96      0.97      1146
weighted avg       0.98      0.98      0.98      1146


Confusion Matrix:
[[852   4]
 [ 21 269]]


In [5]:
# Example of predicting a new email.
# The message is wrapped in a one-element list because the vectorizer is
# given a collection of documents rather than a bare string.
new_email = ["Get rich quick! Send money now!"]
# transform() only (no refit): the vectorizer fitted earlier in the notebook
# is reused as-is to encode the new message.
new_email_tfidf = tfidf_vectorizer.transform(new_email)

# predict() yields one label per input document; index 0 is this message.
# NOTE(review): presumably 1 = spam and 0 = not spam, matching the `spam`
# column of the dataset — confirm against the data source.
prediction = clf.predict(new_email_tfidf)
print(f'Predicted Label: {prediction[0]}')

Predicted Label: 1
