Develop a classification model to automatically detect and filter out spam emails from a user's inbox.

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
# Fetch the NLTK 'stopwords' corpus; the preprocessing step below reads it
# through stopwords.words('english').
nltk.download('stopwords')
# Step 1: Data Collection and Preprocessing
# Patterns are compiled once because preprocess_text runs once per email.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')

# Lazily filled cache for the NLTK resources, so the stop-word list is read
# and the stemmer is constructed only once instead of on every call.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise raw email text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, drop digit runs, split on whitespace, remove
    English stop words, and Porter-stem the remaining words.

    Args:
        text: The raw text. ``None`` and float NaN (e.g. an empty CSV cell
            loaded by pandas) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    words = text.split()
    if not words:
        # Nothing to filter or stem, so skip loading the NLTK resources.
        return ''

    # NOTE(review): punctuation is stripped before stop-word filtering, so any
    # stop-word entries that contain an apostrophe can never match here.
    # Confirm whether that is intended.
    stop_words, stemmer = _get_text_tools()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)
# Read the labelled corpus (the CSV is assumed to provide 'text' and 'label'
# columns) and add a cleaned copy of every message.
data = pd.read_csv('/spam_ham_dataset.csv')
data['processed_text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Step 2: Feature Extraction
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 3: Model Selection and Training
# fit() returns the estimator itself, so construction and training can be chained.
classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Step 4: Model Evaluation
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
# The three class-specific scores all treat 'spam' as the positive label.
precision, recall, f1 = (
    score(y_test, y_pred, pos_label='spam')
    for score in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
# Step 5: Model Deployment and Use
def classify_email(email_text, classifier, vectorizer, spam_label='spam'):
    """Classify a single email and report how likely it is to be spam.

    Args:
        email_text: Raw text of the email; it is cleaned with
            ``preprocess_text`` before vectorisation.
        classifier: A fitted classifier exposing ``predict``,
            ``predict_proba`` and ``classes_``.
        vectorizer: The fitted vectorizer used to train ``classifier``.
        spam_label: The class label that denotes spam. It selects which
            ``predict_proba`` column is reported.

    Returns:
        A ``(prediction, spam_prob)`` tuple: the predicted class label and
        the predicted probability of the spam class.
    """
    processed_email = preprocess_text(email_text)
    vectorized_email = vectorizer.transform([processed_email])
    prediction = classifier.predict(vectorized_email)

    # predict_proba columns follow classifier.classes_, so locate the spam
    # column by label instead of assuming it is always the second one.
    classes = list(classifier.classes_)
    if spam_label in classes:
        spam_column = classes.index(spam_label)
    else:
        # Label not present (e.g. numerically encoded classes): keep the
        # historical behaviour of reporting the second column.
        spam_column = 1

    spam_prob = classifier.predict_proba(vectorized_email)[0][spam_column]
    return prediction[0], spam_prob
# Example usage: run one hand-written message through the trained model
new_email = "Get rich quick! Buy our amazing product now!"
# classify_email returns the predicted label and the spam probability
prediction, spam_probability = classify_email(new_email, classifier, vectorizer)
# Show a display string for the label and the probability to two decimals
print(f"Email classification: {'Spam' if prediction == 'spam' else 'Not Spam'}")
print(f"Spam probability: {spam_probability:.2f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.97
Precision: 0.95
Recall: 0.95
F1-score: 0.95
Confusion Matrix:
[[728  14]
 [ 14 279]]
Email classification: Spam
Spam probability: 0.93
