In [2]:
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Fetch the NLTK resources used later in this notebook: the stop-word lists
# read via stopwords.words('english') and the WordNet corpus for the
# WordNetLemmatizer. The second call is the cell's last expression, so its
# return value is what the notebook displays as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Location of the raw spam/ham CSV.
dataset_url = (
    "https://raw.githubusercontent.com/datasets/spam-dataset/main/spam.csv"
)

# The file is decoded as latin-1 rather than pandas' default UTF-8.
data = pd.read_csv(
    filepath_or_buffer=dataset_url,
    encoding="latin-1",
)

In [None]:
# Quick look at the first rows of the raw frame.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

# Rename columns for clarity (if needed)
data = data.rename(columns={"v1": "label", "v2": "email_content"})

# Keep only the two columns the rest of the notebook works with.
data = data[["label", "email_content"]]


In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The dtype guard keeps this cell idempotent: Series.map() turns every value
# that is missing from the mapping into NaN, so re-running the cell on labels
# that were already encoded would otherwise wipe the whole column.
if not pd.api.types.is_numeric_dtype(data['label']):
    encoded_labels = data['label'].map({"ham": 0, "spam": 1})
    # Fail here, with a clear message, instead of passing NaN labels on to
    # the classifier.
    if encoded_labels.isna().any():
        raise ValueError(
            "Column 'label' contains values other than 'ham' and 'spam'."
        )
    data['label'] = encoded_labels


# Shared text-cleaning resources used by preprocess_text().
# A set gives constant-time membership checks for the stop-word filter.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalize a raw message into a space-joined string of cleaned tokens.

    Processing steps, in order:
      1. replace every non-word character with a space,
      2. lowercase the text,
      3. split on whitespace,
      4. drop tokens found in the module-level ``stop_words`` set,
      5. lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Raw message text. Missing values (``None`` or a float NaN, as
            pandas uses for empty cells) are treated as an empty message;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The processed tokens joined by single spaces (``''`` when no
        token survives).
    """
    # Missing values would make re.sub() raise TypeError; treat them as empty.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    cleaned = re.sub(r'\W', ' ', text)  # Remove non-word characters
    tokens = cleaned.lower().split()  # Lowercase, then tokenize on whitespace
    # Remove stopwords first, then lemmatize what is left
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmas)

In [None]:
# Clean every message and keep the result next to the raw text.
data['processed_content'] = data['email_content'].apply(preprocess_text)

# Check preprocessed data
print(data['processed_content'].head())

# Initialize TF-IDF Vectorizer (vocabulary capped at 5000 terms)
tfidf = TfidfVectorizer(max_features=5000)

# Transform text data to feature matrix
# fit_transform() returns a SciPy sparse matrix. It is kept sparse on purpose:
# converting to a dense array would allocate rows x 5000 floats that are almost
# all zero, and train_test_split / MultinomialNB accept sparse input directly.
X = tfidf.fit_transform(data['processed_content'])

In [None]:
y = data['label']

# **5. Train-Test Split**
# Split the preprocessed *text* rather than an already-vectorized matrix, so
# the vectorizer can be fitted on the training portion only. Fitting TF-IDF on
# the full corpus lets the held-out messages influence the vocabulary and the
# IDF weights, which leaks test information into training and inflates the
# evaluation scores.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_content'], y, test_size=0.2, random_state=42
)

# (Re-)fit the vectorizer on the training text, then apply that fitted
# vocabulary and IDF weighting unchanged to the held-out text. Both results
# are sparse matrices, which MultinomialNB consumes directly.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# **6. Model Training**
# Initialize Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# **7. Model Evaluation**
# Predict on the test set
y_pred = model.predict(X_test)

In [None]:
# Compare the held-out labels with the model's predictions.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# **8. Save and Load Model (Optional)**
# The classifier and the vectorizer are persisted as a pair: new text has to
# be transformed by the same fitted vectorizer before the classifier sees it.

# Save the trained model
with open('spam_detector_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

# NOTE: pickle.load() can execute arbitrary code while deserializing. Only
# load pickle files written by this notebook or obtained from a trusted source.

# Load the trained model
with open('spam_detector_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)

# **9. Real-Time Testing**
# Function to classify new emails
def classify_email(email_text):
    """Label a single raw email as "Spam" or "Not Spam".

    The text is cleaned with ``preprocess_text``, vectorized with the reloaded
    TF-IDF vectorizer (``loaded_tfidf``) and scored by the reloaded classifier
    (``loaded_model``).

    Args:
        email_text: Raw email text to classify.

    Returns:
        str: "Spam" when the predicted class is 1, otherwise "Not Spam".
    """
    cleaned = preprocess_text(email_text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = loaded_tfidf.transform([cleaned]).toarray()
    predicted_class = loaded_model.predict(features)[0]
    if predicted_class == 1:
        return "Spam"
    return "Not Spam"

# Smoke-test the classifier on one hand-written message
sample_email = "Congratulations! You have won a free iPhone. Claim now!"
sample_verdict = classify_email(sample_email)
print(f"Email: {sample_email}\nClassification: {sample_verdict}")
