In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load dataset (SMS Spam Collection dataset)
# The file is read as tab-separated with no header row, so the column names
# are supplied explicitly.
# NOTE(review): the URL ends in ".csv" but is parsed with sep="\t" -- confirm
# the path still resolves and really is tab-delimited.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms-spam-collection.csv"
df = pd.read_csv(url, names=["label", "message"], sep="\t")

# Step 3: Convert labels (ham = 0, spam = 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# Series.map() yields NaN for any value missing from the mapping; fail here
# with a clear message instead of a less obvious error inside model.fit().
if df['label'].isna().any():
    raise ValueError("Unexpected label values: expected only 'ham' or 'spam'")
y = df["label"]

# Step 4: Split the dataset (80% training, 20% testing)
# The split is done on the RAW messages, before any feature extraction, so
# that nothing learned from the test rows can leak into training.
messages_train, messages_test, y_train, y_test = train_test_split(
    df["message"], y, test_size=0.2, random_state=42
)

# Step 5: Text preprocessing (Convert text to numerical features)
# Fit the TF-IDF vocabulary and IDF weights on the training messages only,
# then reuse that fitted vectorizer to transform the held-out test messages.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(messages_train)
X_test = vectorizer.transform(messages_test)

# Step 6: Train a Naïve Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 7: Predict on test data
y_pred = model.predict(X_test)

# Step 8: Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# Step 9: Testing with new data
# New messages must go through the same fitted vectorizer (transform, not
# fit_transform) so their columns line up with the features the model saw.
sample_messages = ["You have won $5000! Claim now!", "Hey, are we meeting today?"]
sample_vectors = vectorizer.transform(sample_messages)
predictions = model.predict(sample_vectors)

# Display predictions
for msg, pred in zip(sample_messages, predictions):
    label = "Spam" if pred == 1 else "Not Spam"
    print(f"Message: {msg} --> Prediction: {label}")
