In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Load Data
# Single place to change the dataset location.
# NOTE(review): '/content' looks like a hosted-notebook upload directory — confirm for your environment.
DATA_PATH = '/content/spam.csv'

try:
    # latin-1 is used because the file is not read as UTF-8 here.
    data = pd.read_csv(DATA_PATH, encoding='latin-1')
except FileNotFoundError as err:
    # Re-raise instead of print + exit(): exit() ends the session with a
    # success status, which hides the failure and, in a notebook, discards
    # the interactive state. Raising stops the cell with a visible traceback.
    raise FileNotFoundError(
        f"Error: dataset not found at '{DATA_PATH}'. "
        "Please download spam.csv and place it at that path."
    ) from err

# Drop unnecessary columns and rename for clarity.
# Chaining rename() onto the column selection produces an independent frame,
# so the label assignment below never writes into a slice of the raw frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# 2. Pre-processing
# Map 'ham' (not spam) to 0 and 'spam' to 1
data = data.assign(label=data['label'].map({'ham': 0, 'spam': 1}))

# map() turns any value missing from the dictionary into NaN; stop here with
# a clear message rather than letting NaN targets reach the classifier.
if data['label'].isna().any():
    raise ValueError("Unexpected label value: expected only 'ham' or 'spam' in column 'v1'.")

# Separate features (message) and target (label)
X = data['message']
y = data['label']

# Split data into training and testing sets (80% train, 20% test).
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Feature Extraction (TF-IDF)
# Turn every message into a numeric TF-IDF vector. Text is lower-cased and
# English stop words are removed before the weights are computed.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# The vocabulary is learned from the training messages only, and those
# messages are vectorized in the same call.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Test messages are only transformed, reusing the vocabulary learned above.
X_test_tfidf = vectorizer.transform(X_test)

# 4. Build and Train Model
# Multinomial Naive Bayes classifier; fit() returns the estimator itself,
# so construction and training are written as one expression.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# 5. Evaluate Model
# Predict labels for the held-out test messages
y_pred = model.predict(X_test_tfidf)

# Overall accuracy on the test set, reported as a percentage
accuracy = accuracy_score(y_test, y_pred)
print("--- Model Evaluation ---")
print("Accuracy: {:.2f}%\n".format(accuracy * 100))

# Per-class precision / recall / F1 (label 0 is shown as Ham, label 1 as Spam)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# Counts of true vs. predicted labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n{}\n".format("=" * 30))

# 6. Test with New Predictions
print("--- Test with New Emails ---")

# Create a function to test new emails
def predict_spam(email_text):
    """Classify one piece of text as spam or ham.

    The text is vectorized with the module-level ``vectorizer`` and scored by
    the module-level ``model``; both names are resolved at call time.

    Args:
        email_text: Raw text of the message to classify.

    Returns:
        "Spam" when the predicted label equals 1, otherwise "Ham (Not Spam)".
    """
    # The text is wrapped in a one-element list because transform() is given
    # a collection of documents, not a bare string.
    features = vectorizer.transform([email_text])
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Spam"
    return "Ham (Not Spam)"

# Test examples
email_1 = "Congratulations! You've won a $1,000 Walmart gift card. Go to http://example.com to claim now."
email_2 = "Hey, are we still on for the meeting tomorrow at 2 PM?"
email_3 = "URGENT: Your account has been suspended. Click here to verify your details immediately."

# Echo each sample, then the classifier's verdict, in the order defined above.
for sample in (email_1, email_2, email_3):
    print(f"Email: '{sample}'")
    print(f"Prediction: {predict_spam(sample)}\n")

--- Model Evaluation ---
Accuracy: 96.68%

Classification Report:
              precision    recall  f1-score   support

         Ham       0.96      1.00      0.98       965
        Spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
[[965   0]
 [ 37 113]]


--- Test with New Emails ---
Email: 'Congratulations! You've won a $1,000 Walmart gift card. Go to http://example.com to claim now.'
Prediction: Spam

Email: 'Hey, are we still on for the meeting tomorrow at 2 PM?'
Prediction: Ham (Not Spam)

Email: 'URGENT: Your account has been suspended. Click here to verify your details immediately.'
Prediction: Ham (Not Spam)



In [1]:
# Installs pandas and scikit-learn via the %pip magic.
# NOTE(review): this cell sits after the analysis cell; if the packages are
# not already installed, it has to be run before the imports at the top.
%pip install pandas scikit-learn

