In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset from CSV
def load_data(file_path, *, encoding='latin-1'):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'latin-1'``, which is what this loader always used; pass
            another codec (e.g. ``'utf-8'``) for files stored differently.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, encoding=encoding)

# Data preprocessing and splitting into train and test sets
def preprocess_and_split(data):
    """Encode the labels as integers and split the data into train/test sets.

    Args:
        data: DataFrame with a ``'v1'`` column holding the labels
            (``'ham'`` or ``'spam'``) and a ``'v2'`` column holding the
            raw message text.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from an 80/20 split with
        ``random_state=42``; labels are 0 for 'ham' and 1 for 'spam'.

    Raises:
        ValueError: If ``'v1'`` contains any value other than 'ham' or
            'spam' (including missing values).
    """
    X = data['v2']  # raw text
    raw_labels = data['v1']

    # Convert labels to binary values: 0 for 'ham', 1 for 'spam'
    y = raw_labels.map({'ham': 0, 'spam': 1})

    # Series.map yields NaN for every value missing from the dict. Fail here
    # with the offending labels instead of handing NaN targets to the model.
    unmapped = y.isna()
    if unmapped.any():
        bad_values = sorted(raw_labels[unmapped].astype(str).unique())
        raise ValueError(
            f"Unexpected label(s) in column 'v1': {bad_values}; "
            "expected 'ham' or 'spam'"
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

# Train the Naive Bayes classifier
def train_classifier(X_train, y_train):
    """Fit a token-count vectorizer and a multinomial Naive Bayes model.

    Args:
        X_train: Training documents passed to ``CountVectorizer.fit_transform``.
        y_train: Training labels passed to ``MultinomialNB.fit``.

    Returns:
        ``(classifier, vectorizer)``: the fitted model and the fitted
        vectorizer needed to encode new text the same way.
    """
    vectorizer = CountVectorizer()
    # The vocabulary is learned from the training documents here.
    train_counts = vectorizer.fit_transform(X_train)
    # fit() returns the estimator itself, so construction and fitting chain.
    classifier = MultinomialNB().fit(train_counts, y_train)
    return classifier, vectorizer

# Test the classifier and get accuracy
def test_classifier(classifier, vectorizer, X_test, y_test):
    """Score the classifier on the given test documents.

    Args:
        classifier: Fitted model exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        X_test: Test documents to encode and classify.
        y_test: True labels for ``X_test``.

    Returns:
        ``(accuracy, report)``: the result of ``accuracy_score`` and the
        text produced by ``classification_report``.
    """
    predictions = classifier.predict(vectorizer.transform(X_test))
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
    )

def test_sample_emails(classifier, vectorizer):
    """Print predicted and expected labels for a few hand-written messages.

    Args:
        classifier: Fitted model exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        None. The results are written to stdout.
    """
    # Single source of truth for turning numeric labels into names.
    label_names = {0: 'ham', 1: 'spam'}

    sample_emails = [
        "Congratulations! You have won a free iPhone. Reply now to claim.",
        "Hi, just checking to see how you're doing. Let's catch up.",
        "URGENT: Your account will be locked if you don't verify your details.",
        "Hey, Let's meet tomorrow at 10 AM.",
    ]

    # Expected label for each sample above: 0 for 'ham', 1 for 'spam'
    actual_labels = [1, 0, 1, 0]

    sample_emails_vectorized = vectorizer.transform(sample_emails)
    predicted_labels = pd.Series(
        classifier.predict(sample_emails_vectorized)
    ).map(label_names)

    # Print the predictions along with the actual labels
    print("\nSample Email Predictions:")
    for email, predicted, actual in zip(sample_emails, predicted_labels, actual_labels):
        print(f"Email: {email}")
        print(f"Predicted Label: {predicted}, Actual Label: {label_names[actual]}")
        print("--------------------------")


if __name__ == "__main__":
    # End-to-end run: load the CSV, split it, train the model, then report
    # metrics on the test split.
    data = load_data('spam.csv')
    X_train, X_test, y_train, y_test = preprocess_and_split(data)
    classifier, vectorizer = train_classifier(X_train, y_train)
    accuracy, report = test_classifier(classifier, vectorizer, X_test, y_test)
    print(f"Accuracy: {accuracy}")
    # print() separates its arguments with a space, so the report text
    # starts one space in on the line after the heading.
    print("Classification Report:\n", report)
    # Print predictions for the built-in sample messages.
    test_sample_emails(classifier, vectorizer)


Accuracy: 0.9838565022421525
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Sample Email Predictions:
Email: Congratulations! You have won a free iPhone. Reply now to claim.
Predicted Label: spam, Actual Label: spam
--------------------------
Email: Hi, just checking to see how you're doing. Let's catch up.
Predicted Label: ham, Actual Label: ham
--------------------------
Email: URGENT: Your account will be locked if you don't verify your details.
Predicted Label: spam, Actual Label: spam
--------------------------
Email: Hey, Let's meet tomorrow at 10 AM.
Predicted Label: ham, Actual Label: ham
--------------------------
