<a href="https://colab.research.google.com/github/Liza-IITP/Linear-Logistic/blob/main/Spam_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score




In [6]:
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
# --- Load the SMS spam corpus, split it, and build binary term features ---

# Tunable settings, named here instead of being buried as magic numbers.
TEST_FRACTION = 0.2  # share of messages held out for evaluation
SPLIT_SEED = 42      # fixed seed so the train/test split is reproducible
MIN_DOC_FREQ = 5     # ignore terms found in fewer than this many training messages

# Keep the object returned by load_dataset under its own name so that `df`
# only ever refers to the pandas DataFrame built from its 'train' split.
sms_dataset = load_dataset("ucirvine/sms_spam")

df = pd.DataFrame(sms_dataset['train'])

# Fail early with a readable message if the columns read below are absent.
missing_columns = {'sms', 'label'} - set(df.columns)
assert not missing_columns, f"dataset is missing expected columns: {sorted(missing_columns)}"

X = df['sms']
y = df['label']

# NOTE(review): the split is not stratified. If the label distribution is
# imbalanced, consider passing stratify=y; this is left unchanged here so the
# metrics recorded in this notebook remain reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# binary=True records presence/absence of each term rather than its count.
# The vocabulary is fitted on the training text only and then reused, as-is,
# to transform the held-out text.
vectorizer = CountVectorizer(binary=True, stop_words="english", min_df=MIN_DOC_FREQ)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# --- Fit Bernoulli naive Bayes on the training features and score the test set ---
clf = BernoulliNB(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Second predict_proba column is used as the ranking score for ROC-AUC.
# NOTE(review): assumes labels are 0/1 so that column 1 is the "spam" class - confirm.
test_scores = clf.predict_proba(X_test)[:, 1]

# Compute every metric up front, then print them in a fixed order.
report_text = classification_report(y_test, y_pred, target_names=["ham", "spam"])
conf_mat = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, test_scores)

print("Classification Report:")
print(report_text)
print("Confusion Matrix:\n", conf_mat)
print("\nAdditional Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print(f"ROC-AUC Score: {test_roc_auc:.4f}")

# --- Try the fitted vectorizer and classifier on a few hand-written messages ---
samples = [
    "claim your FREE prize now!!!",
    "hey, are we still on for lunch?",
    "win win prize claim voucher",
]

# Reuse the already-fitted vocabulary; only transform, never refit, here.
X_samples = vectorizer.transform(samples)
predictions = clf.predict(X_samples)
probs = clf.predict_proba(X_samples)[:, 1]

print("\nCustom Message Predictions:")
for text, label_id, spam_prob in zip(samples, predictions, probs):
    # Label 1 is reported as spam; anything else is reported as ham.
    if label_id == 1:
        verdict = 'spam'
    else:
        verdict = 'ham'
    print(f"Message: {text}")
    print(f"Prediction: {verdict}")
    print(f"Spam Probability: {spam_prob:.4f}\n")

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       954
        spam       0.99      0.89      0.94       161

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[953   1]
 [ 17 144]]

Additional Metrics:
Accuracy: 0.9839
Precision: 0.9931
Recall: 0.8944
F1-Score: 0.9412
ROC-AUC Score: 0.9919

Custom Message Predictions:
Message: claim your FREE prize now!!!
Prediction: spam
Spam Probability: 0.9963

Message: hey, are we still on for lunch?
Prediction: ham
Spam Probability: 0.0000

Message: win win prize claim voucher
Prediction: spam
Spam Probability: 1.0000

