In [7]:
# Import essential libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
try:
    df = pd.read_csv('spam.csv', encoding='latin-1')  # For real dataset
    df = df[['v1', 'v2']].rename(columns={'v1':'label', 'v2':'text'})
except:
    # Fallback to test data
    test_data = {
        "text": [
            "WINNER!! Claim your free prize now!",
            "Meeting at 3pm in conference room",
            "URGENT! Your account needs verification",
            "Hi John, attached is the report",
            "Congratulations! You won a gift card"
        ],
        "label": ["spam", "ham", "spam", "ham", "spam"]
    }
    df = pd.DataFrame(test_data)

# Display first 5 rows
df.head()

Unnamed: 0,text,label
0,WINNER!! Claim your free prize now!,spam
1,Meeting at 3pm in conference room,ham
2,URGENT! Your account needs verification,spam
3,"Hi John, attached is the report",ham
4,Congratulations! You won a gift card,spam


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
import nltk

# Download stopwords (run once)
nltk.download('stopwords')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GoldH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
# Option 1: Count Vectorizer
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(df['cleaned_text'])

# Option 2: TF-IDF (often better performance)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# See vocabulary
print("Top features:", list(tfidf_vectorizer.vocabulary_.items())[:10])

Top features: [('winner', np.int64(18)), ('claim', np.int64(4)), ('free', np.int64(7)), ('prize', np.int64(13)), ('meeting', np.int64(11)), ('3pm', np.int64(0)), ('conference', np.int64(5)), ('room', np.int64(15)), ('urgent', np.int64(16)), ('account', np.int64(1))]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# First check your data shape
print(f"Total samples: {len(X)}")
print(f"Class distribution:\n{y.value_counts()}")

# If dataset is too small (e.g., <50 samples), use leave-one-out instead
if len(X) < 50:
    from sklearn.model_selection import LeaveOneOut
    loo = LeaveOneOut()
    accuracies = []
    
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = MultinomialNB()
        model.fit(X_train, y_train)
        accuracies.append(accuracy_score(y_test, model.predict(X_test)))
    
    print(f"Leave-One-Out Accuracy: {np.mean(accuracies):.2f}")
else:
    # Use stratified split for larger datasets
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=0.2, 
            random_state=42,
            stratify=y
        )
    except ValueError:
        # Fallback to random split if stratification fails
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=0.2, 
            random_state=42
        )
    
    # Train and evaluate
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

ValueError: The test_size = 1 should be greater or equal to the number of classes = 2

In [12]:
def predict_spam(email_text):
    # Preprocess
    cleaned_text = preprocess_text(email_text)
    # Vectorize
    email_vec = tfidf_vectorizer.transform([cleaned_text])
    # Predict
    prediction = model.predict(email_vec)
    proba = model.predict_proba(email_vec)
    
    print(f"Text: {email_text}")
    print(f"Prediction: {prediction[0]}")
    print(f"Probability: {max(proba[0]):.2%}")
    print("---")

# Test cases
test_emails = [
    "Free lottery win $1000 now!",
    "Hi team, meeting tomorrow at 10am",
    "URGENT: Your account has been compromised",
    "Please find attached the monthly report"
]

for email in test_emails:
    predict_spam(email)

Text: Free lottery win $1000 now!
Prediction: spam
Probability: 79.26%
---
Text: Hi team, meeting tomorrow at 10am
Prediction: spam
Probability: 64.12%
---
Text: URGENT: Your account has been compromised
Prediction: spam
Probability: 80.86%
---
Text: Please find attached the monthly report
Prediction: spam
Probability: 57.29%
---
