# In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load dataset
# Any labeled spam dataset works here (e.g. the SpamAssassin public corpus);
# the rest of the script reads the 'text' and 'label' columns from it.
url = 'https://raw.githubusercontent.com/epfml/ML-Projects/master/spam_ham_dataset.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows as a quick sanity check of what was loaded
print("Dataset Sample:", df.head(), sep="\n")

# Data Preprocessing
def preprocess_text(text: str) -> str:
    """Normalise raw message text before vectorisation.

    The input is lowercased, every character in ``string.punctuation`` is
    replaced by a space, runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: The raw message.

    Returns:
        The cleaned, lowercased message.
    """
    text = text.lower()  # Convert to lowercase
    # string.punctuation contains "\", "]", "^" and "-", which are special
    # inside a character class. Without re.escape the "\]" pair is parsed as
    # an escaped "]" and backslashes are silently left in the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Clean every raw message and store the result alongside the original text
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Targets: encode the string labels as 0 (ham) / 1 (spam).
# Inputs: the cleaned message text.
y = df['label'].map(dict(ham=0, spam=1))
X = df['cleaned_text']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training split only and
# then reused, unchanged, to transform the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model Training
# Three candidate classifiers, keyed by display name; each one is fitted and
# scored on the same TF-IDF features below.
models = dict([
    ("Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC(kernel='linear')),
])

# Fit each classifier on the training split and report its test-split scores
for model_name, model in models.items():
    # scikit-learn estimators return themselves from fit(), so the call chains
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    # Evaluation metrics, computed in the order they are reported
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(
        f"\n{model_name} Results:",
        f"Accuracy: {accuracy:.2f}",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}",
        sep="\n",
    )
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Function to predict if an email is spam or ham
def predict_spam(email, model_name="Naive Bayes"):
    """Classify a single email message as spam or ham.

    The message is cleaned with ``preprocess_text``, converted to TF-IDF
    features by the module-level ``vectorizer``, and scored by the selected
    classifier from the module-level ``models`` dict.

    Args:
        email: Raw message text.
        model_name: Key of the classifier in ``models`` to use. Defaults to
            ``"Naive Bayes"``, the classifier this function always used
            before the parameter existed.

    Returns:
        ``"Spam"`` if the predicted label is 1, otherwise ``"Ham"``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``models``.
    """
    try:
        classifier = models[model_name]
    except KeyError:
        available = ", ".join(models)
        raise KeyError(
            f"Unknown model {model_name!r}; available models: {available}"
        ) from None

    cleaned_email = preprocess_text(email)
    # transform() expects an iterable of documents, hence the one-item list
    email_tfidf = vectorizer.transform([cleaned_email])
    prediction = classifier.predict(email_tfidf)
    return "Spam" if prediction[0] == 1 else "Ham"

# Command-line interface for user input
def main():
    """Run an interactive prompt that classifies messages until the user quits.

    Each non-blank line entered is passed to ``predict_spam`` and the verdict
    is printed. The loop ends when the user types ``exit`` (case-insensitive,
    surrounding whitespace ignored), when standard input reaches end-of-file,
    or when the user interrupts the prompt with Ctrl-C.
    """
    while True:
        try:
            user_input = input("\nEnter an email message (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed (e.g. piped input ran out) or the user pressed
            # Ctrl-C: leave the loop quietly instead of showing a traceback.
            print()
            break

        command = user_input.strip()
        if command.lower() == 'exit':
            break
        if not command:
            # Nothing to classify on a blank line; just prompt again.
            continue
        print(f"Prediction: {predict_spam(user_input)}")

# Run the command-line interface
if __name__ == "__main__":
    main()