<a href="https://colab.research.google.com/github/Mandira124/Machine-Learning-Beginner-Projects/blob/main/Spam_emails_classifier_using_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# Tunable settings, gathered in one place so a reader can adjust them
# without hunting through the calls below.
TEST_SIZE = 0.2       # fraction of rows held out for evaluation
RANDOM_STATE = 42     # seed that makes the train/test split reproducible
MAX_FEATURES = 5000   # upper bound on the TF-IDF vocabulary size
MAX_ITER = 1000       # iteration cap passed to LogisticRegression

# 1. Load dataset from Hugging Face
dataset = load_dataset("SetFit/enron_spam")

# Convert the "train" split to a pandas DataFrame
df = pd.DataFrame(dataset["train"])
print("Dataset preview:")
print(df.head())

# Check column names
print("Columns:", df.columns)

# 2. Prepare training and testing sets
# Features are the raw email text; the target is the integer "label" column.
X = df["text"]
y = df["label"]

# stratify=y keeps the class proportions the same in the train and test
# subsets, so the reported metrics are not skewed by an uneven random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 3. Vectorize text using TF-IDF
# The vectorizer is fitted on the training text only; the test text is merely
# transformed with the already-fitted vocabulary, so nothing from the test set
# influences the features.
tfidf = TfidfVectorizer(stop_words="english", max_features=MAX_FEATURES)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 4. Train Logistic Regression model
model = LogisticRegression(max_iter=MAX_ITER)
model.fit(X_train_tfidf, y_train)

# 5. Evaluate model performance on the held-out test subset
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# 6. Save model and vectorizer for later use
# Both artifacts are needed at inference time: new text must be transformed
# with this exact fitted vectorizer before it is passed to the model.
joblib.dump(model, "spam_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved successfully.")

# 7. Function to predict whether a message is spam
def predict_spam(message, threshold=0.5):
    """
    Predicts if an email is spam.

    Uses the module-level ``tfidf`` vectorizer and ``model`` classifier.

    Args:
        message: Text of the email to classify.
        threshold: Minimum spam probability (inclusive) at which the message
            is labelled 'SPAM'. Must lie in [0, 1]. Defaults to 0.5.

    Returns:
        A tuple ``(label, probability)`` where ``label`` is 'SPAM' or
        'NOT SPAM' and ``probability`` is the predicted probability of the
        spam class as a plain Python float.

    Raises:
        ValueError: If ``threshold`` is outside [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    # The vectorizer expects an iterable of documents, hence the one-item list.
    message_vec = tfidf.transform([message])

    # Locate the predict_proba column for label 1 (spam) through classes_
    # instead of assuming it is always the second column.
    spam_column = list(model.classes_).index(1)

    # Convert to a plain float so the returned tuple prints as ('SPAM', 0.93);
    # NumPy scalars are shown as np.float64(...) inside containers on NumPy >= 2.
    probability = float(model.predict_proba(message_vec)[0][spam_column])

    label = "SPAM" if probability >= threshold else "NOT SPAM"
    return label, probability

# Example usage: classify one spam-like and one ordinary message and print
# the (label, probability) tuple returned for each.
for sample_message in (
    "Congratulations! You've won a free ticket!",
    "Hey John, the meeting is at 2pm tomorrow.",
):
    print(predict_spam(sample_message))


Repo card metadata block was not found. Setting CardData to empty.


Dataset preview:
   message_id                                               text  label  \
0       33214  any software just for 15 $ - 99 $ understandin...      1   
1       11929  perspective on ferc regulatory action client c...      0   
2       19784  wanted to try ci 4 lis but thought it was way ...      1   
3        2209  enron / hpl actuals for december 11 , 2000 tec...      0   
4       15880  looking for cheap high - quality software ? ro...      1   

  label_text                                            subject  \
0       spam                  any software just for 15 $ - 99 $   
1        ham  perspective on ferc regulatory action client c...   
2       spam  wanted to try ci 4 lis but thought it was way ...   
3        ham         enron / hpl actuals for december 11 , 2000   
4       spam  looking for cheap high - quality software ? ro...   

                                             message       date  
0  understanding oem software\nlead me not into t... 2005-06-18