In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load data
try:
    data = pd.read_csv(
        "spam.csv",
        encoding="latin-1",
        usecols=[0, 1],
        names=["label", "message"],
        header=0,
    )
except FileNotFoundError:
    print("Error: 'spam.csv' not found in the current directory.")
    import sys
    sys.exit(1)

# Encode labels
data["label_encoded"] = data["label"].map({"ham": 0, "spam": 1})

print("— Data sample —")
print(data.head())
print(f"\nTotal: {len(data)}")
vc = data["label"].value_counts()
print(f"Spam: {vc.get('spam', 0)}")
print(f"Ham: {vc.get('ham', 0)}")
print("-" * 30)

# NLTK resources
for res in ["stopwords", "wordnet"]:
    try:
        nltk.data.find(f"corpora/{res}")
    except LookupError:
        nltk.download(res, quiet=True)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text: str) -> str:
    text = re.sub(r"[^\w\s]", " ", str(text))
    text = re.sub(r"\d+", " ", text).lower()
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

data["clean_message"] = data["message"].apply(preprocess_text)

print("— Example —")
print(f"Original: {data['message'].iloc[1]}")
print(f"Cleaned : {data['clean_message'].iloc[1]}")
print("-" * 30)

# Features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data["clean_message"])
y = data["label_encoded"]

print(f"X shape: {X.shape}")
print("-" * 30)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = LogisticRegression(solver="liblinear", random_state=42)
print("Training...")
model.fit(X_train, y_train)

# Eval
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n— Evaluation —")
print("Model: Logistic Regression (TF-IDF)")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))
print("-" * 30)

def predict_sms(sms_text: str):
    clean_text = preprocess_text(sms_text)
    X_new = tfidf_vectorizer.transform([clean_text])
    prediction = model.predict(X_new)[0]
    label = "SPAM" if prediction == 1 else "HAM"
    print(f"\nSMS: {sms_text!r}")
    print(f"Predicted: {label}")

# Quick checks
predict_sms("SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info")
predict_sms("Hey, want to grab lunch tomorrow? I'll be free around 1pm.")


— Data sample —
  label                                            message  label_encoded
0   ham  Go until jurong point, crazy.. Available only ...              0
1   ham                      Ok lar... Joking wif u oni...              0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...              1
3   ham  U dun say so early hor... U c already then say...              0
4   ham  Nah I don't think he goes to usf, he lives aro...              0

Total: 5572
Spam: 747
Ham: 4825
------------------------------
— Example —
Original: Ok lar... Joking wif u oni...
Cleaned : ok lar joking wif u oni
------------------------------
X shape: (5572, 5000)
------------------------------
Training...

— Evaluation —
Model: Logistic Regression (TF-IDF)
Accuracy: 0.9543

Classification Report:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.69      0.80       150

    accuracy                         