In [None]:
# spam_model.py

import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import joblib 


# -------------------------------
# 1. Load and clean dataset
# -------------------------------
csv_path = "/data/spam.csv"

# Read the CSV, map its "Category"/"Message" columns onto the "label"/"text"
# names used by the rest of the script, keep only those two columns, and drop
# rows where either one is missing.
df = (
    pd.read_csv(csv_path)
    .rename(columns={"Category": "label", "Message": "text"})
    .loc[:, ["label", "text"]]
    .dropna()
)

# Normalise labels (string, lower-case, no surrounding whitespace) so that
# variants such as " Spam" match "spam"; force the message body to str.
df = df.assign(
    label=df["label"].astype(str).str.lower().str.strip(),
    text=df["text"].astype(str),
)

# Show every label value found, including any unexpected ones, before filtering.
print("Raw label counts:", df["label"].value_counts(), sep="\n")

# Keep only the two classes the models are trained on; .copy() detaches the
# filtered frame from the unfiltered one.
df = df.loc[df["label"].isin(["ham", "spam"])].copy()

print(
    "\nFiltered label counts (ham & spam only):",
    df["label"].value_counts(),
    sep="\n",
)
print(f"\nTotal rows after filter: {len(df)}")


# -------------------------------
# 2. Train-test split
# -------------------------------
# Hold out 20% of the rows for evaluation. random_state is fixed so the split
# (and therefore the reported metrics) is reproducible between runs.
label_counts = df["label"].value_counts()
min_class = label_counts.min()

# A stratified split needs at least two rows in every class; with fewer,
# train_test_split raises ValueError. In that case fall back to a plain random
# split instead of crashing. (On an empty frame min_class is NaN, the
# comparison is False, and train_test_split reports the empty input itself.)
if min_class >= 2:
    stratify_arg = df["label"]
else:
    stratify_arg = None
    print(
        "\nWarning: smallest class has fewer than 2 rows; "
        "splitting without stratification."
    )

X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=stratify_arg,
)

print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")


# -------------------------------
# 3. Define models
# -------------------------------
# Every candidate uses the same front end: a TF-IDF vectorizer with English
# stop words removed, feeding a single classifier. Only the classifier differs,
# so the pipelines are built from a (name, classifier) table. Each pipeline
# gets its own vectorizer instance, so no fitted state is shared between them.
models = {
    model_name: Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", classifier),
    ])
    for model_name, classifier in (
        ("logistic_regression", LogisticRegression(max_iter=1000)),
        ("naive_bayes", MultinomialNB()),
        ("linear_svm", LinearSVC()),
    )
}


# -------------------------------
# 4. Train, evaluate, save
# -------------------------------
# For each pipeline: fit on the training split, print hold-out metrics, and
# persist the fitted pipeline under ./models.
os.makedirs("models", exist_ok=True)

for name, model in models.items():
    print(f"\n===== Training {name} =====")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Macro averaging takes the unweighted mean of the per-class scores, so the
    # minority class counts as much as the majority class. zero_division=0
    # reports 0 (without a warning) if a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="macro", zero_division=0
    )

    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1-score : {f1:.4f}")

    # NOTE(review): the file is written by joblib.dump, so it is a joblib
    # pickle and not HDF5 despite the ".h5" suffix; load it with joblib.load.
    # The filename is kept unchanged so existing loaders keep working.
    save_path = os.path.join("models", f"{name}_spam.h5")
    joblib.dump(model, save_path)
    print(f"✅ Saved model to: {save_path}")

print("\n🎉 All models trained and saved.")


Raw label counts:
label
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64

Filtered label counts (ham & spam only):
label
ham     4825
spam     747
Name: count, dtype: int64

Total rows after filter: 5572

Train size: 4457, Test size: 1115

===== Training logistic_regression =====
Accuracy : 0.9668
Precision: 0.9816
Recall   : 0.8758
F1-score : 0.9197
✅ Saved model to: models\logistic_regression_spam.h5

===== Training naive_bayes =====
Accuracy : 0.9695
Precision: 0.9830
Recall   : 0.8859
F1-score : 0.9270
✅ Saved model to: models\naive_bayes_spam.h5

===== Training linear_svm =====
Accuracy : 0.9821
Precision: 0.9833
Recall   : 0.9386
F1-score : 0.9594
✅ Saved model to: models\linear_svm_spam.h5

🎉 All models trained and saved.
