<a href="https://colab.research.google.com/github/IncharaG26/AIML_LAB/blob/main/Lab5_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
"""
sms_spam_tfidf_logreg.py
TF-IDF + Logistic Regression spam detector.
Prints precision/recall/F1, accuracy, and top positive/negative features.
Saves confusion matrix figure to outputs/sms/cm_sms.png.
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# === Dataset sources ===
LOCAL_PATH = "SMSSpamCollection"  # local fallback file
REMOTE_TSV = (
    "https://raw.githubusercontent.com/ozlerhakan/machine_learning_datasets/master/sms_spam.csv"
)  # backup remote dataset (CSV with same structure)


# === Load dataset ===
def load_data():
    """Load SMS Spam dataset, trying local file first, then remote."""
    if os.path.exists(LOCAL_PATH):
        print(f"Loading local dataset from {LOCAL_PATH} ...")
        df = pd.read_csv(LOCAL_PATH, sep="\t", header=None, names=["label", "message"])
        return df

    try:
        print(f"Loading remote dataset from {REMOTE_TSV} ...")
        df = pd.read_csv(REMOTE_TSV)
        # If column names differ, rename to 'label' and 'message'
        if not {"label", "message"}.issubset(df.columns):
            df.columns = ["label", "message"]
        return df
    except Exception as e:
        raise RuntimeError(
            "Could not load SMS dataset.\n"
            "Place 'SMSSpamCollection' next to this script "
            "or ensure internet is available."
        ) from e


# === Main workflow ===
def main():
    # Prepare output folder
    os.makedirs("outputs/sms", exist_ok=True)

    # Load dataset
    df = load_data()
    print(f"Loaded {len(df)} messages ({df['label'].value_counts().to_dict()})")

    # Split data into train/test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df["message"].values,
        df["label"].values,
        test_size=0.2,
        random_state=42,
        stratify=df["label"].values,
    )

    # TF-IDF vectorizer setup
    vect = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 2),
        max_df=0.95,
        min_df=2,
    )

    Xtr = vect.fit_transform(X_train)
    Xte = vect.transform(X_test)

    # Logistic Regression classifier
    clf = LogisticRegression(penalty="l2", solver="liblinear", max_iter=1000)
    clf.fit(Xtr, y_train)

    # Predict test set
    y_pred = clf.predict(Xte)

    # === Evaluation ===
    print("\n=== Classification Report (Test Set) ===")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred, labels=["ham", "spam"])
    disp = ConfusionMatrixDisplay(cm, display_labels=["ham", "spam"])
    disp.plot(cmap="Blues")
    plt.title("SMS Spam Confusion Matrix")
    plt.tight_layout()
    plt.savefig("outputs/sms/cm_sms.png", dpi=200)
    plt.close()
    print(" Saved confusion matrix to outputs/sms/cm_sms.png")

    # === Feature Analysis ===
    feature_names = np.array(vect.get_feature_names_out())

    # For binary classification, coef_ has shape (1, n_features)
    coefs = clf.coef_[0]

    # Top 20 spam- and ham-indicative features
    top_pos = np.argsort(coefs)[-20:][::-1]
    top_neg = np.argsort(coefs)[:20]

    print("\n=== Top 20 SPAM-indicative features ===")
    for f, w in zip(feature_names[top_pos], coefs[top_pos]):
        print(f"{f:30s} {w: .3f}")

    print("\n=== Top 20 HAM-indicative features ===")
    for f, w in zip(feature_names[top_neg], coefs[top_neg]):
        print(f"{f:30s} {w: .3f}")


# === Run the script ===
if __name__ == "__main__":
    main()


Loading local dataset from SMSSpamCollection ...
Loaded 5572 messages ({'ham': 4825, 'spam': 747})

=== Classification Report (Test Set) ===
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Test Accuracy: 0.9695
 Saved confusion matrix to outputs/sms/cm_sms.png

=== Top 20 SPAM-indicative features ===
txt                             4.305
mobile                          3.542
uk                              3.470
claim                           3.445
www                             3.276
reply                           3.066
stop                            3.025
free                            2.956
service                         2.832
150p                            2.718
prize                           2.561
t