<a href="https://colab.research.google.com/github/HillaryDrugs/li7/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================================
# STEP 0: IMPORTS
# ============================================================================
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# ============================================================================
# STEP 1: LOAD THE DATASET
# ============================================================================
# Reads the CSV, keeps the label/text columns, and derives a numeric label
# column plus the plain Python lists (`texts`, `labels`) used by later steps.
print("=" * 70)
print("STEP 1: Loading SMS Spam dataset")
print("=" * 70)


# The file is decoded as cp1252 rather than pandas' default UTF-8.
df = pd.read_csv("/content/spam.csv", encoding="cp1252")

# Keep only the two columns we use and give them descriptive names.
df = df[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

print("First 5 rows of the dataset:")
print(df.head(), "\n")

print(f"Total messages: {len(df)}")
print(f"Unique labels: {df['label'].unique()}")

# Map labels to numeric for reporting:
# ham  -> 0
# spam -> 1
label_map = {"ham": 0, "spam": 1}
df["label_num"] = df["label"].map(label_map)

# Series.map yields NaN for any value that is not a key of label_map. Fail
# here with a clear message instead of letting NaN labels reach the
# train/test split and the classifier.
unmapped_rows = df["label_num"].isna()
if unmapped_rows.any():
    unexpected = df.loc[unmapped_rows, "label"].unique().tolist()
    raise ValueError(
        f"Unexpected label value(s) {unexpected!r}; "
        f"expected one of {list(label_map)}"
    )

texts = df["text"].tolist()
labels = df["label_num"].tolist()

print("\nExample HAM message:")
print(df[df['label'] == 'ham']['text'].iloc[0])

print("\nExample SPAM message:")
print(df[df['label'] == 'spam']['text'].iloc[0])

# ============================================================================
# STEP 2: TRAIN / TEST SPLIT
# ============================================================================
print(
    "\n" + "=" * 70,
    "STEP 2: Split into train and test sets",
    "=" * 70,
    sep="\n",
)

# Hold out 20% of the messages for testing. The fixed seed makes the split
# reproducible, and stratifying on the labels preserves the ham/spam ratio.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": labels}
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, **split_options
)

n_train = len(X_train)
n_test = len(X_test)
print(f"Training set size: {n_train} messages")
print(f"Test set size:     {n_test} messages")

# Labels are 0 (ham) / 1 (spam), so summing them counts the spam messages.
spam_in_train = sum(y_train)
spam_in_test = sum(y_test)
print(f"Training spam count: {spam_in_train} / {len(y_train)}")
print(f"Test spam count:     {spam_in_test} / {len(y_test)}")

# ============================================================================
# STEP 3: BUILD THE SVM CLASSIFIER
# ============================================================================
print(
    "\n" + "=" * 70,
    "STEP 3: The SVM - Finding the best separating boundary",
    "=" * 70,
    sep="\n",
)

print("""
We are building a text classifier:
- 'ham'  = normal message
- 'spam' = unwanted ad / scam

Pipeline:
1. TF-IDF turns each SMS into numeric features
2. Linear SVM draws a boundary between ham and spam
""")

# Text -> TF-IDF features: unigrams and bigrams, English stop words removed,
# terms seen in fewer than 2 messages dropped, vocabulary capped at 5000.
tfidf_step = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
)

# Linear-kernel SVM with the default-strength regularisation (C=1.0) and a
# fixed seed.
svm_step = SVC(kernel='linear', C=1.0, random_state=42)

# The step names 'tfidf' and 'svm' are how the fitted parts are looked up
# later through pipeline.named_steps.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('svm', svm_step),
])

# ============================================================================
# STEP 4: TRAIN THE MODEL
# ============================================================================
print(
    "\n" + "=" * 70,
    "STEP 4: Training the SVM on spam vs ham",
    "=" * 70,
    sep="\n",
)

# Fitting the pipeline fits the TF-IDF vectorizer on the training texts and
# then trains the SVM on the resulting features.
print("Training in progress...")
pipeline.fit(X_train, y_train)
print("✓ Training complete!\n")

# ============================================================================
# STEP 5: EVALUATE THE MODEL
# ============================================================================
print(
    "=" * 70,
    "STEP 5: Testing - How well did it learn?",
    "=" * 70,
    sep="\n",
)

predictions = pipeline.predict(X_test)

# Element-wise comparison of predicted vs. true labels; the mean of the
# resulting boolean array is the fraction of correct predictions.
is_correct = predictions == y_test
accuracy = np.mean(is_correct)
n_correct = sum(is_correct)
print(f"\nACCURACY: {accuracy * 100:.2f}%")
print(f"(Got {n_correct} out of {len(y_test)} correct)\n")

# Rows are the true class, columns the predicted class (0 = ham, 1 = spam).
cm = confusion_matrix(y_test, predictions)
ham_row, spam_row = cm[0], cm[1]
print("CONFUSION MATRIX:")
print("                    Predicted HAM (0)    Predicted SPAM (1)")
print(f"Actually HAM (0):          {ham_row[0]:4d}                {ham_row[1]:4d}")
print(f"Actually SPAM (1):         {spam_row[0]:4d}                {spam_row[1]:4d}")
print()

# Per-class precision / recall / F1, with readable names for classes 0 and 1.
print("DETAILED CLASSIFICATION REPORT:")
report = classification_report(
    y_test,
    predictions,
    target_names=['HAM (not spam)', 'SPAM'],
)
print(report)

# ============================================================================
# STEP 6: TRY IT ON NEW MESSAGES
# ============================================================================
print("=" * 70)
print("STEP 6: Try with brand new messages")
print("=" * 70)

# Hand-written examples that are not part of the dataset.
new_messages = [
    "WIN a brand new iPhone! Reply YES to claim your prize now!!!",
    "Hey are we still meeting at 7 or should I come later?",
    "URGENT! Your account is compromised. Click this link immediately to verify.",
    "Ok I'm home, text me when you arrive.",
    "You have won $5000 cash. Call now to receive your reward."
]

# Classify every message in one batched call instead of running the whole
# vectorize-and-predict pipeline once per message inside the loop.
new_predictions = pipeline.predict(new_messages)

for msg, pred in zip(new_messages, new_predictions):
    # Class 1 is spam, class 0 is ham (see label_map in STEP 1).
    label_text = "SPAM 🚨" if pred == 1 else "HAM ✅"
    print(f"Message: {msg}")
    print(f"Prediction: {label_text}\n")

# ============================================================================
# STEP 7: INTERPRET THE MODEL (MOST SPAMMY WORDS)
# ============================================================================
print("=" * 70)
print("STEP 7: What words scream SPAM vs HAM?")
print("=" * 70)

print("""
In a linear SVM:
- Positive weight  -> pushes toward class 1 (SPAM)
- Negative weight  -> pushes toward class 0 (HAM)

We're going to inspect the learned weights.
""")

# How many features to show at each end of the ranking.
top_n = 15

# Get trained vectorizer + SVM back out of the fitted pipeline.
tfidf_vectorizer = pipeline.named_steps['tfidf']
svm_model = pipeline.named_steps['svm']

feature_names = tfidf_vectorizer.get_feature_names_out()

# svm_model.coef_ may be a sparse matrix; convert it to a flat dense row so
# the individual weights can be paired with feature names and formatted.
coef_matrix = svm_model.coef_
if hasattr(coef_matrix, "toarray"):
    coef_dense = coef_matrix.toarray()[0]
else:
    coef_dense = coef_matrix[0]

# Pair each feature (word or two-word phrase) with its learned weight.
word_weights = list(zip(feature_names, coef_dense))

# Sort by weight descending: high positive weight => more "spammy".
word_weights_sorted = sorted(word_weights, key=lambda x: x[1], reverse=True)

print(f"\nTOP {top_n} SPAM WORDS (high => more likely SPAM):")
for word, weight in word_weights_sorted[:top_n]:
    print(f"  '{word}': {float(weight):.4f}")

# The tail of the descending list holds the most negative weights. Walk it
# in reverse so the strongest HAM indicator is printed first, mirroring the
# strongest-first ordering of the SPAM list above.
print(f"\nTOP {top_n} HAM WORDS (low => more likely normal / HAM):")
for word, weight in reversed(word_weights_sorted[-top_n:]):
    print(f"  '{word}': {float(weight):.4f}")




STEP 1: Loading SMS Spam dataset
First 5 rows of the dataset:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro... 

Total messages: 5572
Unique labels: ['ham' 'spam']

Example HAM message:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

Example SPAM message:
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

STEP 2: Split into train and test sets
Training set size: 4457 messages
Test set size:     1115 messages
Training spam count: 598 / 4457
Test spam count:     149 / 1115

STEP 3: The SVM - Finding the best separating b