<a href="https://colab.research.google.com/github/Manya87/spam_detector/blob/main/ML_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================================================================
# 1. SETUP AND DATA ACQUISITION
# ==============================================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np
import warnings
warnings.filterwarnings('ignore') # Suppress minor warnings for clean output

# --- Data Download (Colab specific) ---
# Fetches the raw SMS Spam Collection dataset from a reliable source.
# The file format is Tab-Separated Values (TSV).
!wget -nc https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv
!mv sms.tsv spam_data.csv

print("‚úÖ Environment setup complete. Dataset 'spam_data.csv' ready.")

--2025-11-06 07:07:09--  https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 477907 (467K) [text/plain]
Saving to: ‚Äòsms.tsv‚Äô


2025-11-06 07:07:09 (13.1 MB/s) - ‚Äòsms.tsv‚Äô saved [477907/477907]

‚úÖ Environment setup complete. Dataset 'spam_data.csv' ready.


In [2]:
# ==============================================================================
# 2. DATA LOADING, CLEANING, AND PREPARATION
# ==============================================================================

# 1. Load the Tab-Separated Data
# The dataset is loaded without a header, specifying columns 0 and 1.
data = pd.read_csv(
    'spam_data.csv',
    sep='\t',
    header=None,
    encoding='latin-1',
    names=['Label', 'Message']
)

# 2. Check and Prepare Data
print(f"Initial Dataset Shape: {data.shape}")
print(data.head())

# 3. Label Encoding (Categorical to Numerical)
# Why: ML algorithms require numerical input for the target variable (y).
label_encoder = LabelEncoder()
data['Target'] = label_encoder.fit_transform(data['Label'])

# Display the mapping to understand the output: 0 or 1
print(f"\nLabel Mapping: {list(label_encoder.classes_)} -> {label_encoder.transform(label_encoder.classes_)}")

# 4. Define Features (X) and Target (y)
X = data['Message'] # The raw text messages
y = data['Target']   # The numerical labels (0 or 1)

Initial Dataset Shape: (5572, 2)
  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Label Mapping: ['ham', 'spam'] -> [0 1]


In [3]:
# ==============================================================================
# 3. FEATURE ENGINEERING (CountVectorizer) AND DATA SPLIT
# ==============================================================================

# 1. Split Data into Training and Testing Sets
# Why: To evaluate the model on unseen data. random_state ensures reproducible results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print(f"Data Split: Training={len(X_train)}, Testing={len(X_test)}")

# 2. Initialize CountVectorizer (Bag-of-Words)
# Why: Converts text into a matrix of word counts.
# 'stop_words' is used to remove common, non-informative words like 'a', 'the', 'is'.
vectorizer = CountVectorizer(stop_words='english')

# 3. Fit and Transform Training Data
# The .fit() step builds the vocabulary dictionary ONLY from the training data.
X_train_vectorized = vectorizer.fit_transform(X_train)

# 4. Transform Testing Data
# Use the *fitted* vocabulary from the training step to transform the test data.
X_test_vectorized = vectorizer.transform(X_test)

print("‚úÖ Feature Engineering Complete (CountVectorizer).")
print(f"Total Vocabulary Size (Features): {len(vectorizer.get_feature_names_out())}")
print(f"Vectorized Training Data Shape: {X_train_vectorized.shape}")

Data Split: Training=4457, Testing=1115
‚úÖ Feature Engineering Complete (CountVectorizer).
Total Vocabulary Size (Features): 7473
Vectorized Training Data Shape: (4457, 7473)


In [4]:
# ==============================================================================
# 4. MODEL TRAINING AND EVALUATION
# ==============================================================================

# 1. Initialize and Train the Model
# Why MultinomialNB: It's excellent for classification problems based on word counts.
spam_classifier = MultinomialNB()
spam_classifier.fit(X_train_vectorized, y_train)

print("‚úÖ Model Trained successfully using Multinomial Naive Bayes.")

# 2. Generate Predictions
y_pred = spam_classifier.predict(X_test_vectorized)

# 3. Evaluation Metrics
print("\n--- Model Performance Metrics ---")

# Overall Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

# Classification Report (Detailing Precision/Recall for each class)
target_labels = ['Ham (0)', 'Spam (1)']
report = classification_report(y_test, y_pred, target_names=target_labels)
print("\nClassification Report:\n", report)

# Confusion Matrix (Visualizing Errors)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
# Custom print for better readability in a console/colab output
print("                   Predicted Ham (0)   Predicted Spam (1)")
print(f"Actual Ham (0):      {cm[0][0]:<17}   {cm[0][1]}")
print(f"Actual Spam (1):     {cm[1][0]:<17}   {cm[1][1]}")

‚úÖ Model Trained successfully using Multinomial Naive Bayes.

--- Model Performance Metrics ---
Overall Accuracy: 98.92%

Classification Report:
               precision    recall  f1-score   support

     Ham (0)       0.99      1.00      0.99       966
    Spam (1)       0.97      0.95      0.96       149

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115


Confusion Matrix:
                   Predicted Ham (0)   Predicted Spam (1)
Actual Ham (0):      962                 4
Actual Spam (1):     8                   141


In [None]:
# ==============================================================================
# 5. DEPLOYABLE PREDICTION FUNCTION (INTERACTIVE)
# ==============================================================================

# Note: This function relies on the 'vectorizer' and 'spam_classifier' variables
# defined and trained in the previous cells.

def spam_predictor(email_message: str) -> str:
    """
    Takes a raw string email message, processes it using the trained vectorizer,
    and returns a clean 'Spam' or 'Ham' classification using the trained model.
    """
    # Step 1: Prepare the input (wrap the string in a list)
    input_list = [email_message]

    # Step 2: Transform the input using the FITTED vectorizer
    input_vectorized = vectorizer.transform(input_list)

    # Step 3: Get the numerical prediction from the trained classifier
    # [0] is used to extract the single prediction from the returned array
    numerical_prediction = spam_classifier.predict(input_vectorized)[0]

    # Step 4: Return the human-readable label
    if numerical_prediction == 0:
        return "‚û°Ô∏è NOT SPAM (Ham)"
    else:
        return "‚ùå SPAM"


if __name__ == "__main__":
    print("\n" + "="*50)
    print("      üìß INTERACTIVE SPAM DETECTION TOOL üìß")
    print("="*50)
    print("Type 'exit' or 'quit' to stop the prediction tool.")

    while True:
        # Prompt user for input
        user_input = input("\nPaste your email or message here: ")

        # Check for exit command
        if user_input.lower() in ['exit', 'quit']:
            print("\nShutting down the predictor. Goodbye!")
            break

        # Ensure the user actually entered something
        if not user_input.strip():
            print("Please enter a message to classify.")
            continue

        # Get and display the prediction
        try:
            prediction = spam_predictor(user_input)
            print(f"\nModel Classification: {prediction}")
        except Exception as e:
            print(f"\nAn error occurred during prediction: {e}")


      üìß INTERACTIVE SPAM DETECTION TOOL üìß
Type 'exit' or 'quit' to stop the prediction tool.
