<a href="https://colab.research.google.com/github/Mohint/Generative-AI/blob/main/Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio
import gradio as gr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import pickle
import os

# Ensure the NLTK stopword corpus is available; download it only if the
# lookup fails.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Build the stopword set once. Calling stopwords.words('english') for every
# token produces the full word list each time and then scans it linearly;
# a frozenset is built a single time and gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches every character that is neither a lowercase ASCII letter nor
# whitespace (applied after lowercasing).
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """
    Clean a message for vectorization.

    The text is lowercased, every character other than a-z and whitespace is
    deleted (so "don't" becomes "dont" and digits/punctuation vanish), the
    result is split on whitespace, and English stopwords are dropped.

    Args:
        text: The raw message string.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    cleaned = _NON_ALPHA_RE.sub('', text.lower())
    return ' '.join(
        word for word in cleaned.split() if word not in _ENGLISH_STOPWORDS
    )

# --- 1. Define the Dataset Directly in Code ---
# Small hand-written demo corpus, kept as (label, message) pairs so that each
# message sits right next to its label.
_LABELED_MESSAGES = [
    ('ham', "Hey, how are you doing today?"),
    ('spam', "WINNER! You've won a £1000 cash prize! Text CLAIM to 800-888 to claim. T&Cs apply."),
    ('ham', "Just got home, cooking dinner now."),
    ('spam', "URGENT! Your account has been suspended. Click this link to reactivate now: http://malicious.link"),
    ('ham', "Can we meet up tomorrow?"),
    ('ham', "Don't forget to buy milk."),
    ('spam', "Free entry to our exclusive competition for a chance to win a new car. Reply YES to 87878."),
    ('ham', "Sounds good! See you then."),
    ('spam', "Congratulations! You've been selected for a free holiday to Hawaii. Call 09061701549."),
    ('ham', "I'm at the library, studying."),
    ('ham', "Did you get my last text?"),
    ('spam', "Cash a prize from our company! Send your bank details for transfer now! Click: http://fakelinks.com"),
    ('ham', "Okay, no problem."),
    ('ham', "What time is the movie?"),
    ('ham', "Running a bit late, be there in 10."),
    ('spam', "Limited time offer! Get 50% off all products. Visit our store at www.scamshop.net."),
    ('ham', "Happy birthday!"),
    ('ham', "Let's grab coffee soon."),
    ('spam', "You have 1 new voicemail. Call +123456789 to retrieve. Cost £1.50/min."),
    ('ham', "Finished work early today."),
]

# Column-oriented view of the pairs above, preserving their order.
data = {
    'label': [label for label, _ in _LABELED_MESSAGES],
    'message': [message for _, message in _LABELED_MESSAGES],
}
df = pd.DataFrame(data)

print("--- Starting Model Training and Saving Process with In-Code Data ---")

# Clean every message and encode the labels numerically (ham -> 0, spam -> 1).
df['message'] = df['message'].apply(preprocess_text)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training messages only, then reuse it
# (without refitting) for the held-out messages.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Report accuracy on the held-out rows so the split above is actually used.
# With only a handful of test messages this is a sanity check, not a benchmark.
y_pred = model.predict(X_test_vectorized)
print(f"Model accuracy on test set: {accuracy_score(y_test, y_pred):.2f}")

# Persist the trained model and the fitted vectorizer; the inference code
# further down loads both files.
with open('spam_model_in_code.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('tfidf_vectorizer_in_code.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model (spam_model_in_code.pkl) and Vectorizer (tfidf_vectorizer_in_code.pkl) saved successfully!")
print("These files are now ready for the Gradio app.")

# --- 2. Load Predefined Model and Vectorizer (for the Gradio App) ---
# Read back the two pickle files written by the training section so that
# inference uses exactly the persisted artifacts.
#
# NOTE: pickle.load executes code embedded in the file. That is acceptable
# here only because both files are produced by this same script; never point
# these paths at files from an untrusted source.

try:
    with open('spam_model_in_code.pkl', 'rb') as f:
        loaded_model = pickle.load(f)
    with open('tfidf_vectorizer_in_code.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)
    print("Predefined model and vectorizer loaded successfully for inference!")
except FileNotFoundError:
    print("Error: Model or vectorizer files not found.")
    print("Ensure the training section above ran successfully and created 'spam_model_in_code.pkl' and 'tfidf_vectorizer_in_code.pkl'.")
    # Stop with a non-zero status. The exit() helper is injected by the
    # `site` module for interactive use, is not guaranteed to exist, and
    # would report success (status 0) for this failure.
    raise SystemExit(1)

def predict_spam(message):
    """
    Classify a single message as spam or ham with the loaded model.

    Args:
        message: Raw message text. None, an empty string, or a string made
            only of whitespace is treated as "no input".

    Returns:
        A prompt asking for input when no message was supplied; otherwise
        "SPAM! 🚨" when the model predicts class 1 and "Not Spam (HAM) ✅"
        for any other prediction.
    """
    # Whitespace-only input contains no text to classify; without this check
    # it would be vectorized as an empty document and still receive a label.
    if not message or not message.strip():
        return "Please enter a message to analyze."

    cleaned = preprocess_text(message)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = loaded_vectorizer.transform([cleaned])
    prediction = loaded_model.predict(features)[0]

    if prediction == 1:
        return "SPAM! 🚨"
    return "Not Spam (HAM) ✅"

# Build the Gradio UI: a multi-line text box feeds predict_spam and the
# returned string is shown in a label component.
message_box = gr.Textbox(
    lines=5,
    label="Enter Message Here",
    placeholder="Type a message to check if it's spam...",
)
prediction_label = gr.Label(label="Prediction")

iface = gr.Interface(
    fn=predict_spam,
    inputs=message_box,
    outputs=prediction_label,
    title="SMS Spam Detector (In-Code Data)",
    description="This app uses a machine learning model trained on a small dataset embedded directly in the code.",
)

# Start the app only when this file is executed directly.
# share=True requests a public URL, which is what Google Colab needs;
# for local execution share=False (or omitting it) is enough.
if __name__ == "__main__":
    print("\n--- Launching Gradio App ---")
    iface.launch(share=True)

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


--- Starting Model Training and Saving Process with In-Code Data ---
Model (spam_model_in_code.pkl) and Vectorizer (tfidf_vectorizer_in_code.pkl) saved successfully!
These files are now ready for the Gradio app.
Predefined model and vectorizer loaded successfully for inference!

--- Launching Gradio App ---
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b642face9d583ecfd3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
