<a href="https://colab.research.google.com/github/Neeharika226/SpamShield/blob/main/Email_spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Read the raw CSV, keep only the label (v1) and message text (v2) columns,
# and give them descriptive names in one chained expression.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Preprocessing function
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Default English stop-word set; populated on first use by preprocess_text().
_DEFAULT_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """Normalise a raw message before it is handed to the vectorizer.

    Steps, in order: delete ASCII punctuation, lowercase, drop stop words,
    then re-join the remaining whitespace-separated tokens with single spaces.

    Args:
        text: The raw message. ``None`` is treated as an empty message.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached instead of being re-read for every message.

    Returns:
        The cleaned message as a single string (possibly empty).
    """
    global _DEFAULT_STOP_WORDS
    if text is None:
        return ''
    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _DEFAULT_STOP_WORDS
    # NOTE(review): punctuation is stripped *before* stop-word filtering, so a
    # stop-word entry containing an apostrophe can no longer match (e.g.
    # "don't" becomes "dont"). The order is kept as-is because the text seen at
    # prediction time must be processed exactly like the training text.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every message before it is vectorized
df['processed_text'] = df['text'].apply(preprocess_text)

# Hold out 20% of the messages for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training split only, then reuse it for the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the Naive Bayes classifier (fit() hands back the fitted estimator)
model = MultinomialNB().fit(X_train_vec, y_train)

# Report performance on the held-out messages
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Persist both fitted objects so they can be reloaded later for serving
for artifact, path in ((model, 'spam_model.pkl'), (vectorizer, 'vectorizer.pkl')):
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9695067264573991

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [3]:
from flask import Flask, request, render_template, jsonify
import pickle
import os

def _load_pickle(path):
    """Read and return the pickled object stored at ``path``."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Restore the trained classifier and the fitted vectorizer from disk
model = _load_pickle('spam_model.pkl')
vectorizer = _load_pickle('vectorizer.pkl')

# Preprocessing function (same as before)
# Imported here so this cell does not depend on names left behind by an
# earlier cell; the original body used `string` and `stopwords` without
# importing them in this cell.
import string

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every request.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Default English stop-word set; populated on first use by preprocess_text().
_DEFAULT_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """Normalise a raw message before it is handed to the vectorizer.

    Steps, in order: delete ASCII punctuation, lowercase, drop stop words,
    then re-join the remaining whitespace-separated tokens with single spaces.
    For string input the result is the same as the original implementation,
    so it stays compatible with the already-fitted vectorizer.

    Args:
        text: The raw message. ``None`` is treated as an empty message.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached instead of being re-read for every request.

    Returns:
        The cleaned message as a single string (possibly empty).
    """
    global _DEFAULT_STOP_WORDS
    if text is None:
        return ''
    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            # Local import keeps the cell self-contained.
            # NOTE(review): assumes the NLTK 'stopwords' corpus has already
            # been downloaded in this environment.
            from nltk.corpus import stopwords
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _DEFAULT_STOP_WORDS
    # Punctuation is stripped before stop-word filtering, exactly as in the
    # original, so prediction-time text is processed like the training text.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Create the Flask application object that the route decorators below register on
app = Flask(__name__)

@app.route('/')
def home():
    """Serve the single-page UI for the spam detector.

    Returns a self-contained HTML document (inline CSS and JavaScript) with a
    textarea and a button. The inline script refuses to submit an empty
    message, otherwise POSTs ``{"message": ...}`` as JSON to ``/predict`` and
    renders the ``result``, ``confidence`` and ``is_spam`` fields of the JSON
    response into the result box; any fetch/parse failure is shown as an alert.
    """
    return """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Email Spam Detector</title>
        <style>
            body { font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; padding: 20px; }
            h1 { color: #333; text-align: center; }
            .container { background-color: #f9f9f9; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
            textarea { width: 100%; height: 150px; padding: 10px; margin-bottom: 10px; border: 1px solid #ddd; border-radius: 4px; }
            button { background-color: #4CAF50; color: white; padding: 10px 15px; border: none; border-radius: 4px; cursor: pointer; }
            button:hover { background-color: #45a049; }
            .result { margin-top: 20px; padding: 10px; border-radius: 4px; }
            .spam { background-color: #ffdddd; color: #d8000c; }
            .ham { background-color: #ddffdd; color: #4F8A10; }
        </style>
    </head>
    <body>
        <h1>Email Spam Detection</h1>
        <div class="container">
            <p>Enter an email message to check if it's spam or ham (legitimate):</p>
            <textarea id="message" placeholder="Paste your email here..."></textarea>
            <button onclick="checkSpam()">Check for Spam</button>
            <div id="result" class="result" style="display: none;"></div>
        </div>

        <script>
            function checkSpam() {
                const message = document.getElementById('message').value;
                if (!message) {
                    alert('Please enter a message to check');
                    return;
                }

                fetch('/predict', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({ message: message })
                })
                .then(response => response.json())
                .then(data => {
                    const resultDiv = document.getElementById('result');
                    resultDiv.style.display = 'block';
                    resultDiv.className = 'result ' + (data.is_spam ? 'spam' : 'ham');
                    resultDiv.innerHTML = `<strong>Result:</strong> This message is <strong>${data.result}</strong> (${data.confidence}% confidence)`;
                })
                .catch(error => {
                    console.error('Error:', error);
                    alert('An error occurred while checking the message');
                });
            }
        </script>
    </body>
    </html>
    """

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the message in the JSON request body as spam or ham.

    Request body: a JSON object with a string ``message`` field.

    Returns:
        On success, a JSON object with:
            is_spam (bool): whether the predicted label is ``'spam'``.
            result (str): ``'SPAM'`` or ``'HAM (not spam)'``.
            confidence (float): highest class probability, as a percentage
                rounded to two decimals.
            original_message (str): the message exactly as received.
        If the body is not JSON, is not an object, or has no string
        ``message`` field, a JSON object ``{"error": ...}`` with HTTP 400
        (previously these cases raised and produced an HTTP 500).
    """
    # silent=True returns None instead of raising when the body is missing,
    # has the wrong content type, or is not valid JSON.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('message'), str):
        return jsonify({'error': 'Request body must be JSON with a string "message" field'}), 400
    message = data['message']

    # Apply the same cleaning that was used on the training text
    processed_message = preprocess_text(message)

    # The vectorizer expects an iterable of documents, hence the one-item list
    message_vec = vectorizer.transform([processed_message])

    # Predicted label and per-class probabilities for the single message
    prediction = model.predict(message_vec)[0]
    proba = model.predict_proba(message_vec)[0]

    # Coerce to plain Python bool/float so the JSON encoder never has to deal
    # with NumPy scalar types.
    is_spam = bool(prediction == 'spam')
    confidence = float(round(max(proba) * 100, 2))

    result = {
        'is_spam': is_spam,
        'result': 'SPAM' if is_spam else 'HAM (not spam)',
        'confidence': confidence,
        'original_message': message
    }

    return jsonify(result)

# Run the app in Colab
from google.colab.output import eval_js

# Single source of truth for the port, so the proxied URL printed below and
# the port Flask actually listens on cannot drift apart.
PORT = 5000

print("The web app is running at:")
# Ask the Colab front end for the public proxy URL that forwards to PORT
print(eval_js(f"google.colab.kernel.proxyPort({PORT})"))
# Listen on all interfaces; this call blocks the cell until the server is stopped
app.run(host='0.0.0.0', port=PORT)

The web app is running at:
https://5000-m-s-we3iy3bx20ei-b.us-east1-0.prod.colab.dev
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [31/Mar/2025 15:45:11] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [31/Mar/2025 15:45:12] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [31/Mar/2025 15:45:24] "POST /predict HTTP/1.1" 200 -


In [5]:
from google.colab import files

# For individual files
files.download('spam_model.pkl')
files.download('vectorizer.pkl')

# Or zip all files
!zip -r project_files.zip .
files.download('project_files.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: .config/ (stored 0%)
  adding: .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: .config/config_sentinel (stored 0%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2025.03.24/ (stored 0%)
  adding: .config/logs/2025.03.24/13.33.57.621594.log (deflated 86%)
  adding: .config/logs/2025.03.24/13.34.10.823237.log (deflated 57%)
  adding: .config/logs/2025.03.24/13.33.20.101499.log (deflated 92%)
  adding: .config/logs/2025.03.24/13.33.49.486436.log (deflated 58%)
  adding: .config/logs/2025.03.24/13.34.11.455236.log (deflated 56%)
  adding: .config/logs/2025.03.24/13.34.02.970189.log (deflated 58%)
  adding: .config/active_config (stored 0%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .conf

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>