<a href="https://colab.research.google.com/github/Nanditha09/CleanInboxAI/blob/main/Spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Rename columns for clarity
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

# Drop all columns with names starting with 'Unnamed:'
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Add numerical labels for 'ham' and 'spam'
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', ' ', text, flags=re.MULTILINE)
    # Remove phone numbers
    text = re.sub(r'\b\d{10}\b', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize words
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Apply preprocessing
df['clean_message'] = df['message'].apply(preprocess_text)

# Display the first few rows to verify
print(df.head())

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer to capture unigrams, bigrams, and trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Fit and transform your cleaned messages
X_tfidf = vectorizer.fit_transform(df['clean_message'])

# Save the fitted vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

import re
import numpy as np
from scipy.sparse import hstack

# Detect presence of URLs
df['has_url'] = df['message'].apply(lambda x: int(bool(re.search(r'http[s]?://|www\.', x))))

# Detect presence of email addresses
df['has_email'] = df['message'].apply(lambda x: int(bool(re.search(r'\S+@\S+', x))))

# Detect presence of phone numbers (simple pattern)
df['has_phone'] = df['message'].apply(lambda x: int(bool(re.search(r'\b\d{10,}\b', x))))

# Combine TF-IDF features with the new binary features
X_additional = np.array(df[['has_url', 'has_email', 'has_phone']])
X_combined = hstack([X_tfidf, X_additional])


from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Prepare target variable
y = df['label_num']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
model = XGBClassifier(eval_metric='logloss')

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# Save the trained model and vectorizer
joblib.dump(model, 'xgb_spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


  label                                            message  label_num  \
0   ham  Go until jurong point, crazy.. Available only ...          0   
1   ham                      Ok lar... Joking wif u oni...          0   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...          1   
3   ham  U dun say so early hor... U c already then say...          0   
4   ham  Nah I don't think he goes to usf, he lives aro...          0   

                                       clean_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4                nah think go usf life around though  
Accuracy: 0.9811659192825112
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       965
        Spam       0.97      0.89      0.93       150

    accuracy                          

['vectorizer.pkl']

In [2]:
import streamlit as st
import joblib
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Load trained model and vectorizer
model = joblib.load('xgb_spam_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'http\S+|www\S+', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'\b\d{10,}\b', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Streamlit UI
st.title("📩 Spam Detection App")
st.write("Enter a message to classify it as Spam or Not Spam.")

# Input box
message = st.text_area("Your Message", "")

# Prediction
if st.button("Predict"):
    if message.strip() == "":
        st.warning("Please enter a message.")
    else:
        processed_message = preprocess_text(message)
        data_tfidf = vectorizer.transform([processed_message])

        # Detect presence of URLs, email addresses, and phone numbers
        has_url = int(bool(re.search(r'http[s]?://|www\.', message)))
        has_email = int(bool(re.search(r'\S+@\S+', message)))
        has_phone = int(bool(re.search(r'\b\d{10,}\b', message)))

        # Combine TF-IDF features with the new binary features
        additional_features = np.array([[has_url, has_email, has_phone]])
        data_combined = hstack([data_tfidf, additional_features])

        prediction = model.predict(data_combined)[0]
        if prediction == 1:
            st.error("🚨 This is SPAM!")
        else:
            st.success("✅ This is NOT spam.")


2025-04-17 20:28:54.126 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-17 20:28:54.138 Session state does not function when running a script without `streamlit run`


In [3]:
# Save the Streamlit app code to a file
app_code = '''import streamlit as st
import joblib
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# Load trained model and vectorizer
model = joblib.load('xgb_spam_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'http\S+|www\S+', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'\b\d{10,}\b', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Streamlit UI
st.title("📩 Spam Detection App")
st.write("Enter a message to classify it as Spam or Not Spam.")

# Input box
message = st.text_area("Your Message", "")

# Prediction
if st.button("Predict"):
    if message.strip() == "":
        st.warning("Please enter a message.")
    else:
        processed_message = preprocess_text(message)
        data_tfidf = vectorizer.transform([processed_message])

        # Detect presence of URLs, email addresses, and phone numbers
        has_url = int(bool(re.search(r'http[s]?://|www\.', message)))
        has_email = int(bool(re.search(r'\S+@\S+', message)))
        has_phone = int(bool(re.search(r'\b\d{10,}\b', message)))

        # Combine TF-IDF features with the new binary features
        additional_features = np.array([[has_url, has_email, has_phone]])
        data_combined = hstack([data_tfidf, additional_features])

        prediction = model.predict(data_combined)[0]
        if prediction == 1:
            st.error("🚨 This is SPAM!")
        else:
            st.success("✅ This is NOT spam.")
'''

with open("app.py", "w") as file:
    file.write(app_code)

# Allow downloading the app
from google.colab import files
files.download("app.py")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
#pip freeze | sed 's/==.*$//' > requirements.txt
