# SMS Spam Detection Using SVM.

In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('/content/spam.csv')
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
encoder = LabelEncoder()
df['Label'] = encoder.fit_transform(df['Label'])

In [4]:
df

Unnamed: 0,Label,EmailText
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ã_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [5]:
# removing all the duplicate values and keeping only the first
df = df.drop_duplicates(keep='first')

In [6]:
df.shape

(5169, 2)

In [7]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [8]:
def get_importantFeatures(sent):
    sent = sent.lower()

    returnList = []
    sent = nltk.word_tokenize(sent)
    for i in sent:
        if i.isalnum():
            returnList.append(i)
    return returnList

def removing_stopWords(sent):
    returnList = []
    # Explicitly load stopwords within the function
    stop_words = set(nltk.corpus.stopwords.words('english'))
    for i in sent:
        if i not in stop_words and i not in string.punctuation:
            returnList.append(i)
    return returnList

def potter_stem(sent):
    returnList = []
    for i in sent:
        returnList.append(ps.stem(i))
    return " ".join(returnList)

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Added this line to download the necessary resource

df.loc[:, 'imp_feature'] = df['EmailText'].apply(get_importantFeatures)
df.loc[:, 'imp_feature'] = df['imp_feature'].apply(removing_stopWords)
df.loc[:, 'imp_feature'] = df['imp_feature'].apply(potter_stem)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'imp_feature'] = df['EmailText'].apply(get_importantFeatures)


In [11]:
df

Unnamed: 0,Label,EmailText,imp_feature
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u pound prize 2 claim e...
5568,0,Will Ã_ b going to esplanade fr home?,b go esplanad fr home
5569,0,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5570,0,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


In [12]:
from sklearn.model_selection import train_test_split
X = df['imp_feature']
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf = TfidfVectorizer()
feature = tfidf.fit_transform(X_train)

tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4], 'C':[1,10,100,1000]}

model = GridSearchCV(svm.SVC(),tuned_parameters)
model.fit(feature, y_train)

In [16]:
y_predict = tfidf.transform(X_test)
print("Accuracy:",model.score(y_predict,y_test))

Accuracy: 0.9814385150812065


In [17]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [18]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
import pickle

# Load the saved model
spam_model = pickle.load(open("finalized_model.sav",'rb'))

# Assuming tfidf is already defined from previous cells
# If not, you would need to load or recreate the TfidfVectorizer here
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# You might need to fit it on your training data again if it's not in the kernel

def check_spam_colab():
    text = input("Enter your text: ")
    # Preprocess the input text using the same functions as before
    processed_text = potter_stem(removing_stopWords(get_importantFeatures(text)))
    is_spam = spam_model.predict(tfidf.transform([processed_text]))

    if is_spam == 1:
        print("🛡️ Threat detected:Text is spam_ This is likely a spam message  .")
    else:
         print("🛡️ Secure: Text is not spam_This message passed the spam filter.")

# You can call the function to test it:
check_spam_colab()

Enter your text: hello
🛡️ Secure: Text is not spam_This message passed the spam filter.


In [20]:
import pickle
import joblib

# Save the model using pickle as it was done before
pickle.dump(model, open('finalized_model.sav', 'wb'))

# Save the tfidf vectorizer using pickle
pickle.dump(tfidf, open('tfidf_vectorizer.pkl', 'wb'))

In [21]:
import pickle
pickle.dump(model, open("finalized_model.sav", "wb"))
pickle.dump(tfidf, open("tfidf_vectorizer.pkl", "wb"))


In [22]:
# Install Gradio and nltk stopwords
!pip install gradio --quiet

import gradio as gr
import pickle
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Load your saved model and vectorizer (assumes you saved them earlier)
model = pickle.load(open("finalized_model.sav", "rb"))
tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))


In [24]:
# Text cleaning, stopword removal, and stemming
def get_importantFeatures(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def removing_stopWords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def potter_stem(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])


In [26]:
!pip install gradio --quiet

import pandas as pd
import gradio as gr
import string
import re
import pickle
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
ps = PorterStemmer()

def get_importantFeatures(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def removing_stopWords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def potter_stem(text):
    return ' '.join([ps.stem(word) for word in text.split()])

def preprocess(text):
    return potter_stem(removing_stopWords(get_importantFeatures(text)))


In [32]:
df = pd.read_csv("/content/spam.csv", encoding='latin-1')[['Label', 'EmailText']]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

df['processed_text'] = df['text'].apply(preprocess)

X = df['processed_text']
y = df['label']

tfidf = TfidfVectorizer()
X_vec = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

model = LinearSVC()
model.fit(X_train, y_train)

# Save model and vectorizer
pickle.dump(model, open("finalized_model.sav", "wb"))
pickle.dump(tfidf, open("tfidf_vectorizer.pkl", "wb"))

In [33]:
# Load model and vectorizer
model = pickle.load(open("finalized_model.sav", "rb"))
tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))

def classify_sms(message):
    # Preprocess input message using the original functions
    processed = potter_stem(removing_stopWords(get_importantFeatures(message)))
    vec_input = tfidf.transform([processed])
    prediction = model.predict(vec_input)[0]

    return "🚫 Spam" if prediction == 1 else "✅ Not Spam"

# Gradio UI
interface = gr.Interface(
    fn=classify_sms,
    inputs=gr.Textbox(lines=4, placeholder="Enter SMS or Email message here..."),
    outputs="text",
    title="📨 SMS Spam Detection",
    description="Enter any message to check if it's spam or not using your trained SVM model."
)

interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://869bb03536eda50854.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [38]:
import pickle
import gradio as gr
import re
import string
import nltk
# No need to download stopwords here if already downloaded earlier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Assuming get_importantFeatures, removing_stopWords, and potter_stem
# are defined in an earlier cell and are correct.
# If not, they should be defined here or in an imported module.

# Load model and vectorizer inside the function for testing purposes
# This is inefficient but helps diagnose if the loaded objects are the issue
# You might want to move this outside the function for performance later
try:
    model = pickle.load(open("finalized_model.sav", "rb"))
    tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))
except FileNotFoundError:
    print("Model or vectorizer file not found. Please ensure 'finalized_model.sav' and 'tfidf_vectorizer.pkl' exist.")
    model = None
    tfidf = None


def classify_sms(message):
    if model is None or tfidf is None:
        return "Error: Model or vectorizer not loaded."

    # Preprocess input message using the original functions
    processed = potter_stem(removing_stopWords(get_importantFeatures(message)))

    # Add print statements to debug in Colab console
    print(f"Input Message: {message[:50]}...") # Print first 50 chars
    print(f"Preprocessed Message: {processed[:50]}...") # Print first 50 chars


    vector_input = tfidf.transform([processed])

    prediction = model.predict(vector_input)[0]

    print(f"Model Prediction: {prediction}") # Print raw prediction


    # Show prediction
    return "🚫 Spam" if prediction == 1 else "✅ Not Spam" # Assuming 1 is spam and 0 is not spam based on LabelEncoder

# Create the GUI
interface = gr.Interface(
    fn=classify_sms,
    inputs=gr.Textbox(lines=4, placeholder="Enter SMS or Email message here..."),
    outputs="text",
    title="📨 SMS Spam Detection",
    description="Enter any message to check if it's spam or not using your SVM model."
)

# Launch the app with public link
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d6a7ac73a1a4726eb4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [36]:
# User provided spam text
user_spam_text = """Congratulations! You've been selected for a high-paying work-from-home job at Amazon India. Earn ₹10,000/day easily!

Pay ₹599 now to confirm your slot. Offer valid for 24 hours only!

Click to apply: http://amzn-work-now.in
Contact HR via WhatsApp: +91-98765XXXXX

Regards,
Sneha HR Team
"""

# Preprocess the text using the original functions
processed_user_spam_text = potter_stem(removing_stopWords(get_importantFeatures(user_spam_text)))
print("Original Text:")
print(user_spam_text)
print("\nPreprocessed Text:")
print(processed_user_spam_text)

# Transform the preprocessed text using the loaded tfidf vectorizer
# Assuming tfidf is loaded from 'tfidf_vectorizer.pkl' in a previous cell
# If not, make sure to load it first:
# import pickle
# tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))
vectorized_user_spam_text = tfidf.transform([processed_user_spam_text])
print("\nVectorized Text Shape:", vectorized_user_spam_text.shape)

# Make a prediction using the loaded model
# Assuming model is loaded from 'finalized_model.sav' in a previous cell
# If not, make sure to load it first:
# import pickle
# model = pickle.load(open("finalized_model.sav", "rb"))
prediction = model.predict(vectorized_user_spam_text)[0]
print("\nModel Prediction (0=Not Spam, 1=Spam):", prediction)

# Interpret the prediction
result = "🚫 Spam" if prediction == 1 else "✅ Not Spam"
print("\nClassification Result:", result)

Original Text:
Congratulations! You've been selected for a high-paying work-from-home job at Amazon India. Earn ₹10,000/day easily!

Pay ₹599 now to confirm your slot. Offer valid for 24 hours only!

Click to apply: http://amzn-work-now.in
Contact HR via WhatsApp: +91-98765XXXXX

Regards,
Sneha HR Team


Preprocessed Text:
congratul youv select highpay workfromhom job amazon india earn ₹day easili pay ₹ confirm slot offer valid hour click appli contact hr via whatsapp xxxxx regard sneha hr team

Vectorized Text Shape: (1, 7038)

Model Prediction (0=Not Spam, 1=Spam): 0

Classification Result: ✅ Not Spam


In [42]:
import pickle
import gradio as gr
import re
import string
import nltk
# Assuming these are already downloaded based on previous cells
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Assuming get_importantFeatures, removing_stopWords, and potter_stem
# are defined in an earlier cell (like cell_id: d66a2f89)

# Load model and vectorizer
try:
    model = pickle.load(open("finalized_model.sav", "rb"))
    tfidf = pickle.load(open("tfidf_vectorizer.pkl", "rb"))
    print("Model and vectorizer loaded successfully.")
except FileNotFoundError:
    print("Error: Model or vectorizer file not found. Please ensure 'finalized_model.sav' and 'tfidf_vectorizer.pkl' exist.")
    model = None
    tfidf = None
except Exception as e:
    print(f"An error occurred while loading model/vectorizer: {e}")
    model = None
    tfidf = None


def classify_sms(message):
    if model is None or tfidf is None:
        return "Error: Model or vectorizer not loaded. Please run the training cell first."

    try:
        # Preprocess input message using the original functions
        # Assuming get_importantFeatures, removing_stopWords, potter_stem are accessible
        processed = potter_stem(removing_stopWords(get_importantFeatures(message)))

        # Add print statements for debugging
        print(f"Input Message (first 50 chars): {message[:50]}...")
        print(f"Preprocessed Message (first 50 chars): {processed[:50]}...")


        vector_input = tfidf.transform([processed])
        prediction = model.predict(vector_input)[0]

        # Add print statement for debugging the prediction
        print(f"Model Prediction: {prediction}")

        # Return classification result
        return "🚫 Spam" if prediction == 1 else "✅ Not Spam" # Assuming 1 is spam and 0 is not spam
    except Exception as e:
        return f"An error occurred during classification: {e}"


# Create the GUI
if model is not None and tfidf is not None:
    interface = gr.Interface(
        fn=classify_sms,
        inputs=gr.Textbox(lines=4, placeholder="Enter SMS or Email message here...", label="Enter Message"),
        outputs=gr.Textbox(label="Classification Result"),
        title="📨 SMS Spam Detection",
        description="Enter any message to check if it's spam or not using your trained SVM model."
    )

    # Launch the app with public link
    interface.launch(share=True)
else:
    print("Gradio interface not launched because model or vectorizer could not be loaded.")

Model and vectorizer loaded successfully.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8d7c1d8665c2bb9790.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


**END**