# **Email/SMS Spam Detection Model**

#### Importing Important Libraries

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


## **Loading DataSet**

### Data Preprocessing

In [None]:
# Load the 'sms_spam' dataset via the `datasets` library.
dataset_dict = load_dataset('sms_spam')
# Read the second corpus from a CSV file, decoded as latin1.
# NOTE(review): the path is absolute ('/content/...'), presumably a Colab upload
# location; confirm the file is present there before running.
df2 = pd.read_csv(r'/content/spam_email_dataset.csv',encoding='latin1')

In [None]:
df = pd.DataFrame(dataset_dict['train'])

In [None]:
df.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
df.shape

(5574, 2)

In [None]:
df2['sms'] = df2['Subject'] + " " + df2['Body']

In [None]:
df2 = df2[['sms','Spam Label']]

In [None]:
# Rename the target column instead of assigning a new column on a sliced frame:
# column assignment on the slice is what raised pandas' SettingWithCopyWarning.
# The result is the same two-column frame ('sms', 'label').
df2 = df2.rename(columns={'Spam Label': 'label'})[['sms', 'label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['label'] = df2['Spam Label']


In [None]:
df2.head()

Unnamed: 0,sms,label
0,Exclusive Loan Offer  Get Instant Approval Al...,1
1,"Meeting Agenda for Tomorrow Dear Student,\n\nT...",0
2,"Quarterly Sales Report Attached Hey,\n\nIm sh...",0
3,Exclusive Loan Offer  Get Instant Approval De...,1
4,"Win $1,000,000 Now  Limited Time Offer! Dear ...",1


In [None]:
df = pd.concat([df,df2],ignore_index = True)

In [None]:
df.shape

(7074, 2)

In [None]:
df.isnull().sum()

Unnamed: 0,0
sms,0
label,0


In [None]:
# Lowercase every message with the vectorised string accessor rather than a
# per-row Python lambda.
df['sms'] = df['sms'].str.lower()

In [None]:
import string
# Translation table that deletes every character in `string.punctuation`.
# Built once here so it is not recreated for each row the function is applied to.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punct(txt):
  """Return `txt` with every character listed in `string.punctuation` removed.

  Only the ASCII punctuation in `string.punctuation` is stripped; other
  characters (including non-ASCII punctuation) are left untouched.
  """
  return txt.translate(_PUNCT_TABLE)
df['sms'] = df['sms'].apply(remove_punct)

In [None]:
def remove_emojis(txt):
  """Return `txt` keeping only its ASCII characters.

  Despite the name, every non-ASCII character is dropped (emojis, accented
  letters, currency symbols, typographic quotes, ...), not just emojis.
  """
  return ''.join(ch for ch in txt if ch.isascii())
df['sms'] = df['sms'].apply(remove_emojis)

In [None]:
df.head()

Unnamed: 0,sms,label
0,go until jurong point crazy available only in ...,0
1,ok lar joking wif u oni\n,0
2,free entry in 2 a wkly comp to win fa cup fina...,1
3,u dun say so early hor u c already then say\n,0
4,nah i dont think he goes to usf he lives aroun...,0


## **Model Building**

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))

### Removing Stopwords

In [None]:
def remove_stopwords(txt):
  """Drop every whitespace-separated token of `txt` found in `stop_words`.

  The remaining tokens are re-joined with single spaces, so runs of
  whitespace (including newlines) in the input collapse to one space.
  """
  kept = [token for token in txt.split() if token not in stop_words]
  return ' '.join(kept)
df['sms'] = df['sms'].apply(remove_stopwords)


In [None]:
df.head()

Unnamed: 0,sms,label
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry 2 wkly comp win fa cup final tkts 2...,1
3,u dun say early hor u c already say,0
4,nah dont think goes usf lives around though,0


### Separating Data Into X and y Variables

In [None]:
X = df['sms']
y = df['label']

### Splitting Data Into Train Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
bow_vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the training messages only and encode them.
X_train_bow = bow_vectorizer.fit_transform(X_train)
# Encode the test messages with the already-fitted vectorizer (no refit).
X_test_bow = bow_vectorizer.transform(X_test)

## **Naive Bayes Model**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
nb_model = MultinomialNB()

### Naive Bayes Model Training and Testing

In [None]:
nb_model.fit(X_train_bow,y_train)

In [None]:
pred_np = nb_model.predict(X_test_bow)

In [None]:
print('accuracy = ',accuracy_score(y_test,pred_np))

accuracy =  0.9851590106007068


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, pred_np))
print(classification_report(y_test, pred_np))


[[1072    7]
 [  14  322]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1079
           1       0.98      0.96      0.97       336

    accuracy                           0.99      1415
   macro avg       0.98      0.98      0.98      1415
weighted avg       0.99      0.99      0.99      1415



In [None]:
def predict_message(msg, threshold=0.4):
    """Classify a single message with the trained model and print the result.

    Parameters
    ----------
    msg : str
        Raw message text.
    threshold : float, default 0.4
        Minimum spam probability at which the message is labelled "SPAM";
        anything below is labelled "HAM".

    Prints the original message, the predicted label and the spam
    probability. Returns None.
    """
    # Apply the same cleaning steps, in the same order, that were applied to
    # the training text before vectorisation, so the tokens seen here match
    # the vocabulary the vectorizer was fitted on.
    cleaned = remove_stopwords(remove_emojis(remove_punct(msg.lower())))

    msg_vec = bow_vectorizer.transform([cleaned])
    # Column 1 of predict_proba is the probability of label 1 (spam).
    prob = nb_model.predict_proba(msg_vec)[0][1]
    label = "SPAM" if prob >= threshold else "HAM"

    print(f"\n📩 Message: {msg}")
    print(f"🔍 Prediction: {label}")
    print(f"📊 Spam Probability: {prob:.2f}")


In [None]:
# Sample messages for a manual sanity check of the classifier.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single message.
test_messages = [
    "Your loan has been approved. No documents required! Apply now.",
    "Thanks for your help on the project yesterday, really appreciate it.",
    "You’ve won a free holiday! Call now to claim your prize.",
    "Can we reschedule our appointment to Friday afternoon?",
    "Claim your exclusive deal now — limited time only!",
    "Please review the latest update in the shared folder before our meeting.",
    "Congratulations! You've been selected for a chance to win ₹1,00,000.",
    "I'll send you the invoice by evening, let me know if you need anything else.",
    "Verify your email address now to avoid service interruption.",
    "Do you have the updated syllabus for this semester's exam?",
    "Congratulations! You have won a free lottery ticket.",
    "Hi bro, kal milte hain college me.",
]

for msg in test_messages:
    predict_message(msg)



📩 Message: Your loan has been approved. No documents required! Apply now.
🔍 Prediction: SPAM
📊 Spam Probability: 1.00

📩 Message: Thanks for your help on the project yesterday, really appreciate it.
🔍 Prediction: HAM
📊 Spam Probability: 0.00

📩 Message: You’ve won a free holiday! Call now to claim your prize.
🔍 Prediction: SPAM
📊 Spam Probability: 1.00

📩 Message: Can we reschedule our appointment to Friday afternoon?
🔍 Prediction: HAM
📊 Spam Probability: 0.00

📩 Message: Claim your exclusive deal now — limited time only!
🔍 Prediction: SPAM
📊 Spam Probability: 1.00

📩 Message: Please review the latest update in the shared folder before our meeting.
🔍 Prediction: HAM
📊 Spam Probability: 0.03

📩 Message: Congratulations! You've been selected for a chance to win ₹1,00,000.
🔍 Prediction: SPAM
📊 Spam Probability: 1.00

📩 Message: I'll send you the invoice by evening, let me know if you need anything else.
🔍 Prediction: HAM
📊 Spam Probability: 0.00

📩 Message: Verify your email address now 

## Saving the Model and Vectorizer to Pickle Files

In [None]:
import joblib

# Persist the trained classifier and the fitted vectorizer as separate files;
# prediction uses both (the vectorizer encodes text, the model scores it).
joblib.dump(nb_model,'email_spam_detection_model.pkl')
joblib.dump(bow_vectorizer,'vectorizer.pkl')

['vectorizer.pkl']