# *SMS Spam Detection Using Naive Bayes and NLP*


In [1]:
import pandas as pd


In [5]:
df = pd.read_csv("spam.csv",encoding='latin-1')

In [9]:
df = df[["v1","v2"]]

In [17]:
df.columns = ['label', 'message']

In [19]:
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [21]:
import string
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [23]:
nltk.download("punkt")
nltk.download("stopwords")
        

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saxen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words =word_tokenize(text)
    stop_words=set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]  # remove stopwords
    return ' '.join(words)

# Apply to message column
df['clean_message'] = df['message'].apply(preprocess_text)

# Preview result
df[['message', 'clean_message']].head()

Unnamed: 0,message,clean_message
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
vectorizer = TfidfVectorizer()

In [49]:
X = vectorizer.fit_transform(df['clean_message'])

In [51]:
y = df['label'].map({'ham': 0, 'spam': 1}).values

## ***naive_bayes***

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [55]:
X_train, X_test, y_train,y_test =train_test_split(X,y ,test_size=.2,random_state=42)

In [59]:
model = MultinomialNB()
model.fit(X_train,y_train)

In [61]:
pred = model.predict(X_test)

In [65]:
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))

Accuracy: 0.9659192825112107

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
 [[965   0]
 [ 38 112]]


## ***logisticRegression***

In [74]:
from sklearn.linear_model import LogisticRegression

In [76]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [78]:
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))

Accuracy: 0.9426008968609866

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.96      0.60      0.74       150

    accuracy                           0.94      1115
   macro avg       0.95      0.80      0.85      1115
weighted avg       0.94      0.94      0.94      1115


Confusion Matrix:
 [[961   4]
 [ 60  90]]


## ***RANDOMFOREST***

In [83]:
from sklearn.ensemble import RandomForestClassifier


In [85]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [87]:
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))

Accuracy: 0.9721973094170404

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[965   0]
 [ 31 119]]


## ***SVM***

In [94]:
from sklearn.svm import SVC

In [96]:
model = SVC()
model.fit(X_train, y_train)

In [98]:
pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))

Accuracy: 0.967713004484305

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.98      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.97      0.89      0.92      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[963   2]
 [ 34 116]]


| Model              | Accuracy (%)         |
|--------------------|----------------------|
| Logistic Regression| 0.9426008968609866   |
| Random Forest      | 0.9721973094170404   |   
| SVM                | 0.967713004484305    |
| Naive Bayes        | 0.9659192825112107   |