# Spam Message Classification Using NLP

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re

Read the dataset

In [3]:
# Load the spam dataset, keep only the label column (v1) and the text column
# (v2), and give both descriptive names.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [4]:
# Show the first five rows (the default for head()) as a quick check of the loaded data.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data Pre-processing

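Pre-process each message: replace non-alphabetic characters with spaces, lowercase the text, drop English stop words, and reduce the remaining words to their stems with the Porter stemmer.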

In [5]:
# Fetch the NLTK stop-word corpus; stopwords.words() below reads from it.
nltk.download('stopwords')

# The stemmer does not depend on the download.
stemmer = PorterStemmer()

# Store the stop words in a set for fast membership checks while filtering tokens.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
def preprocess_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase the
    result, split on whitespace, drop English stop words, and stem the
    remaining words.

    Args:
        text: The raw message string. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty message.

    Returns:
        str: The stemmed tokens joined by single spaces, or ``''`` when the
        input is missing or no token survives the filtering.
    """
    # A missing value would make re.sub raise TypeError; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    # Stop words are matched against the unstemmed, lowercased tokens, so the
    # filter has to run before stemming.
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Replace each raw message with its cleaned form. This overwrites the column,
# so re-running the cell pre-processes the already-processed text again.
df['message'] = df['message'].map(preprocess_text)

In [8]:
# Show the first five rows to inspect the message text after pre-processing.
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


Convert the target labels to integers (ham → 0, spam → 1)

In [15]:
# Keep the fitted encoder around: its classes_ name the rows of the
# classification report, and it decodes predictions back to the original labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
df['label'] = encoded_labels

In [16]:
# Show the first five encoded labels.
df['label'].head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

Split the dataset into training (70%) and test (30%) sets

In [17]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

In [18]:
# Two-step model: turn each message into word counts, then classify the counts
# with multinomial Naive Bayes.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

Fit the pipeline: convert the training messages to word-count vectors and train the Naive Bayes classifier on them

In [19]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split only.
pipeline.fit(X_train, y_train)

In [20]:
# Score the fitted pipeline on the held-out test messages.
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# target_names swaps the encoded 0/1 row labels for the original class names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9826555023923444
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1453
        spam       0.95      0.91      0.93       219

    accuracy                           0.98      1672
   macro avg       0.97      0.95      0.96      1672
weighted avg       0.98      0.98      0.98      1672



# Custom text input

In [25]:
def predict_message(message):
    """Classify a single raw message with the fitted pipeline.

    The message is cleaned with ``preprocess_text`` before it is passed to
    ``pipeline``.

    Args:
        message: Raw, unprocessed message text.

    Returns:
        The predicted label, decoded by ``label_encoder`` back to its
        original value.
    """
    cleaned = preprocess_text(message)
    # predict() takes a collection of documents, so wrap the single message in a list.
    encoded = pipeline.predict([cleaned])
    decoded = label_encoder.inverse_transform(encoded)
    return decoded[0]


# Try the classifier on a hand-written example message.
custom_message = "Congratulations! You've won a free ticket to Bahamas. Text 'WIN' to 12345 to claim your prize."
prediction = predict_message(custom_message)
print(f"Custom Message: {custom_message}\nPrediction: {prediction}")


Custom Message: Congratulations! You've won a free ticket to Bahamas. Text 'WIN' to 12345 to claim your prize.
Prediction: spam
