In [1]:
import pandas as pd
import joblib
import os
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Download NLTK resources
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the dataset with explicit encoding
df = pd.read_csv(r"C:\Users\Dell\Desktop\Resume Projects\SMS Spam Detection\spam.csv", encoding='latin1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:

# Drop unnecessary columns if present
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

# Rename columns to a standard format
df.columns = ['label', 'message']

# Encode the labels
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing function
def preprocess_text(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [ps.stem(word) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Apply text preprocessing
df['message'] = df['message'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)


In [4]:
# Create a pipeline with TfidfVectorizer and SVC
svc_pipeline = make_pipeline(TfidfVectorizer(), SVC(probability=True))

# Train the SVC model
svc_pipeline.fit(X_train, y_train)

# Evaluate the SVC model
svc_y_pred = svc_pipeline.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_y_pred)
print(f'SVC Accuracy: {svc_accuracy}')
print(classification_report(y_test, svc_y_pred))

SVC Accuracy: 0.9775784753363229
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.85      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [5]:
# Create a pipeline with TfidfVectorizer and RandomForest
rf_pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42))

# Train the RandomForest model
rf_pipeline.fit(X_train, y_train)

# Evaluate the RandomForest model
rf_y_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')
print(classification_report(y_test, rf_y_pred))


Random Forest Accuracy: 0.97847533632287
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [6]:
# Select the best model based on accuracy
if svc_accuracy > rf_accuracy:
    best_model = svc_pipeline
    print("SVC model selected.")
else:
    best_model = rf_pipeline
    print("Random Forest model selected.")

# Save the best model
joblib.dump(best_model, 'best_sms_spam_classifier.pkl')
print("Best model saved successfully.")


Random Forest model selected.
Best model saved successfully.


In [7]:
# Load the best trained model
model = joblib.load('best_sms_spam_classifier.pkl')

# Prompt the user for an SMS message
sms_message = input("Enter an SMS message: \n")

# Preprocess the input message
preprocessed_message = preprocess_text(sms_message)

# Predict using the model
prediction = model.predict([preprocessed_message])[0]
prediction_proba = model.predict_proba([preprocessed_message])[0]

# Display the result with bold heading and result on the next line

from IPython.display import display, HTML

display(HTML('<b>Prediction:</b>'))

if prediction == 1:
    print(f'This message is classified as Spam\nProbability: {prediction_proba[1]:.2f}')
else:
    print(f'\nThis message is classified as Not Spam\nProbability: {prediction_proba[0]:.2f}')


Enter an SMS message: 
heey wud



This message is classified as Not Spam
Probability: 1.00
