# Spam SMS Detection

## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

## Load dataset


In [2]:
data = pd.read_csv('/content/drive/MyDrive/Codsoft dataset/Spam SMS Detection/spam.csv', encoding='ISO-8859-1')

## Selecting only relevant columns


In [3]:
data = data.iloc[:, :2]
data.columns = ['Category', 'Message']

## Convert labels to binary values


In [4]:
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})
print("Dataset Columns:", data.columns)

Dataset Columns: Index(['Category', 'Message'], dtype='object')


## Function for text preprocessing


In [5]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

## Apply text cleaning


In [6]:
data['Message'] = data['Message'].apply(clean_text)

## Convert text to numerical representation


In [7]:
text_vectorizer = TfidfVectorizer()
X_features = text_vectorizer.fit_transform(data['Message'])
y_labels = data['Category']

## Splitting data into training and testing sets


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=42)

## Initializing multiple classification models


In [9]:
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(kernel='linear')
}

## Selecting the best performing model


In [10]:
optimal_model = None
highest_accuracy = 0

In [11]:
for model_name, model in classifiers.items():
    model.fit(X_train, y_train)  # Train model
    predictions = model.predict(X_test)  # Make predictions
    model_accuracy = accuracy_score(y_test, predictions)  # Evaluate accuracy
    print(f"{model_name} Accuracy: {model_accuracy:.4f}")

    if model_accuracy > highest_accuracy:
        highest_accuracy = model_accuracy
        optimal_model = model

print(f"Optimal Model: {optimal_model} with Accuracy: {highest_accuracy:.4f}")

Multinomial Naive Bayes Accuracy: 0.9507
Logistic Regression Accuracy: 0.9605
Support Vector Machine Accuracy: 0.9785
Optimal Model: SVC(kernel='linear') with Accuracy: 0.9785


## Function to classify a given SMS message


In [12]:
def classify_sms(message):
    processed_msg = clean_text(message)  # Clean the input message
    msg_vector = text_vectorizer.transform([processed_msg])  # Convert text to vector
    result = optimal_model.predict(msg_vector)  # Make prediction
    return "Spam" if result[0] == 1 else "Ham"

## User input for SMS classification


In [16]:
user_sms = input("Enter a message to classify: ")
print("Prediction:", classify_sms(user_sms))

Enter a message to classify: Congratulations! You have won a free iPhone. Click here to claim your prize: www.fakeoffer.com
Prediction: Spam
