<a href="https://colab.research.google.com/github/NallabothulaNithin/SMS-Spam-Detection/blob/main/spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Ask the user to upload the dataset into the Colab runtime.
# NOTE(review): the string argument is treated as a destination folder, not a
# file name — in the recorded run the file was saved as "spam.csv/spam.csv",
# which is the path the loading cell reads from.
uploaded = files.upload("spam.csv")


Saving spam.csv to spam.csv/spam.csv


In [None]:
import pandas as pd
# Load the uploaded CSV into a DataFrame. The path includes the "spam.csv"
# folder that the upload step saved the file into.
# latin-1 is requested explicitly — presumably the file is not valid UTF-8
# (TODO confirm against the raw file).
data = pd.read_csv("spam.csv/spam.csv", encoding="latin-1")


In [None]:
# Preview the first rows together with the column names. The cell value is a
# (DataFrame, Index) tuple, so it is rendered as plain text rather than a table.
data.head(), data.columns

(     v1                                                 v2 Unnamed: 2  \
 0   ham  Go until jurong point, crazy.. Available only ...        NaN   
 1   ham                      Ok lar... Joking wif u oni...        NaN   
 2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
 3   ham  U dun say so early hor... U c already then say...        NaN   
 4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   
 
   Unnamed: 3 Unnamed: 4  
 0        NaN        NaN  
 1        NaN        NaN  
 2        NaN        NaN  
 3        NaN        NaN  
 4        NaN        NaN  ,
 Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object'))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Keep only the needed columns: v1 holds the class label, v2 the message text.
# .copy() makes df an independent frame instead of a slice of `data`, so later
# column assignments on df do not raise SettingWithCopyWarning.
df = data[['v1', 'v2']].copy()
df.columns = ['label', 'message']

In [None]:
# Encode labels (ham=0, spam=1).
# The mapping is read from the raw `data['v1']` column (aligned on the index)
# rather than from df['label'], so re-running this cell cannot turn the
# already-encoded 0/1 values into NaN. .assign() returns a new frame instead
# of writing into a slice, which avoids SettingWithCopyWarning.
df = df.assign(label=data['v1'].map({'ham': 0, 'spam': 1}))

# Any value other than 'ham'/'spam' maps to NaN; fail here with a clear
# message instead of later inside the split or model fitting.
if df['label'].isna().any():
    raise ValueError("Unexpected label values found; expected only 'ham' or 'spam'")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})


In [None]:
# Hold out 20% of the messages for evaluation. The split is seeded and
# stratified on the label so both parts keep the same ham/spam ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [None]:
# Convert text to TF-IDF features.
# English stop words are removed and max_features=5000 caps the vocabulary size.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit the vectorizer on the training messages only, then reuse it unchanged on
# the test messages so the test set does not influence the learned vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Candidate classifiers, keyed by display name, all trained on the same
# TF-IDF features.
# LinearSVC gets a fixed random_state so repeated runs produce the same model,
# matching the seeded (random_state=42) train/test split.
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(random_state=42)
}

In [None]:
# Fit each candidate on the training features and score it on the held-out set.
# `results` maps model name -> {"accuracy": float, "report": dict}.
results = {}
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    results[name] = dict(
        accuracy=accuracy_score(y_test, y_pred),
        report=classification_report(y_test, y_pred, output_dict=True),
    )

results

{'Naive Bayes': {'accuracy': 0.9721973094170404,
  'report': {'0': {'precision': 0.9698492462311558,
    'recall': 0.9989648033126294,
    'f1-score': 0.98419173890872,
    'support': 966.0},
   '1': {'precision': 0.9916666666666667,
    'recall': 0.7986577181208053,
    'f1-score': 0.8847583643122676,
    'support': 149.0},
   'accuracy': 0.9721973094170404,
   'macro avg': {'precision': 0.9807579564489113,
    'recall': 0.8988112607167174,
    'f1-score': 0.9344750516104938,
    'support': 1115.0},
   'weighted avg': {'precision': 0.9727647580202958,
    'recall': 0.9721973094170404,
    'f1-score': 0.9709042296577143,
    'support': 1115.0}}},
 'Logistic Regression': {'accuracy': 0.9704035874439462,
  'report': {'0': {'precision': 0.9669669669669669,
    'recall': 1.0,
    'f1-score': 0.983206106870229,
    'support': 966.0},
   '1': {'precision': 1.0,
    'recall': 0.7785234899328859,
    'f1-score': 0.8754716981132076,
    'support': 149.0},
   'accuracy': 0.9704035874439462,
   '

In [None]:
# Example custom messages: hand-written texts (two prize/offer style, one
# conversational) that are vectorized and classified in the following cells.
new_messages = [
    "Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your mobile number has won $5000. Reply YES to claim."
]

In [None]:
# Transform using the same TF-IDF vectorizer that was fitted on the training
# messages (transform only, no refit), so the feature columns match the ones
# the models were trained on.
new_tfidf = tfidf.transform(new_messages)


In [None]:
# Predict with the "SVM" entry of `models`.
# NOTE(review): this cell was originally labelled "best model" — confirm
# against the full `results` output before relying on that claim.
predictions = models["SVM"].predict(new_tfidf)

In [None]:
# Turn numeric predictions into readable names: 0 -> "ham", anything else -> "spam"
labels = ["spam" if p != 0 else "ham" for p in predictions]

# Show every message next to its predicted label
for msg, label in zip(new_messages, labels):
    print(f"Message: {msg}\nPrediction: {label}\n")

Message: Congratulations! You have won a $1000 Walmart gift card. Click here to claim now!
Prediction: spam

Message: Hey, are we still meeting for lunch today?
Prediction: ham

Message: URGENT! Your mobile number has won $5000. Reply YES to claim.
Prediction: spam



In [None]:
# Function to predict if a message is spam or ham
def predict_message(message: str, model=None, vectorizer=None) -> str:
    """
    Predict whether a given SMS message is spam or ham.

    The default model and vectorizer are looked up when the function is
    called, not when it is defined, so retraining or re-creating `models` /
    `tfidf` in another cell takes effect without redefining this function.

    Args:
        message (str): The SMS text
        model: Trained classifier exposing ``predict``
            (default = the current ``models["SVM"]``)
        vectorizer: Fitted vectorizer exposing ``transform``
            (default = the current ``tfidf``)
    Returns:
        str: 'spam' if the predicted class is 1, otherwise 'ham'
    Raises:
        TypeError: if ``message`` is not a string
    """
    # Resolve defaults at call time to avoid binding stale objects at def time
    if model is None:
        model = models["SVM"]
    if vectorizer is None:
        vectorizer = tfidf

    # A list or other non-string would be wrapped in a list below and fail
    # deep inside the vectorizer; reject it here with a clear message.
    if not isinstance(message, str):
        raise TypeError(f"message must be a str, got {type(message).__name__}")

    # The vectorizer expects an iterable of documents, hence the one-item list
    msg_tfidf = vectorizer.transform([message])
    prediction = model.predict(msg_tfidf)[0]
    return "spam" if prediction == 1 else "ham"


# ✅ Test the function on a promotional text, a casual text, and a second promotional text.
# NOTE(review): in the recorded output all three are labelled "ham", including
# both sale promotions — worth checking whether this kind of marketing text is
# represented in the training data.
print(predict_message("ZUDIO SALE MADNESS - last 3 days! B1G1 FREE for sale fits @199 or less! RUN NOW! Peep the fresh arrivals: https://shrtsms.in/ZUDIOO/bmHPfR *T&C"))
print(predict_message("Hey bro, are you coming to the party?"))
print(predict_message("B1G1 at WESTSIDE SALE - 2 days left! All sale items under Rs 499! In-store only. Don't miss out! *T&C"))


ham
ham
ham
