In [None]:
# ✅ NLP Spam Detection – Important Best Practices 💡
import pandas as pd
import os
import zipfile
import urllib.request

# Download and extract the dataset if not already present.
# `url` was previously referenced without ever being defined, so the first run
# on a machine without a local copy failed with NameError before downloading.
# NOTE(review): this is the classic UCI location of the SMS Spam Collection
# archive - confirm it is still the mirror you want to depend on.
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       '00228/smsspamcollection.zip')

if not os.path.exists('SMSSpamCollection'):
    zip_path = 'smsspamcollection.zip'
    # Reuse a previously downloaded archive instead of fetching it again.
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    # Unpack into the current working directory, where the loader expects it.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall()

# Load the data: a tab-separated file whose columns are named explicitly here
# (the first line of the file is treated as data, not as a header).
column_names = ["label", "message"]
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=column_names)
messages.head()
# Preview of the first five rows:
# | Index | label    | message                                           |
# | ----- | -------- | ------------------------------------------------- |
# | 0     | **ham**  | Go until jurong point, crazy.. Available only ... |
# | 1     | **ham**  | Ok lar... Joking wif u oni...                     |
# | 2     | **spam** | Free entry in 2 a wkly comp to win FA Cup fina... |
# | 3     | **ham**  | U dun say so early hor... U c already then say... |
# | 4     | **ham**  | Nah I don't think he goes to usf, he lives aro... |

# 🧹 Step 1: Preprocessing – Clean the Text
# Install and import missing modules
%pip install nltk --quiet

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Build the stop-word lookup once: calling stopwords.words() for every token
# repeats the same work, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
for raw_text in messages['message']:
    # Replace everything except ASCII letters with spaces, lowercase, tokenize.
    tokens = non_letters.sub(' ', raw_text).lower().split()
    # Drop stop words, stem what remains, and re-join into one string.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))
# ✔️ Cleaned texts are stored in corpus.

# 🔀 Step 2: Train-Test Split (🚨 Important)
from sklearn.model_selection import train_test_split

# 📌 One-hot encode the labels and keep only the 'ham' indicator column,
#    so y is True for ham and False for spam.
label_indicators = pd.get_dummies(messages['label'])
y = label_indicators['ham'].values
y
# array([ True,  True, False, ...,  True,  True,  True], shape=(5572,))

# 🔀 Split the cleaned text itself (not vectorized features): 80% train / 20% test,
#    with a fixed seed so the split is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.2,
    random_state=42,
)

# 🧾 Step 3: Vectorization – Bag of Words ✅
from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams and bigrams, capped at 2500 features.
cv = CountVectorizer(
    ngram_range=(1, 2),
    max_features=2500,
)

# ✅ Training data: learn the vocabulary and encode in one step.
X_train = cv.fit_transform(X_train_raw).toarray()

# ✅ Test data: encode only, reusing the vocabulary learned above (no refit!).
X_test = cv.transform(X_test_raw).toarray()
# 🧠 fit_transform() = build vocabulary + apply it to the training set
# 🧠 transform()     = apply that same vocabulary to the test set
# 🧠 Step 4: Train Model and Predict 📈
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# fit() returns the fitted estimator, so construction and training chain.
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out test set.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# ✔️ Accuracy: ~98%
# ✔️ Good precision/recall, no data leakage!
# 🔁 Repeat with TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 1000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
    lowercase=True,
    stop_words='english',
)

# ✅ Learn the vocabulary/IDF weights from the training split only...
X_train_tfidf = tfidf.fit_transform(X_train_raw).toarray()
# ✅ ...then encode the test split with that same fitted vectorizer.
X_test_tfidf = tfidf.transform(X_test_raw).toarray()

# Train + predict with TF-IDF features (rebinds `model` to a fresh classifier).
model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = model.predict(X_test_tfidf)

print("Accuracy (TF-IDF):", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf))

# Accuracy: 0.9838565022421525
#               precision    recall  f1-score   support

#        False       0.95      0.93      0.94       149
#         True       0.99      0.99      0.99       966

#     accuracy                           0.98      1115
#    macro avg       0.97      0.96      0.96      1115
# weighted avg       0.98      0.98      0.98      1115

# Accuracy (TF-IDF): 0.97847533632287
#               precision    recall  f1-score   support

#        False       0.96      0.87      0.92       149
#         True       0.98      0.99      0.99       966

#     accuracy                           0.98      1115
#    macro avg       0.97      0.93      0.95      1115
# weighted avg       0.98      0.98      0.98      1115




Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9838565022421525
              precision    recall  f1-score   support

       False       0.95      0.93      0.94       149
        True       0.99      0.99      0.99       966

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy (TF-IDF): 0.97847533632287
              precision    recall  f1-score   support

       False       0.96      0.87      0.92       149
        True       0.98      0.99      0.99       966

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

