In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("spam.csv", encoding='latin-1')

# Drop unnecessary columns and rename
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

# Check the shape and preview
print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (5572, 2)
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download NLTK assets
nltk.download('stopwords')

# Set up tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize and remove stopwords
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply to text column
df['clean_text'] = df['text'].apply(clean_text)

print(df[['text', 'clean_text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                          clean_text  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri 2 wkli comp win fa cup final tkt 21...  
3                u dun say earli hor u c alreadi say  
4          nah dont think goe usf live around though  


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=3000)

# Transform the cleaned text
X = vectorizer.fit_transform(df['clean_text']).toarray()

# Convert labels to binary
y = df['label'].map({'ham': 0, 'spam': 1}).values

print("Feature matrix shape:", X.shape)


Feature matrix shape: (5572, 3000)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.97847533632287

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
 [[965   0]
 [ 24 126]]


In [5]:
def predict_message(msg):
    # Clean the input
    cleaned = clean_text(msg)
    # Vectorize
    vectorized = vectorizer.transform([cleaned]).toarray()
    # Predict
    prediction = model.predict(vectorized)[0]
    label = "Spam" if prediction == 1 else "Ham"
    return label

# Test it
print(predict_message("Congratulations! You won a free iPhone. Click here to claim."))
print(predict_message("Hey, are we still on for lunch today?"))


Spam
Ham


In [9]:
# Label mapping: 1 = Spam, 0 = Ham
test_cases_normal = [
    ("Hey, are you coming to the meeting tomorrow?", 0),
    ("Your OTP for login is 928374", 0),
    ("Congratulations! You’ve been selected for a $1000 Walmart gift card.", 1),
    ("Can we reschedule our call to next week?", 0),
    ("Claim your free vacation now by clicking this link!", 1),
    ("Don't forget to bring your ID for the interview.", 0),
    ("URGENT! You have won a lottery. Text WIN to 90999 now!", 1),
    ("Dinner at 7 pm? Let me know.", 0),
    ("This is not spam. We’re just checking your eligibility for insurance.", 1),
    ("Free entry in 2 a weekly competition to win an iPhone. Just text WIN!", 1),
    ("Did you finish the assignment due today?", 0),
    ("You've been pre-approved for a credit card. Apply now!", 1),
    ("Let’s go hiking this weekend!", 0),
    ("Earn $500 a day working from home. No experience needed.", 1),
    ("Hey, I’ll be late. Stuck in traffic.", 0),
    ("Your parcel has been dispatched. Track it here.", 0),
    ("Get cheap meds online without prescription. Visit now!", 1),
    ("See you at the game tonight!", 0),
    ("Winner! You’ve been selected for a $500 Amazon voucher.", 1),
    ("I left your jacket at the office. Grab it tomorrow.", 0),
]

test_cases_tricky = [
    ("You’re a winner. Psych! Just kidding, it’s me.", 0),
    ("Free coffee if you buy five this month. Terms apply.", 0),
    ("This is a business opportunity but not a scam.", 1),
    ("Yo! I found a crazy way to earn extra cash (not a joke).", 1),
    ("Meeting got moved. Also, check out this crypto project!", 1),
    ("Hey! Click this link... just memes, nothing shady lol.", 1),
    ("Not spam. You’ve actually won something (probably).", 1),
    ("Forward this to 5 people and good luck will follow 🍀", 1),
    ("Get paid to read emails! Just kidding... unless?", 1),
    ("Your phone number was used in a recent transaction. Was this you?", 0)
]

# Combine all test cases
all_tests = test_cases_normal + test_cases_tricky

# Predict and compare
for msg, expected in all_tests:
    predicted = 1 if predict_message(msg) == "Spam" else 0
    label = "✅ Correct" if predicted == expected else "❌ Wrong"
    print(f"Message: {msg[:60]:<60} | Expected: {expected} | Predicted: {predicted} | {label}")


Message: Hey, are you coming to the meeting tomorrow?                 | Expected: 0 | Predicted: 0 | ✅ Correct
Message: Your OTP for login is 928374                                 | Expected: 0 | Predicted: 0 | ✅ Correct
Message: Congratulations! You’ve been selected for a $1000 Walmart gi | Expected: 1 | Predicted: 1 | ✅ Correct
Message: Can we reschedule our call to next week?                     | Expected: 0 | Predicted: 0 | ✅ Correct
Message: Claim your free vacation now by clicking this link!          | Expected: 1 | Predicted: 1 | ✅ Correct
Message: Don't forget to bring your ID for the interview.             | Expected: 0 | Predicted: 0 | ✅ Correct
Message: URGENT! You have won a lottery. Text WIN to 90999 now!       | Expected: 1 | Predicted: 1 | ✅ Correct
Message: Dinner at 7 pm? Let me know.                                 | Expected: 0 | Predicted: 0 | ✅ Correct
Message: This is not spam. We’re just checking your eligibility for i | Expected: 1 | Predicted: 0 | ❌ Wrong
Mes

In [32]:
import pickle

# Assuming your trained model and vectorizer are named `model` and `vectorizer`
with open("spam_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("Saved model and vectorizer successfully.")


Saved model and vectorizer successfully.
