In [106]:
# we'll import all the tools we need at the very beginning.
import pandas as pd 
# for making our visualization
import matplotlib.pyplot as plt
import seaborn as sns 
# for splitting data
from sklearn.model_selection import train_test_split 
# for turning text into number
from sklearn.feature_extraction.text import TfidfVectorizer 
# for fixing our class imbalance 
from imblearn.over_sampling import SMOTE 
# we will test two diffrent models
from sklearn.naive_bayes import MultinomialNB # A classic, effective model for text
from sklearn.linear_model import LogisticRegression # A powerful and common classifier
# evaluating our model
from sklearn.metrics import classification_report, confusion_matrix, f1_score 

In [107]:
# load and clean the dataset

In [108]:
# load the dataset
# we use encoding = 'latin-1' cause this dataset contain special charecter
try:
    df = pd.read_csv("spam.csv", encoding='latin-1')
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Make sure it's in the same folder.")
  

# keeping only the two column which we need 'v1' (label) and 'v2' (message)
df = df[['v1', 'v2']]

# Rename the columns to something easy to understand
df.columns = ['label', 'message']

print("--- Data Head (After Cleaning) ---")
print(df.head())
print("\n")

--- Data Head (After Cleaning) ---
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...




In [109]:
# explore and prepare the data

In [110]:
# checking for missing value
print("Missing Value Check")
print(df.isnull().sum())
print("\n")

# 2. Checking the class balance (this is "imbalance" problem)
print("Class Balance")
print(df['label'].value_counts())
print("\n")

# 3. Creating a new column label_num
# machine learning models need numbers not text ('ham' or 'spam').
# we will map 'ham' to 0 and 'spam' to 1.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# this is our new column
print("--- Data with 'label_num' (0=ham, 1=spam) ---")
print(df.head(5))
print("\n")

Missing Value Check
label      0
message    0
dtype: int64


Class Balance
label
ham     4825
spam     747
Name: count, dtype: int64


--- Data with 'label_num' (0=ham, 1=spam) ---
  label                                            message  label_num
0   ham  Go until jurong point, crazy.. Available only ...          0
1   ham                      Ok lar... Joking wif u oni...          0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...          1
3   ham  U dun say so early hor... U c already then say...          0
4   ham  Nah I don't think he goes to usf, he lives aro...          0




In [111]:
# Augment the data set 

print("--- Augmenting Data with Expanded List ---")
print("Original shape:", df.shape)
print("Original value counts:\n", df['label'].value_counts())
print("\n")

# 1. Create a large, diverse list of new scam messages
new_spam_messages = [
    # Bank / Money Scams
    "Hey, my money accidently transfeered on ur bank",
    "this is bank wala your account frozen",
    "URGENT wrong transfer send back plz",
    "accidently sent money to you, plz return",
    "u have recvd money from bank wala",
    "Your bank account has been locked. Please verify immediately.",
    "A large payment was just sent from your acct. If not you, call now.",
    "Unusual activity detected on your bank account. Secure it now.",
    
    # login \ password scam
    "Your password has expired. Click here to reset: [link]",
    "Someone from a new location tried to login to your account.",
    "Verify your account details now or your account will be suspended.",
    "Security Alert: Your passwrd has been compromised. Update now.",
    
    # Impersonation / Authority Scams
    "Myname is bank wala urgent call me",
    "This is the security department. We need to verify your details.",
    "Official notice: Your account is frozen.",
    
    # General typos& Urgency
    "acct blocked plz contact me",
    "need help my money gone",
    "call me now urgent issue",
    "plz send back money wrong acct"
]

# 2. Create the labels for them (all are 'spam', which is 1)
new_labels = ['spam'] * len(new_spam_messages)
new_label_nums = [1] * len(new_spam_messages)

# 3. Create a new DataFrame from our new data
new_data = {
    'label': new_labels,
    'message': new_spam_messages,
    'label_num': new_label_nums
}
new_spam_df = pd.DataFrame(new_data)

# 4. Concatenate (join) our original 'df' with our 'new_spam_df'
# This creates our final, complete dataset
df_augmented = pd.concat([df, new_spam_df], ignore_index=True)

# 5. Check our work!
print("--- After Expanded Augmenting ---")
print("New augmented shape:", df_augmented.shape)
print("\nNew value counts:\n", df_augmented['label'].value_counts())

--- Augmenting Data with Expanded List ---
Original shape: (5572, 3)
Original value counts:
 label
ham     4825
spam     747
Name: count, dtype: int64


--- After Expanded Augmenting ---
New augmented shape: (5591, 3)

New value counts:
 label
ham     4825
spam     766
Name: count, dtype: int64


In [112]:
# now seperating features (X) and target (y)

In [113]:
# We MUST use our new 'df_augmented' DataFrame from now on

# X is our 'feature' - the data we use to make predictions
X = df_augmented['message']

# y is our 'target' - what we *want* to predict
y = df_augmented['label_num']

print(f"Total samples in our new dataset: {len(y)}")

Total samples in our new dataset: 5591


In [114]:
# --- [ 6 ] SPLIT DATA INTO TRAINING AND TESTING SETS ---

# We split our data:
# 80% will be 'training' data
# 20% will be 'testing' data
# stratify=y is VERY important. It makes sure both the train and test 
# sets have the same *percentage* of spam/ham.
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.20, 
    random_state=42, 
    stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 4472
Testing samples: 1119


In [115]:
 # vectorize the text (TF-IDF) 

In [116]:
# We need to turn the text messages into a matrix of numbers.
# stop_words='english' tells it to ignore common words like 'the', 'is', 'in', etc.
vectorizer = TfidfVectorizer(stop_words='english')

# First, we 'fit' and 'transform' the training data.
# This builds its vocabulary *including our new spam words*.
X_train_tfidf = vectorizer.fit_transform(X_train)

# For the test data, we ONLY 'transform' it.
# We use the *same* vocabulary it learned from the training data.
X_test_tfidf = vectorizer.transform(X_test)

In [117]:
# fix imbalance with smote

In [118]:
# We apply SMOTE *only* to our training data.
print("--- Applying SMOTE to fix imbalance ---")

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Let's check the new balance of our *training* data
print("Shape before SMOTE:", X_train_tfidf.shape)
print("Shape after SMOTE:", X_train_resampled.shape)
print("\nNew training set balance:")
print(pd.Series(y_train_resampled).value_counts())
print("\n")

--- Applying SMOTE to fix imbalance ---
Shape before SMOTE: (4472, 7467)
Shape after SMOTE: (7718, 7467)

New training set balance:
label_num
0    3859
1    3859
Name: count, dtype: int64




In [119]:
#  train and evaluate model - 1 (Multinomial Naive Bayes) 

In [120]:
print("="*40)
print("   TRAINING: MULTINOMIAL NAIVE BAYES")
print("="*40)

# Initialize the model
nb_model = MultinomialNB()

# Train the model on our new, balanced, augmented data
nb_model.fit(X_train_resampled, y_train_resampled)
print("--- Naive Bayes Model training complete! ---\n")

# --- Evaluation ---
print("="*40)
print("   EVALUATION: MULTINOMIAL NAIVE BAYES")
print("="*40)

# Make predictions on the *original* unseen test data
y_pred_nb = nb_model.predict(X_test_tfidf)

# 1. The Confusion Matrix
print("\n--- Confusion Matrix (Naive Bayes) ---")
cm_nb = confusion_matrix(y_test, y_pred_nb)
print(cm_nb)

# 2. The Classification Report
print("\n--- Classification Report (Naive Bayes) ---")
print(classification_report(y_test, y_pred_nb, target_names=['Ham (0)', 'Spam (1)']))

   TRAINING: MULTINOMIAL NAIVE BAYES
--- Naive Bayes Model training complete! ---

   EVALUATION: MULTINOMIAL NAIVE BAYES

--- Confusion Matrix (Naive Bayes) ---
[[942  24]
 [ 10 143]]

--- Classification Report (Naive Bayes) ---
              precision    recall  f1-score   support

     Ham (0)       0.99      0.98      0.98       966
    Spam (1)       0.86      0.93      0.89       153

    accuracy                           0.97      1119
   macro avg       0.92      0.95      0.94      1119
weighted avg       0.97      0.97      0.97      1119



In [121]:
# train and evaluate model  - 2 (Logistic Regression) 

In [122]:
print("="*40)
print("   TRAINING: LOGISTIC REGRESSION")
print("="*40)

# Initialize the model
lr_model = LogisticRegression(random_state=42)

# Train the model on the *same* resampled data
lr_model.fit(X_train_resampled, y_train_resampled)
print("--- Logistic Regression Model training complete! ---\n")

# --- Evaluation ---
print("="*40)
print("   EVALUATION: LOGISTIC REGRESSION")
print("="*40)

# Make predictions using the *original* test set
y_pred_lr = lr_model.predict(X_test_tfidf)

# 1. The Confusion Matrix
print("\n--- Confusion Matrix (Logistic Regression) ---")
cm_lr = confusion_matrix(y_test, y_pred_lr)
print(cm_lr)

# 2. The Classification Report
print("\n--- Classification Report (Logistic Regression) ---")
print(classification_report(y_test, y_pred_lr, target_names=['Ham (0)', 'Spam (1)']))

   TRAINING: LOGISTIC REGRESSION
--- Logistic Regression Model training complete! ---

   EVALUATION: LOGISTIC REGRESSION

--- Confusion Matrix (Logistic Regression) ---
[[962   4]
 [ 16 137]]

--- Classification Report (Logistic Regression) ---
              precision    recall  f1-score   support

     Ham (0)       0.98      1.00      0.99       966
    Spam (1)       0.97      0.90      0.93       153

    accuracy                           0.98      1119
   macro avg       0.98      0.95      0.96      1119
weighted avg       0.98      0.98      0.98      1119



In [123]:
# testing the models on our tricky message

In [124]:
# Let's use our tricky messages
my_tricky_messages = [
    "Hey, my money accidently transfeered on ur bank", # Should be SPAM
    "Myname is bank wala",  # Should be SPAM
    "Your password has expired. Click here to reset: [link]", # Should be SPAM
    "hey what time is the meeting tomorrow?" # Should be HAM
]

# 1. Transform the messages
my_messages_tfidf = vectorizer.transform(my_tricky_messages)

# 2. Get predictions from BOTH models
my_pred_nb = nb_model.predict(my_messages_tfidf)
my_pred_lr = lr_model.predict(my_messages_tfidf)


# 3. Print a clear comparison
print("="*50)
print("   FINAL TEST ON OUR NEWLY TRAINED MODELS")
print("="*50)

for i in range(len(my_tricky_messages)):
    message = my_tricky_messages[i]
    
    # Naive Bayes prediction
    pred_nb = "SPAM" if my_pred_nb[i] == 1 else "HAM"
    
    # Logistic Regression prediction
    pred_lr = "SPAM" if my_pred_lr[i] == 1 else "HAM"
    
    print(f"\nMessage: \"{message}\"")
    print(f"  -> Naive Bayes Prediction:   {pred_nb}")
    print(f"  -> Logistic Regression Prediction: {pred_lr}")

   FINAL TEST ON OUR NEWLY TRAINED MODELS

Message: "Hey, my money accidently transfeered on ur bank"
  -> Naive Bayes Prediction:   HAM
  -> Logistic Regression Prediction: SPAM

Message: "Myname is bank wala"
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: SPAM

Message: "Your password has expired. Click here to reset: [link]"
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: SPAM

Message: "hey what time is the meeting tomorrow?"
  -> Naive Bayes Prediction:   HAM
  -> Logistic Regression Prediction: HAM


In [125]:
# testing the model on our own messages

In [131]:
# e can edit my_message to insert any messages we want to test.
my_messages = [
    "Hey, I'm at the bank, will call you back in 10.", 
    "Your package #4582 is out for delivery. Track here: [link]", 
    "Can you pick up dinner tonight?", 
    "See you at 8",
    "You won! Click link.",
    "(Delivery Service): Your item is scheduled for delivery, but the shipping address is incomplete. Please confirm your details here to avoid a delay" 
]

# This part will remain  the same 

#  Transform the messages
my_messages_tfidf = vectorizer.transform(my_messages)

# 2. Get predictions from BOTH models
my_pred_nb = nb_model.predict(my_messages_tfidf)
my_pred_lr = lr_model.predict(my_messages_tfidf)


# 3.now we are printing  a clear comparison
print("="*50)
print("       MODEL TEST")
print("="*50)

for i in range(len(my_messages)):
    message = my_messages[i]
    
    # Naive Bayes prediction
    pred_nb = "SPAM" if my_pred_nb[i] == 1 else "HAM"
    
    # Logistic Regression prediction
    pred_lr = "SPAM" if my_pred_lr[i] == 1 else "HAM"
    
    print(f"\nMessage: \"{message}\"")
    print(f"  -> Naive Bayes Prediction:   {pred_nb}")
    print(f"  -> Logistic Regression Prediction: {pred_lr}")

       MODEL TEST

Message: "Hey, I'm at the bank, will call you back in 10."
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: HAM

Message: "Your package #4582 is out for delivery. Track here: [link]"
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: HAM

Message: "Can you pick up dinner tonight?"
  -> Naive Bayes Prediction:   HAM
  -> Logistic Regression Prediction: HAM

Message: "See you at 8"
  -> Naive Bayes Prediction:   HAM
  -> Logistic Regression Prediction: HAM

Message: "You won! Click link."
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: SPAM

Message: "(Delivery Service): Your item is scheduled for delivery, but the shipping address is incomplete. Please confirm your details here to avoid a delay"
  -> Naive Bayes Prediction:   SPAM
  -> Logistic Regression Prediction: SPAM
