In [None]:
#Install required libraries (for Colab environment)
!pip install -U "tensorflow-text==2.15.*"
!pip install -U "tf-models-official==2.15.*"



In [None]:
#Imports
import numpy as np
import pandas as pd
import kagglehub
import os
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras import layers,Model,metrics

In [None]:
# Dataset acquisition
# -------------------
# Fetch the spam / not-spam dataset through KaggleHub; `path` holds the
# location returned by the download call and is reused by later cells.
dataset_handle = "ozlerhakan/spam-or-not-spam-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/spam-or-not-spam-dataset


In [None]:
# List every file found under the downloaded dataset directory (recursively)
print("Downloaded to:", path)

for dirpath, _subdirs, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

Downloaded to: /kaggle/input/spam-or-not-spam-dataset
/kaggle/input/spam-or-not-spam-dataset/spam_or_not_spam.csv


In [None]:
# Load the dataset.
# Build the CSV location from the directory returned by kagglehub instead of a
# hard-coded /kaggle/input/... path, so the cell also works on Colab or locally.
csv_path = os.path.join(path, 'spam_or_not_spam.csv')
df = pd.read_csv(csv_path)
print('Shape of data:', df.shape)

Shape of data: (3000, 2)


In [None]:
# Peek at the first five rows of the raw frame
df.head(5)

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [None]:
# Analyze and balance the dataset
# --------------------------------------------------------------------
# Count how many rows carry each label to see whether the classes are imbalanced
label_counts = df['label'].value_counts()
print("number of spam and not spam email", label_counts)

number of spam and not spam email label
0    2500
1     500
Name: count, dtype: int64


In [None]:
# Downsample non-spam examples to balance the dataset.
# All spam rows (label == 1) are kept; an equally sized random subset of the
# non-spam rows (label == 0) is drawn. random_state pins the draw so the
# balanced set - and therefore every downstream metric - is the same on re-run.
df_spam = df[df.label == 1]
blnc_num = df_spam.shape[0]
df_not_spam = df[df.label == 0]
df_not_spam_blnced = df_not_spam.sample(blnc_num, random_state=42)
df_blnced = pd.concat([df_spam, df_not_spam_blnced])
print('The shape of the balanced dataset:', df_blnced.shape)

The shape of the balanced dataset: (1000, 2)


In [None]:
# Display some rows of the balanced dataset.
# A bare last expression uses the notebook's rich table display instead of
# the plain-text dump produced by print().
df_blnced.head()

                                                  email  label
2500   save up to NUMBER on life insurance why spend...      1
2501  NUMBER fight the risk of cancer URL NUMBER sli...      1
2502  NUMBER fight the risk of cancer URL NUMBER sli...      1
2503   adult club offers free membership instant acc...      1
2504  i thought you might like these NUMBER slim dow...      1


In [None]:
# Prepare data for training
# -----------------------------------------------------------------
# X holds the email text column, Y the spam / not-spam label column
X, Y = df_blnced['email'], df_blnced['label']

In [None]:
# Split into training and testing sets.
# test_size=0.25 matches the library default used previously; random_state makes
# the split reproducible and stratify=Y keeps the spam / not-spam ratio the
# same in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y)

# Ensure inputs are string type and labels are float type.
# NOTE(review): astype(str) turns a missing email into the literal text 'nan' and
# fillna(0) labels a missing target as not-spam - consider dropping such rows upstream.
X_train = pd.Series(X_train).astype(str).to_numpy()
X_test = pd.Series(X_test).astype(str).to_numpy()
Y_train = pd.Series(Y_train).fillna(0).astype(float).to_numpy()
Y_test = pd.Series(Y_test).fillna(0).astype(float).to_numpy()

print('shape of X_train:', X_train.shape)
print('shape of Y_train:', Y_train.shape)
print('shape of X_test:', X_test.shape)
print('shape of Y_test:', Y_test.shape)

shape of X_train: (750,)
shape of Y_train: (750,)
shape of X_test: (250,)
shape of Y_tesr: (250,)


In [None]:
# Load BERT preprocessing and encoder models
# ----------------------------------------------------------------------
# Model handles kept in one place so the versions in use are easy to spot
PREPROCESS_HANDLE = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
ENCODER_HANDLE = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"

# Preprocessing layer: tokenization, lowercasing, adding special tokens
bert_preprocessor = hub.KerasLayer(PREPROCESS_HANDLE)

# Encoder layer: turns the preprocessed inputs into embeddings
bert_encoder = hub.KerasLayer(ENCODER_HANDLE)

In [None]:
# Test BERT output (optional check)
# -------------------------------------------------------------------------
def get_sentence(sentence):
  """Embed a batch of texts with the BERT preprocessor + encoder.

  Args:
    sentence: the raw text batch handed to `bert_preprocessor`
      (the notebook passes a Python list of strings).

  Returns:
    The 'pooled_output' entry of the encoder's output dict.
  """
  encoder_ready = bert_preprocessor(sentence)
  encoded = bert_encoder(encoder_ready)
  return encoded['pooled_output']

In [None]:
# Embed three sample strings as a quick sanity check of the BERT pipeline
sample_texts = ['banana', 'apple', 'bill gates']
e = get_sentence(sample_texts)
print(e)

<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[-0.76069176, -0.1421939 ,  0.4960461 , ...,  0.4216533 ,
        -0.532214  ,  0.8031217 ],
       [-0.81964564, -0.29609606,  0.20951776, ...,  0.25593394,
        -0.5874299 ,  0.8434556 ],
       [-0.7854439 , -0.299497  ,  0.41027346, ...,  0.5222537 ,
        -0.4957358 ,  0.8150751 ]], dtype=float32)>

In [None]:
# Cosine similarity between the 'banana' (row 0) and 'apple' (row 1) embeddings
banana_vec, apple_vec = e[0], e[1]
cosine_similarity([banana_vec], [apple_vec])

array([[0.95718384]], dtype=float32)

In [None]:
# Build spam classification model
# -------------------------------------------------------------------------------
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across runs (as far as the
# hardware's op determinism allows).
tf.keras.utils.set_random_seed(42)

# Define model input: one raw string per example
text_input = layers.Input(shape=(), dtype=tf.string)

# Preprocess and encode text
encoder_inputs = bert_preprocessor(text_input)
outputs = bert_encoder(encoder_inputs)

# Classification head on top of the pooled BERT output:
# dropout for regularization, a ReLU hidden layer, then a sigmoid unit
# that emits the spam probability.
x = layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(1, activation='sigmoid', name='output')(x)

# Create the model
model = Model(inputs=[text_input], outputs=[x])

# Compile the model; precision and recall are tracked alongside accuracy
METRICS = [
    metrics.BinaryAccuracy(name='Accuracy'),
    metrics.Precision(name='precision'),
    metrics.Recall(name='recall')
]
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

# Display model summary
model.summary()

# Train the model. Keep the returned History so the per-epoch loss/metric
# curves can be inspected or plotted afterwards instead of being discarded.
history = model.fit(X_train, Y_train, epochs=15)

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 keras_layer_2 (KerasLayer)  {'input_type_ids': (None,    0         ['input_9[0][0]']             
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                            

<keras.src.callbacks.History at 0x7f3d1df4ea10>

In [None]:

# Evaluate the model on the held-out test set
# ------------------------------------------------------------------------------
# return_dict=True labels each value (loss, Accuracy, precision, recall)
# instead of returning an unlabeled list of floats.
model.evaluate(X_test, Y_test, return_dict=True)



[0.33429020643234253,
 0.8799999952316284,
 0.8311688303947449,
 0.9696969985961914]