In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import wordnet as wn
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import load_model






# Loading the dataset and initializing the tokenizer

In [2]:
# Read the tab-separated SMS corpus with explicit column names and encode the
# string labels as integers in a single chain: ham -> 0, spam -> 1
df = (
    pd.read_csv('Spam SMS Collection.txt', sep='\t', names=['label', 'message'])
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Tokenizer for the 'bert-base-uncased' checkpoint, used to encode messages
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Using WordNet to compute a semantic-similarity score for classification

In [3]:
def calculate_semantic_similarity(text, reference_words):
    """Score how closely the words of ``text`` relate to ``reference_words``.

    Each whitespace-separated token of the lower-cased text is looked up in
    WordNet and only its first synset is used. For every reference word, the
    highest path similarity between that synset and any synset of the
    reference word is added to a running total. The total is divided by the
    number of tokens in ``text``.

    Args:
        text: Message to score.
        reference_words: Iterable of words to compare each token against.

    Returns:
        float: Accumulated similarity divided by the token count, or ``0.0``
        when ``text`` contains no tokens.
    """
    tokens = text.lower().split()
    if not tokens:
        # Empty / whitespace-only message: avoid dividing by zero
        return 0.0

    # Look up the reference synsets once and drop words WordNet does not know
    ref_synsets = [
        synsets
        for synsets in (wn.synsets(word) for word in reference_words)
        if synsets
    ]

    score = 0.0
    for token in tokens:
        token_synsets = wn.synsets(token)
        if not token_synsets:
            continue
        primary = token_synsets[0]
        for ref in ref_synsets:
            similarities = (primary.path_similarity(ref_word) for ref_word in ref)
            # path_similarity returns None when no path exists; default=0.0
            # keeps max() from raising when every candidate is None
            score += max(
                (sim for sim in similarities if sim is not None),
                default=0.0,
            )
    return score / len(tokens)

In [4]:
# Preprocessing and preparing BERT inputs
def prepare_texts(texts, max_length=128):
    """Encode messages for BERT and compute their spam-word similarity score.

    Every message is tokenized with the notebook-level ``tokenizer``, padded
    or truncated to exactly ``max_length`` tokens, and scored with
    ``calculate_semantic_similarity`` against a fixed list of spam words.

    Args:
        texts: Iterable of message strings.
        max_length: Token length every sequence is padded/truncated to.
            The default of 128 matches the model's input layers; change it
            only together with the model definition.

    Returns:
        tuple: ``(input_ids, attention_masks, semantic_scores)`` as NumPy
        arrays with one row per message; ``semantic_scores`` has one column.
    """
    spam_words = ['free', 'win', 'winner', 'urgent', 'alert', 'claim', 'prize', 'congratulations', 'guaranteed', 'offer']

    input_ids = []
    attention_masks = []
    semantic_scores = []

    for text in texts:
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            # `pad_to_max_length` is deprecated; `padding='max_length'` is the
            # supported spelling, and truncation is requested explicitly
            # instead of relying on the tokenizer's warning fallback
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf',
        )

        # The tokenizer returns a batch of one; keep only the single row
        input_ids.append(inputs['input_ids'][0])
        attention_masks.append(inputs['attention_mask'][0])
        semantic_scores.append([calculate_semantic_similarity(text, spam_words)])

    return np.array(input_ids), np.array(attention_masks), np.array(semantic_scores)


In [5]:
# Encode every message and pull out the integer labels
X_ids, X_masks, X_semantics = prepare_texts(df['message'])
y = df['label'].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible and all four arrays are shuffled together
(
    X_train_ids, X_test_ids,
    X_train_masks, X_test_masks,
    X_train_semantics, X_test_semantics,
    y_train, y_test,
) = train_test_split(
    X_ids, X_masks, X_semantics, y,
    test_size=0.2,
    random_state=42,
)

# Pretrained BERT encoder, with every layer frozen so its weights are not
# updated during training
bert = TFBertModel.from_pretrained('bert-base-uncased')
for layer in bert.layers:
    layer.trainable = False

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [6]:
# Build the model graph: two BERT inputs plus the scalar semantic score.
# The input names below are the dictionary keys that predict_spam uses when
# it calls model.predict, so they must stay in sync.
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="input_ids")
input_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask")
semantic_input = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name="semantic_score")

# Element [1] of the BERT output is used as the sentence representation
# (presumably the pooled output -- confirm against the transformers docs).
embeddings = bert(input_ids, attention_mask=input_mask)[1]
# Append the semantic score as one extra feature next to the BERT features.
concat = tf.keras.layers.concatenate([embeddings, semantic_input])

# Classification head: one hidden layer with dropout, then a sigmoid unit.
x = tf.keras.layers.Dense(1024, activation='relu')(concat)
x = tf.keras.layers.Dropout(0.1)(x)
# NOTE(review): `y` is also the name of the label array created when the data
# was split. y_train / y_test already exist at this point, so training is not
# affected, but re-running the split cell after this cell would pass this
# tensor to train_test_split. Consider renaming it here and where the model
# is constructed.
y = tf.keras.layers.Dense(1, activation='sigmoid')(x)

In [7]:
# Wire the three inputs to the sigmoid output and configure training for
# binary classification
model = tf.keras.Model(
    inputs=[input_ids, input_mask, semantic_input],
    outputs=y,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)




In [17]:
# Train for 10 epochs, holding out 20% of the training split for validation
model.fit(
    x=[X_train_ids, X_train_masks, X_train_semantics],
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# Report loss and accuracy on the held-out test split (kept as the last
# expression so the notebook displays the result)
model.evaluate(x=[X_test_ids, X_test_masks, X_test_semantics], y=y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.06259728968143463, 0.9829596281051636]

# Function for predicting whether a message is spam or ham

In [18]:
def predict_spam(text):
    """Classify a single message as spam or ham with the trained model.

    The message is tokenized exactly as in training (padded/truncated to 128
    tokens), its spam-word similarity score is computed, and both are fed to
    the notebook-level ``model``.

    Args:
        text: Message to classify.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is ``"Spam"`` when the
        predicted probability is at least 0.5 and ``"Ham"`` otherwise, and
        ``probability`` is the model's sigmoid output as a Python float.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        # `pad_to_max_length` is deprecated; request padding and truncation
        # explicitly so long messages are cut to the model's input length
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf',
    )

    spam_words = ['free', 'win', 'winner', 'urgent', 'alert', 'claim', 'prize', 'congratulations', 'guaranteed', 'offer']
    semantic_score = calculate_semantic_similarity(text, spam_words)

    # Keys must match the names of the model's Input layers
    input_dict = {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        # Shape (1, 1): a batch of one message with one semantic feature
        'semantic_score': np.array([[semantic_score]], dtype=np.float32),
    }

    # predict returns one row per message; take the scalar from the only row
    prediction = model.predict(input_dict)[0]
    probability = float(prediction[0])

    label = "Spam" if probability >= 0.5 else "Ham"
    return label, probability

# Example usage: lower-case the message, classify it, and report the result
text = "you've won £1000 cash! To get your money, text ‘CLAIM’ to 81010 now! Cost £3.00 per msg.".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")


The message is classified as Spam with a probability of 0.9982


In [19]:
# Prize-style message
text = "Congratulations! You've been selected to win a free iPhone! Click here to claim your prize now!".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Spam with a probability of 0.9401


In [20]:
# Everyday conversational message
text = "Hey, are we still on for dinner tonight? Looking forward to catching up!".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Ham with a probability of 0.0141


In [21]:
# Urgent jackpot-style message
text = "URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot!".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Spam with a probability of 0.9895


In [22]:
# Request for directions
text = "Can you please send me the directions to the park? I forgot to save them last time.".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Ham with a probability of 0.0047


In [23]:
# Short follow-up message
text = "Just finished the meeting, I'll call you in 5 minutes.".lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Ham with a probability of 0.0251


In [24]:
# Save the model
# Create the output directory first so saving does not fail on a fresh
# checkout where 'models/' does not exist yet
os.makedirs('models', exist_ok=True)
model.save('models/NLP_SPAM_model.h5')  # full model, saved to an HDF5 file
model.save_weights('models/NLP_SPAM_weights.h5')  # weights only
print("Model saved successfully.")

  saving_api.save_model(


Model saved successfully.


In [25]:
# Long promotional message (several paragraphs)
text = '''Hey there! Are you tired of the same old routine? Looking for something new and exciting to spice up your life? Well, look no further! Introducing the amazing, the incredible, the life-changing product that will revolutionize the way you live! Say goodbye to boredom and hello to endless possibilities with our revolutionary solution.

But wait, there's more! If you act now, you'll receive a special discount that's too good to pass up. This offer won't last long, so don't miss out on the opportunity to transform your life for the better. Whether you're at home, at work, or on the go, our product will enhance every aspect of your daily life.

Thousands of satisfied customers can't be wrong! Join the ranks of the happy and fulfilled individuals who have already experienced the incredible benefits of our product. Don't hesitate, take the leap and unlock a world of excitement and joy!

So what are you waiting for? Take the first step towards a brighter future and seize this amazing opportunity today. Remember, this is a limited-time offer, so act now and start living your best life!'''.lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Spam with a probability of 0.5737


In [26]:
# Long workplace message (several paragraphs)
text = '''Hey there! Just wanted to remind you about the upcoming team meeting on Monday at 10 am. We'll be discussing the latest project updates and setting goals for the next phase. Your input will be valuable, so make sure to come prepared with any insights or suggestions you might have. Looking forward to a productive session!


In other news, the company picnic is just around the corner, and we're all excited to spend a fun day outdoors. Don't forget to sign up for the potluck and let us know what dish you'll be bringing. It's always a great opportunity to bond with colleagues outside of work.


On a personal note, I wanted to share that I recently read a fascinating book that I think you'd enjoy. It's a gripping mystery novel with an unexpected twist at the end. Let me know if you'd like to borrow it sometime.


Lastly, I hope you have a fantastic weekend ahead! Take some time to relax and recharge for the week ahead. If there's anything you need assistance with, feel free to reach out. Have a great day'''.lower()
classification, probability = predict_spam(text)
print(f"The message is classified as {classification} with a probability of {probability:.4f}")

The message is classified as Ham with a probability of 0.1773
