## Spam detection NLP binary classification

dataset source:  
https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import random




In [2]:
df_path = './big_datasets/spam_emails/combined_data.csv'

In [3]:
spam_df =  pd.read_csv(df_path)

In [4]:
spam_df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [5]:
spam_df_shuffled = spam_df.sample(frac=1)

In [8]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    spam_df_shuffled['text'].to_numpy(),
    spam_df['label'].to_numpy(),
    test_size=0.1)

In [9]:
len(train_sentences), len(val_sentences)

(75103, 8345)

In [10]:
# Find average number of tokens (words) in training Tweets
avg_words = round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

In [11]:
max_vocab_len = 20000
max_sentence_len = avg_words

In [13]:
from tensorflow.keras.layers import TextVectorization

In [14]:
# here we're initializing a tokenizer with a set output sequence lenght (to help with batching)
# plus we're setting max_tokens to limit the tokens considered to the 10000 most common ones
text_vectorizer = TextVectorization(max_tokens=max_vocab_len,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=max_sentence_len
                                   )




In [15]:
text_vectorizer.adapt(train_sentences)




#### Pretrained embeddings

In [16]:
# Example of pretrained embedding with universal sentence encoder - https://tfhub.dev/google/universal-sentence-encoder/4
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # load Universal Sentence Encoder
















In [17]:
inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
x = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4', 
                    trainable=False)(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_use = tf.keras.models.Model(inputs, outputs, name="model_6_USE")

In [18]:
# Compile model
model_use.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_use.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 64)                32832     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256830721 (979.73 MB)
Trainable params: 32897 (128.50 KB)
Non-trainable params: 256797824 (979.61 MB)
_________________________________________________________________


In [19]:
# Train a classifier on top of pretrained embeddings
model_use_history = model_use.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [23]:
# Make predictions with USE TF Hub model
model_use_pred_probs = model_use.predict(val_sentences)
# Convert prediction probabilities to labels
model_use_preds = tf.squeeze(tf.round(model_use_pred_probs))
# Calculate model use performance metrics
model_use_results = calculate_results(val_labels, model_use_preds)
model_use_results



{'accuracy': 51.036548831635706,
 'precision': 0.49947542512182286,
 'recall': 0.5103654883163571,
 'f1': 0.48903435318569755}