## Data Preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

file_path = 'SMSSpamCollection'
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])

# Convert labels to binary values: spam as 1 and ham as 0
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Split the dataset into training, validation, and test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.2, random_state=42)

In [2]:
# Display the first few rows of the dataset
print(df.head())

# Display the shape of the dataset
print("Dataset Shape:", df.shape)

# Basic statistics and check for missing values
print(df.describe())
print("\nMissing Values:", df.isnull().sum())

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
Dataset Shape: (5572, 2)
             label
count  5572.000000
mean      0.134063
std       0.340751
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000

Missing Values: label      0
message    0
dtype: int64


In [3]:
pip install tensorflow_text



## Install Libraries

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

## Load and Preprocess Data

In [5]:
import tensorflow as tf

# Function to convert DataFrame to TensorFlow dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('label')
    ds = tf.data.Dataset.from_tensor_slices((dataframe['message'].values, labels.values))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Convert the split data to TensorFlow datasets
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

## Bert Model Selection

In [6]:
import tensorflow_hub as hub

# Selecting BERT model from TensorFlow Hub
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'

tfhub_handle_encoder = f'https://tfhub.dev/tensorflow/{bert_model_name}/1'
tfhub_handle_preprocess = f'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

## Preprocessing Text for BERT

In [7]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

## Build the Classification Model

In [8]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model()

## Model Training

In [9]:
# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

# Loss function and metrics
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

# Compile the model
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

# Train the model
history = classifier_model.fit(train_ds,
                               validation_data=val_ds,
                               epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model Evaluation and Testing

In [10]:
# Evaluate the model
loss, accuracy = classifier_model.evaluate(test_ds)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.04401790723204613
Accuracy: 0.9919282793998718


In [None]:
pip install scikit-learn

In [12]:
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

# Make predictions on the test dataset
y_pred = classifier_model.predict(test_ds)
y_pred_labels = tf.where(y_pred > 0, 1, 0).numpy()  # Convert logits to binary labels

# Extract true labels from test_ds
y_true = np.concatenate([y for x, y in test_ds], axis=0)

# Calculate F1-Score
f1_report = classification_report(y_true, y_pred_labels, target_names=['Ham', 'Spam'])
print("F1 Score and Classification Report:")
print(f1_report)

# Calculate AUC
y_pred_probs = tf.sigmoid(y_pred).numpy()  # Convert logits to probabilities
auc_score = roc_auc_score(y_true, y_pred_probs)
print("Area Under the ROC Curve (AUC):", auc_score)


F1 Score and Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      1.00       966
        Spam       0.97      0.97      0.97       149

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Area Under the ROC Curve (AUC): 0.997936554254033


## Model Saving and Inference

In [None]:
saved_model_path = './spam_classifier_bert'
classifier_model.save(saved_model_path)

In [None]:
import tensorflow as tf
print("Current TensorFlow version:", tf.__version__)