In [1]:
!pip install -U "tensorflow-text==2.13.*"



In [2]:
!pip install "tf-models-official==2.13.*"



In [3]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from sklearn.model_selection import train_test_split
from official.nlp import optimization

In [4]:
file_path = 'phishing_site_urls.csv'

# Load the dataset
data = pd.read_csv(file_path, on_bad_lines='skip')

# Display the first few rows of the dataset
print("Original Dataset Head:")
print(data.head())

# Display the shape of the original dataset
print("\nOriginal Dataset Shape:", data.shape)

# Basic statistics and check for missing values in the original dataset
print("\nOriginal Dataset Basic Statistics:")
print(data.describe(include='all'))
print("\nOriginal Dataset Missing Values:", data.isnull().sum())

# Subsample the data while maintaining the class distribution
subsample_data = data.groupby('Label').sample(n=15000, random_state=42)

# Convert labels from 'good'/'bad' to binary in the subsample
subsample_data['Label'] = subsample_data['Label'].map({'good': 0, 'bad': 1})

# Display class balance in the subsampled dataset
class_balance = subsample_data['Label'].value_counts(normalize=True) * 100
print("\nClass Balance in Subsampled Dataset (%):")
print(class_balance)

# Split the subsample into training, validation, and test sets
train_data_sub, test_data_sub = train_test_split(subsample_data, test_size=0.2, random_state=42)
train_data_sub, val_data_sub = train_test_split(train_data_sub, test_size=0.2, random_state=42)

# Display the first few rows of the subsampled dataset
print("\nSubsampled Dataset Head:")
print(subsample_data.head())

# Display the shape of the subsampled dataset
print("\nSubsampled Dataset Shape:", subsample_data.shape)

# Basic statistics and check for missing values in the subsampled dataset
print("\nSubsampled Dataset Basic Statistics:")
print(subsample_data.describe(include='all'))
print("\nSubsampled Dataset Missing Values:", subsample_data.isnull().sum())

# Display the sizes of the training, validation, and test sets
print("\nTraining Set Size:", len(train_data_sub))
print("Validation Set Size:", len(val_data_sub))
print("Test Set Size:", len(test_data_sub))

Original Dataset Head:
                                                 URL Label
0  nobell.it/70ffb52d079109dca5664cce6f317373782/...   bad
1  www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...   bad
2  serviciosbys.com/paypal.cgi.bin.get-into.herf....   bad
3  mail.printakid.com/www.online.americanexpress....   bad
4  thewhiskeydregs.com/wp-content/themes/widescre...   bad

Original Dataset Shape: (549346, 2)

Original Dataset Basic Statistics:
                                  URL   Label
count                          549346  549346
unique                         507195       2
top     jhomitevd2abj3fk.tor2web.org/    good
freq                               52  392924

Original Dataset Missing Values: URL      0
Label    0
dtype: int64

Class Balance in Subsampled Dataset (%):
1    50.0
0    50.0
Name: Label, dtype: float64

Subsampled Dataset Head:
                                                      URL  Label
96574   tube8vidsjhn.ddns.name/2013/girl-fucked-by-dog...      1
52579

In [5]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('Label')
    ds = tf.data.Dataset.from_tensor_slices((dataframe['URL'], labels.astype('int32')))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Convert the subsampled DataFrame into TensorFlow datasets
batch_size = 32  # Adjust the batch size as needed
train_ds = df_to_dataset(train_data_sub, batch_size=batch_size)
val_ds = df_to_dataset(val_data_sub, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test_data_sub, shuffle=False, batch_size=batch_size)

In [6]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [7]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)
    return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model()

In [8]:
# Loss function, optimizer, and metrics
loss = tf.keras.losses.BinaryCrossentropy()
metrics = [tf.metrics.BinaryAccuracy(name='accuracy'),
           tf.metrics.Precision(name='precision'),
           tf.metrics.Recall(name='recall')]

epochs = 3
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

# Train the model
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Evaluate the model
loss, accuracy, precision, recall = classifier_model.evaluate(test_ds)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

Loss: 0.1416020542383194
Accuracy: 0.9578333497047424
Precision: 0.9675476551055908
Recall: 0.9488189220428467


In [10]:
# Save the model
saved_model_path = './saved_model/my_bert_model'
classifier_model.save(saved_model_path, include_optimizer=False)

In [11]:
model_save_path = '/content/drive/My Drive/Colab Notebooks/my_sigmoid_bert_model'
classifier_model.save(model_save_path, include_optimizer=False)
