In [None]:
# Cell 1: Import necessary libraries
import os
import re
import string
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_docs as tfdocs
from datetime import datetime as dt
import keras_tuner as kt

print(f"TensorFlow version:{tf.__version__}")

In [None]:
# Cell 1.5: Set up GPU
physical_devices = tf.config.list_physical_devices('GPU')

In [None]:
# Cell 1.6: Load TensorBoard And Kill old logs
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
os.system('rm -rf ./logs/')

clear_previous_training = True
if clear_previous_training:
    os.system('rm -rf AreYouMad')

In [None]:
# Cell 2: Define constants and load datasets
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'OrganizedDataToxic', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'OrganizedDataToxic', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'OrganizedDataToxic', 
    batch_size=batch_size)

In [None]:
# Cell 3: Define custom text standardization function
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [None]:
# Cell 4: Define text vectorization parameters and create vectorization layer
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
# Cell 5: Adapt the vectorization layer to the training data
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [None]:
# Cell 6: Define functions for vectorizing text data
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [None]:
# Cell 7: Configure dataset caching and prefetching
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
embedding_dim = 16

def model_builder(hp):

  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)

  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features + 1, embedding_dim),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(hp_units, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),])
  
  model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hp_learning_rate),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics=['accuracy'])
  
  return model

In [None]:
tuner = kt.RandomSearch(model_builder,
                     objective='val_accuracy',
                     max_trials=10,
                     directory='AreYouMad',
                     project_name='AreYouMad')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
log_dir = "logs/fit/" + dt.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Start the hyperparameter search
tuner.search(train_ds, epochs=50, validation_data=val_ds, callbacks=[stop_early, tensorboard_callback])

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(tuner.get_best_hyperparameters(num_trials=1)[0].values)

print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")

In [None]:
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(train_ds, validation_data=val_ds, epochs=50, callbacks=[stop_early, tensorboard_callback])

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
history = hypermodel.fit(train_ds, validation_data=val_ds, epochs=best_epoch, callbacks=[stop_early, tensorboard_callback])

In [None]:
# Cell 11: Plot training and validation loss
history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.style.use('ggplot')
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'c', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Cell 12: Plot training and validation accuracy
plt.plot(epochs, acc, 'c', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

In [None]:
# Cell 13: Define export model and compile it
export_model = tf.keras.Sequential([
  vectorize_layer,
  hypermodel,
  tf.keras.layers.Activation('sigmoid')
])

export_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=best_hps.get('learning_rate')),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                metrics=['accuracy'])

In [95]:
# Cell 14: Evaluate the export model on the test dataset
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

#This is wack

0.18331651389598846


In [None]:
def Is_Toxic(text: str, threshold: int):
    MessageList = [text]
    predictions = export_model.predict(tf.constant(MessageList))

    if predictions[0][0] >= threshold:
        return (True, predictions[0][0])
    else:
        return (False, predictions[0][0])

In [None]:
print(Is_Toxic("I love you :)", 0.7))

In [None]:
def plot_toxicity(predictions: list, examples: list):
    plot_examples = range(len(examples))
    toxicity_predictions = predictions.flatten()

    # Create a bar graph
    plt.bar(plot_examples, toxicity_predictions, color=['lightblue',])

    # Add labels and title
    plt.xlabel('Examples')
    plt.ylabel('Toxicity Probability')
    plt.title('Toxicity Probability of Examples')

    # Add x-axis labels for each example
    plt.xticks(plot_examples, examples, rotation=45, ha='right')

    # Display the graph
    plt.show()

In [None]:
# Cell 16: Save the export model
will_save = input("Save the model? (y/n) ")
if will_save == "y":
    export_model.save('UMsave.keras')
else:
    print("Model not saved.")