In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dropout, Bidirectional, Dense, Embedding
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Only needed when running on Google Colab

# from google.colab import drive
# drive.mount('/content/drive')
# %cd drive/MyDrive/develop/safe_chat

In [None]:
# Identifier of this training run. It is embedded in the checkpoint and
# loss-graph file names and written to the "model" column of the stats CSV.
MODEL_ITERATION: int = 8

## Prepare Training Data

In [None]:
# Read the training table from data/train.csv and preview its first rows.
train_csv_path = os.path.join("data", "train.csv")
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Text vectorizer: vocabulary capped at 200,000 tokens, output sequences of
# length 1800.
vectorizer = TextVectorization(
    max_tokens=200000,
    output_sequence_length=1800,
)

# Build the vocabulary from the raw comment strings.
vectorizer.adapt(train_df["comment_text"].values)

In [None]:
# Show the first 15 entries of the learned vocabulary.
vocabulary = vectorizer.get_vocabulary()
vocabulary[:15]

In [None]:
# Features: the vectorizer applied to every comment string.
comment_texts = train_df["comment_text"].values
X = vectorizer(comment_texts)

# Targets: every column from the third one onward
# (presumably the label columns -- verify against the CSV schema).
y = train_df.iloc[:, 2:]

In [None]:
# Input pipeline: slice (X, y) into examples, cache, shuffle, batch by 32 and
# prefetch.
#
# reshuffle_each_iteration=False is essential: the train/validate/test subsets
# are carved out of this dataset with take()/skip(), so the element order must
# be the same on every pass. With the default (True) the order changes each
# time the dataset is iterated, examples migrate between the subsets, and
# validation/test data leaks into training.
# Trade-off: training batches are no longer re-shuffled between epochs.
dataset = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .cache()
    .shuffle(10000, reshuffle_each_iteration=False)
    .batch(32)
    .prefetch(16)
)

In [None]:
# Split the batched dataset 70% / 20% / remainder into train / validate / test.
#
# Sizes are computed once and each split starts exactly where the previous one
# ends. Truncating every boundary independently (int(n * 0.7), int(n * 0.9), ...)
# can leave batches that belong to no split, and a trailing take() silently
# drops the remainder batches.
n_batches = len(dataset)
n_train = int(n_batches * 0.7)
n_validate = int(n_batches * 0.2)

train = dataset.take(n_train)
validate = dataset.skip(n_train).take(n_validate)
# Everything left over (roughly 10%) is held out for testing.
test = dataset.skip(n_train + n_validate)

## Create Model

In [None]:
# Architecture: embedding -> bidirectional GRU with an L2-regularised kernel
# -> dropout -> dense layer with 6 sigmoid outputs.
model = Sequential(
    [
        Embedding(200000 + 1, 128),
        Bidirectional(GRU(128, kernel_regularizer=l2())),
        Dropout(0.5),
        Dense(6, activation="sigmoid"),
    ]
)

In [None]:
# Compile with the Adam optimizer and binary cross-entropy loss, then print
# the layer summary.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
)
model.summary()

## Train Model

In [None]:
# Stop once validation loss has not improved for 2 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

# Keep only the best model so far, saved under checkpoints/ and tagged with
# the current model iteration.
checkpoint_path = os.path.join("checkpoints", f"checkpoint_{MODEL_ITERATION}.keras")
checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True)

In [None]:
# Train for at most 12 epochs, validating after each one; the callbacks handle
# early stopping and checkpointing.
training_callbacks = [early_stopping, checkpoint]
history = model.fit(
    train,
    epochs=12,
    validation_data=validate,
    callbacks=training_callbacks,
)

## Evaluate Model

In [None]:
# Plot training vs. validation loss per epoch and save the figure under graphs/.
graphs_dir = "graphs"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(graphs_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(history.history["loss"], label="Training Loss")
ax.plot(history.history["val_loss"], label="Validation Loss")
ax.legend()
ax.set_title(f"Epoch Loss - Model {MODEL_ITERATION}")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
fig.savefig(os.path.join(graphs_dir, f"epoch_loss_{MODEL_ITERATION}.png"))

In [None]:
# Streaming metrics accumulated over the test batches.
precision = Precision()
recall = Recall()
# The model emits independent sigmoid probabilities per label and the
# evaluation feeds flattened 0/1 targets, so accuracy must be computed per
# element after thresholding. CategoricalAccuracy compares argmax positions
# along the last axis, which is meaningless for multi-label output.
accuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision / recall / accuracy over every test batch.
for X_test, y_test in test.as_numpy_iterator():
    # predict_on_batch runs a single forward pass on one batch; calling
    # model.predict() inside a loop adds per-call overhead and prints a
    # progress bar for every batch.
    y_hat = model.predict_on_batch(X_test)

    # Flatten so that every (sample, label) pair is scored as one binary
    # decision.
    y_test = y_test.flatten()
    y_hat = y_hat.flatten()

    precision.update_state(y_test, y_hat)
    recall.update_state(y_test, y_hat)
    accuracy.update_state(y_test, y_hat)

In [None]:
# Append this run's metrics as a single header-less row to model_stats.csv.
stats_row = {
    "model": MODEL_ITERATION,
    "precision": [precision.result().numpy()],
    "recall": [recall.result().numpy()],
    "accuracy": [accuracy.result().numpy()],
}
stats_df = pd.DataFrame(stats_row)
stats_df.to_csv("model_stats.csv", mode="a", index=False, header=False)