In [9]:
%load_ext autoreload
%autoreload 2

from NLP_Lib import *

import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import math
import numpy as np
from numpy import dtype
import tensorflow_text as text
import tensorflow_hub as hub

from official.nlp import optimization # for creating custom optimizer

from googletrans import constants # for translating to other langs

from tensorflow.data import Dataset

# disable eager execution to use ELMo model
# tf.disable_eager_execution()

# improve/change plot appearance
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# load in the training data, make sure it is the form we expect
training_df = pd.read_table(PATH_TO_TRAINING_TSV)
assert (training_df.shape[1] > 1 and CLASS_COL == training_df.columns[1]
    and training_df.shape[1] > 2 and SUBCLASS_COL == training_df.columns[2]
    and training_df.shape[1] > 3 and TEXT_COL == training_df.columns[3])

# shuffle
training_df = training_df.sample(frac=1).reset_index(drop=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
# clean the data

# remove any single word text column instances
cleaned_training_df = training_df[training_df[TEXT_COL].str.split().str.len() > 1]

In [None]:
'''
Used the following method to grow our dataset
increase_training_data_via_language_traslation(
    cleaned_training_df,
    0.001,
    CLASSES,
    PATH_TO_TRAINING_DATASET_STRUCT)
'''

In [10]:
'''
Use the method 
"make_df_into_ds_file_form"
in our Lib.py file to convert the training data .tsv into a folder/file structure like:
./resources/training/
___Bart/
_______bart_instance_id_r.txt
_______bart_instance_id_q.txt
_______ ...
___Homer/
________homer_instance_id_k.txt
________homer_instance_id_n.txt
________ ...
___...

Then use the path to the root directory of the folder/file structure you just made, so
for the above example, I let ./resources/training/ be "PATH_TO_TRAINING_DATASET_STRUCT"
below and use
tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size)
to read the data set into a special form that is recognized by keras NN layers, as well
as the instances are in the given "batch_size", which helps prevent overflowing RAM.
'''

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 104874 files belonging to 5 classes.
Using 83900 files for training.
Found 104874 files belonging to 5 classes.
Using 20974 files for validation.


In [30]:
# Clean this data?

# Can inspect the dataset structure made from the file folder struct
for text_batch, label_batch in train_ds.take(1):
  for i in range(3):
    print(f'Text: {text_batch.numpy()[i]}')
    label = label_batch.numpy()[i]
    print(f'Class : {label} ({class_names[np.nonzero(label)[0][0]]})\n')

Text: b"Don't worry. If I croak, you'll marry Lenny. Or Moe -- the winner will be determined by a card game I invented. I got all the rules written down... up here."
Class : [0. 1. 0. 0. 0.] (Homer Simpson)

Text: b'Now what have you done, Simpson?'
Class : [0. 0. 0. 0. 1.] (Other)

Text: b'Anything. You name it. What do you want to do?'
Class : [0. 1. 0. 0. 0.] (Homer Simpson)



2022-10-16 09:56:31.267567: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [None]:
epochs = 65
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5
optimizer_type = "adamw"
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type=optimizer_type)  # SGD, RMSprop, Adam, Adadelta, Adagrad, Adamax Nadam, Ftrl

model = build_classifier_model(SMALL_BERT[0], SMALL_BERT[1])

model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=tf.metrics.CategoricalAccuracy())

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

checkpoint_dir_x = os.path.join(
    CHECKPOINT_DIR,
    datetime.now().strftime("%m/%d/%H:%M:%S"))
pathlib.Path(checkpoint_dir_x).mkdir(parents=True, exist_ok=True)
call_backs = [
    keras.callbacks.ModelCheckpoint(
        os.path.join(checkpoint_dir_x, "save_at_{epoch}.h5"),
        save_weights_only=True),
]

model.summary()

In [None]:
history = model.fit(
    x=train_ds.with_options(options),
    validation_data=val_ds.with_options(options),
    verbose=1, # 0 is silent, 1 for loading bar, 2 for stats each epoch
    epochs=epochs,
    callbacks=call_backs)

save_training_params(
    epochs,
    steps_per_epoch,
    num_train_steps,
    num_warmup_steps,
    init_lr,
    optimizer_type,
    checkpoint_dir_x)
plot_accuracy(plt, history, checkpoint_dir_x)
plot_loss(plt, history, checkpoint_dir_x)
confusion_matrix(plt, val_ds, model, checkpoint_dir_x)