In [38]:
%load_ext autoreload
%autoreload 2

from NLP_Lib import *

import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import math
import numpy as np
import tensorflow_text as text
import tensorflow_hub as hub

from official.nlp import optimization # for creating custom optimizer

from googletrans import constants # for translating to other langs

from tensorflow.data import Dataset

# disable eager execution to use ELMo model
# tf.disable_eager_execution()

# improve/change plot appearance: set the label size for the axes and for both tick groups
for rc_group, rc_label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=rc_label_size)

# Read the tab-separated training data and make sure it is in the form we expect:
# the class, subclass and text columns must sit at positions 1, 2 and 3.
training_df = pd.read_table(PATH_TO_TRAINING_TSV)
expected_columns = {1: CLASS_COL, 2: SUBCLASS_COL, 3: TEXT_COL}
assert all(
    # the width check comes first so a too-narrow table fails the assert
    # instead of raising an IndexError on the column lookup
    training_df.shape[1] > position and column_name == training_df.columns[position]
    for position, column_name in expected_columns.items())

# shuffle the rows (no random_state is given, so the order differs between runs)
# and renumber the index from 0
training_df = training_df.sample(frac=1).reset_index(drop=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# clean the data

# Count the whitespace-separated words in every text entry and keep only the rows
# with more than one word, i.e. drop any single word text column instances.
words_per_text = training_df[TEXT_COL].str.split().str.len()
cleaned_training_df = training_df[words_per_text > 1]

In [None]:
'''
Used the following method to grow our dataset
increase_training_data_via_language_traslation(
    cleaned_training_df,
    0.001,
    CLASSES,
    PATH_TO_TRAINING_DATASET_STRUCT)
'''

In [None]:
'''
Use the method 
"make_df_into_ds_file_form"
in our Lib.py file to convert the training data .tsv into a folder/file structure like:
./resources/training/
___Bart/
_______bart_instance_id_r.txt
_______bart_instance_id_q.txt
_______ ...
___Homer/
________homer_instance_id_k.txt
________homer_instance_id_n.txt
________ ...
___...

Then use the path to the root directory of the folder/file structure you just made, so
for the above example, I let ./resources/training/ be "PATH_TO_TRAINING_DATASET_STRUCT"
below and use
tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size)
to read the data set into a form that Keras NN layers recognize. The instances are also
grouped into batches of the given "batch_size", which helps prevent overflowing RAM.
'''

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42
validation_split = 0.20

# The training and validation subsets are read from the same directory with the same
# seed and validation_split, so the two calls agree on how the files are divided.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=validation_split,
    subset='training',
    seed=seed)

# class_names must be read here: the datasets returned by cache()/shuffle()/prefetch()
# below are plain tf.data datasets and no longer carry this attribute.
class_names = raw_train_ds.class_names

# cache() replays elements in exactly the order in which they were first produced, so
# shuffle() has to come AFTER it. With shuffle() before cache() the first epoch's order
# would be frozen and repeated on every later epoch.
# Note that raw_train_ds is already batched, so shuffle(100) reorders whole batches.
train_ds = raw_train_ds.cache().shuffle(100).prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=validation_split,
    subset='validation',
    seed=seed)

# validation data is cached and prefetched but deliberately not shuffled
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Clean this data?

# Peek at the first three instances of one training batch to inspect the dataset
# structure made from the file folder struct.
for text_batch, label_batch in train_ds.take(1):
  batch_texts = text_batch.numpy()
  batch_labels = label_batch.numpy()
  for row in range(3):
    print(f'Text: {batch_texts[row]}')
    one_hot = batch_labels[row]
    # labels are loaded in "categorical" mode, so the position of the first
    # non-zero entry is the index into class_names
    print(f'Class : {one_hot} ({class_names[np.flatnonzero(one_hot)[0]]})\n')


# Collect every validation batch (texts and labels) into plain Python lists.
texts, classes = [], []
for val_text_batch, val_label_batch in val_ds:
  texts.append(val_text_batch)
  classes.append(val_label_batch)



In [None]:
# Define the training hyper params, the optimizer, the model and the tf.data options
epochs = 65
# train_ds is batched, so its cardinality is the number of batches per epoch
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# 30% of all training steps are handed to the optimizer as warm-up steps
num_warmup_steps = int(0.3*num_train_steps)
init_lr = 5e-5
optimizer_type = "adamw"
# NOTE(review): official.nlp's create_optimizer appears to accept only 'adamw' or
# 'lamb' as optimizer_type (not the generic Keras optimizer names) - confirm against
# the installed tf-models-official version before changing optimizer_type.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type=optimizer_type)

model = build_classifier_model(SMALL_BERT[0], SMALL_BERT[1], optimizer)

# dataset options using the DATA auto-shard policy; they are attached to the
# datasets with with_options() when model.fit is called
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

# summary() prints the table itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
model.summary()

In [None]:
# set up a unique folder for this model and set up check points
# NOTE(review): the "/" separators in the format string turn the timestamp into a
# nested month/day/time path, and ":" is not allowed in Windows directory names -
# confirm this layout is intended before running on another platform.
checkpoint_dir_x = os.path.join(
    PATH_TO_MODELS_DIRECTORY,
    datetime.now().strftime("%m/%d/%H:%M:%S"))
pathlib.Path(checkpoint_dir_x).mkdir(parents=True, exist_ok=True)

# save only the model weights, one file per epoch ({epoch} is filled in by Keras)
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    os.path.join(checkpoint_dir_x, "save_at_{epoch}.h5"),
    save_weights_only=True)
# stop once the validation loss has not improved for 5 consecutive epochs
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5)
call_backs = [checkpoint_callback, early_stopping_callback]

# Train!
# fit() ignores its `shuffle` argument when x is a tf.data.Dataset, so it is not
# passed here; any shuffling has to happen in the dataset pipeline itself.
history = model.fit(
    x=train_ds.with_options(options),
    validation_data=val_ds.with_options(options),
    verbose=1, # 0 is silent, 1 for loading bar, 2 for stats each epoch
    epochs=epochs,
    callbacks=call_backs)

In [None]:
# Store the results of this run next to its checkpoints in checkpoint_dir_x:
#   - the model summary,
#   - the hyper params, epoch count, optimizer type, etc. so they can be reloaded later,
#   - plots of the model's training and validation accuracy and loss over the epochs,
#   - a confusion matrix computed from the validation data.
print_model_summary_to_file(model, checkpoint_dir_x)

# the helper takes these values positionally, in exactly this order,
# followed by the output directory
training_params = (
    validation_split,
    epochs,
    steps_per_epoch,
    num_train_steps,
    num_warmup_steps,
    init_lr,
    optimizer_type,
)
save_training_params(*training_params, checkpoint_dir_x)

# accuracy first, then loss - same call order as before
for plot_history in (plot_accuracy, plot_loss):
    plot_history(plt, history, checkpoint_dir_x)
confusion_matrix(plt, val_ds, model, checkpoint_dir_x)