In [84]:
%load_ext autoreload
%autoreload 2

from NLP_Lib import *

import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import math
import numpy as np
from numpy import dtype
import tensorflow_text as text
import tensorflow_hub as hub

from googletrans import constants

from tensorflow.data import Dataset

# disable eager execution to use ELMo model
# tf.disable_eager_execution()

# improve/change plot appearance
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# load in the training data, make sure it is the form we expect
training_df = pd.read_table(PATH_TO_TRAINING_TSV)
assert (training_df.shape[1] > 1 and CLASS_COL == training_df.columns[1]
    and training_df.shape[1] > 2 and SUBCLASS_COL == training_df.columns[2]
    and training_df.shape[1] > 3 and TEXT_COL == training_df.columns[3])

# shuffle
training_df = training_df.sample(frac=1).reset_index(drop=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
# clean the data

# remove any single word text column instances
cleaned_training_df = training_df[training_df[TEXT_COL].str.split().str.len() > 1]

In [88]:
for key in constants.LANGUAGES:
    try:
        # dont translate english to english then back to enlgish
        if lang_i is not "en":
            lang_i = constants.LANGUAGES[key]
            print(lang_i)
            translated_df = translate_df_to_and_back(key, cleaned_training_df, 0.01, CLASSES)
            make_df_into_ds_file_form(translated_df, PATH_TO_TRAINING_DATASET_STRUCT, CLASSES, f"{lang_i}_")
    except Exception:
        print("Error on language: ", lang_i)

afrikaans
albanian
amharic
arabic
armenian
azerbaijani
basque
belarusian
bengali
bosnian
bulgarian
catalan
cebuano
chichewa
chinese (simplified)
chinese (traditional)
corsican
croatian
czech
danish
dutch
english
esperanto
estonian
filipino
finnish
french
frisian
galician
georgian
german
greek
gujarati
haitian creole
hausa
hawaiian
hebrew
hebrew
hindi
hmong
hungarian
icelandic
igbo
indonesian
irish
italian
japanese
javanese
kannada
kazakh
khmer
korean
kurdish (kurmanji)
kyrgyz
lao
latin
latvian
lithuanian
luxembourgish
macedonian
malagasy
malay
malayalam
maltese
maori
marathi
mongolian
myanmar (burmese)
nepali
norwegian
odia
pashto
persian
polish
portuguese
punjabi
romanian
russian
samoan
scots gaelic
serbian
sesotho
shona
sindhi
sinhala
slovak
slovenian
somali
spanish
sundanese
swahili
swedish
tajik
tamil
telugu
thai
turkish
ukrainian
urdu
uyghur
uzbek
vietnamese
welsh
xhosa
yiddish
yoruba
zulu


In [None]:
'''
Use the method 
"make_df_into_ds_file_form"
in our Lib.py file to convert the training data .tsv into a folder/file structure like:
./resources/training/
___Bart/
_______bart_instance_id_r.txt
_______bart_instance_id_q.txt
_______ ...
___Homer/
________homer_instance_id_k.txt
________homer_instance_id_n.txt
________ ...
___...

Then use the path to the root directory of the folder/file structure you just made, so
for the above example, I let ./resources/training/ be "PATH_TO_TRAINING_DATASET_STRUCT"
below and use
tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size)
to read the data set into a special form that is recognized by keras NN layers, as well
as the instances are in the given "batch_size", which helps prevent overflowing RAM.
'''

AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.utils.text_dataset_from_directory(
    PATH_TO_TRAINING_DATASET_STRUCT,
    batch_size=batch_size,
    label_mode="categorical",
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Clean this data?


In [None]:
model = build_classifier_model(SMALL_BERT[0], SMALL_BERT[1])
model.compile(
    optimizer=tf.keras.optimizers.Adam(), # SGD, RMSprop, Adam, Adadelta, Adagrad, Adamax Nadam, Ftrl
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"])

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

checkpoint_dir_path_id = datetime.now().strftime("%m/%d/%H:%M:%S")
checkpoint_dir_x = os.path.join(CHECKPOINT_DIR, checkpoint_dir_path_id)
pathlib.Path(checkpoint_dir_x).mkdir(parents=True, exist_ok=True)
call_backs = [
    keras.callbacks.ModelCheckpoint(
        os.path.join(checkpoint_dir_x, "save_at_{epoch}.h5"),
        save_weights_only=True),
]

model.summary()

In [None]:
epochs = 50
history = model.fit(
    x=train_ds.with_options(options),
    validation_data=val_ds.with_options(options),
    verbose=1, # 0 is silent, 1 for loading bar, 2 for stats each epoch
    epochs=epochs,
    callbacks=call_backs)


plot_accuracy(plt, history, checkpoint_dir_path_id)
plot_loss(plt, history, checkpoint_dir_path_id)
confusion_matrix(plt, val_ds, model, checkpoint_dir_path_id)