# Training



In [None]:
!pip install tensorflow_text
!pip install pandas

In [None]:
%tensorflow_version 2.x

import tensorflow as tf
import tensorflow_text as text
print(tf.__version__)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found {}'.format(device_name))
print('Found GPU at: {}'.format(device_name))

In [None]:
# connect to drive if in Colab
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive

In [None]:
import pandas as pd

# Read the contracts CSV with every column requested as a string
# (NOTE(review): presumably so code-like columns are not parsed as numbers — confirm).
og_data = pd.read_csv('rule_based_contracts_v1.csv', dtype='str')
# Preview the first rows.
og_data.head()

In [None]:
# Build the training frame: keep only rows that have both a category (the
# label) and an English description (the text), and only those two columns.
training_data = (
    og_data
    .dropna(subset=['category', 'description_en'])
    .loc[:, ['category', 'description_en']]
    .copy()
)

In [None]:
# Inspect how many rows each category has. The distribution is uneven, which
# may cause problems for training.
# NOTE(review): the title/axis wording ("Reviews", "Topics") looks carried over
# from another notebook — the rows here are categorised descriptions; confirm.
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')


# Count once and reuse for the class total, the bars and the annotations.
category_counts = training_data["category"].value_counts()
num_classes = len(category_counts)

# One Dark2 colour per category, shared by a bar and its annotation.
colors = plt.cm.Dark2(np.linspace(0, 1, num_classes))

category_counts.plot.barh(title="Reviews for each topic (n, %)",
                          ylabel="Topics",
                          color=colors,
                          figsize=(9,9))

# Annotate each bar with its row count and its share of all rows.
total_rows = training_data.shape[0]
for position, (count, bar_color) in enumerate(zip(category_counts, colors)):
  share = round(count*100/total_rows,2)
  label = " "+str(count)+", "+str(share)+"%"
  plt.text(count, position,
           label,
           color=bar_color,
           va='center',
           fontweight='bold')

In [None]:
# Rebalance the dataset so every category ends up with the same number of rows.
# Each category keeps every unique description once and is then topped up with
# rows drawn at random (with replacement) until it reaches the target size.
# The target should be at least the largest per-category count of unique
# descriptions (the original notes put that at 2790, for
# `3_information_technology`). Sampling with replacement is needed because
# small categories (the original notes single out `11_defence`) do not have
# enough rows to fill the top-up otherwise.
# NOTE(review): these duplicates are created before the train/test split further
# down, so the same description can land in both parts and inflate validation
# scores — consider splitting first and oversampling only the training part.
ROWS_PER_CATEGORY = 7000

redist_dfs = []
for c in training_data['category'].drop_duplicates():
    df = training_data.loc[training_data['category'] == c]
    df_unique = df.drop_duplicates(subset=['description_en'])  # unique descriptions

    n_extra = ROWS_PER_CATEGORY - len(df_unique)
    if n_extra < 0:
        # More unique descriptions than the target: trim to the target instead
        # of letting `sample` fail on a negative `n`.
        redist_dfs.append(df_unique.sample(n=ROWS_PER_CATEGORY, random_state=1))
        continue

    df_random = df.sample(n=n_extra, replace=True, random_state=1)
    redist_dfs.append(pd.concat([df_unique, df_random]))

In [None]:
# Stitch the per-category frames back into one dataset, then display the
# per-category row counts to verify the rebalancing.
training_data = pd.concat(redist_dfs)
training_data['category'].value_counts()

In [None]:
# Map categories to numeric values: a category's position in this list is its
# integer label.
ordered_categories = [
    '1_facilities_and_construction',
    '2_professional_services',
    '3_information_technology',
    '4_medical',
    '5_transportation_and_logistics',
    '6_industrial_products_and_services',
    '7_travel',
    '8_security_and_protection',
    '9_human_capital',
    '10_office_management',
    '11_defence',
]
category_to_label = {name: label for label, name in enumerate(ordered_categories)}

# Add the label column, fix the column order, then shuffle with a fixed seed
# and renumber the rows from zero.
training_data = (
    training_data
    .assign(label=training_data['category'].map(category_to_label))
    .loc[:, ['description_en', 'category', 'label']]
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

training_data.head()

In [None]:
# Split the shuffled data into training and held-out parts.
import tensorflow as tf
from sklearn.model_selection import train_test_split

# One-hot encode the integer labels (the model is trained with categorical
# cross-entropy on a softmax output).
y = tf.keras.utils.to_categorical(training_data['label'].values, num_classes=num_classes)

# A fixed seed makes the split reproducible across runs; stratifying on the
# label keeps the class proportions the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    training_data['description_en'],
    y,
    test_size=0.25,
    random_state=42,
    stratify=training_data['label'],
)

In [None]:
# creating embeddings
import tensorflow_hub as hub
import tensorflow_text as text

# TF-Hub layers: `preprocessor` turns raw strings into the inputs the encoder
# expects, `encoder` produces the sentence representation used downstream.
# NOTE(review): neither layer is created with `trainable=True`, so presumably
# the encoder weights stay frozen during "finetuning" — confirm this is intended.
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-cmlm/en-base/1")


def get_embeddings(sentences):
  '''Embed raw text with the module-level preprocessor and encoder.

  Args:
    - sentences: list of strings
  Output:
    - the encoder's 'pooled_output' entry (BERT-like embeddings); the original
      note gives the shape as (len(sentences), 768) — not verified here
  '''
  encoder_outputs = encoder(preprocessor(sentences))
  return encoder_outputs['pooled_output']


# Smoke test: embed a single sample description and display the result.
get_embeddings([
    "Human resources services"]
)

In [None]:
# finetuning functions
from keras import backend as K

from keras.callbacks import ModelCheckpoint

def balanced_recall(y_true, y_pred):
    """Recall averaged with equal weight over every class column.

    recall = TP / (TP + FN)

    For each column of ``y_pred`` a true positive is counted where the rounded
    product of truth and prediction is 1; recall is that count divided by the
    number of actual positives plus epsilon. A class with no positives in
    ``y_true`` therefore contributes (approximately) zero to the average.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for col in range(n_classes):
        truth = y_true[:, col]
        hits = K.sum(K.round(K.clip(truth * y_pred[:, col], 0, 1)))
        actual = K.sum(K.round(K.clip(truth, 0, 1)))
        per_class.append(hits / (actual + K.epsilon()))
    # unweighted mean across classes
    return sum(per_class) / n_classes

def balanced_precision(y_true, y_pred):
    """Precision averaged with equal weight over every class column.

    precision = TP / (TP + FP)

    For each column of ``y_pred`` a true positive is counted where the rounded
    product of truth and prediction is 1; precision is that count divided by
    the number of rounded positive predictions plus epsilon. A class that is
    never predicted therefore contributes (approximately) zero to the average.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for col in range(n_classes):
        scores = y_pred[:, col]
        hits = K.sum(K.round(K.clip(y_true[:, col] * scores, 0, 1)))
        predicted = K.sum(K.round(K.clip(scores, 0, 1)))
        per_class.append(hits / (predicted + K.epsilon()))
    # unweighted mean across classes
    return sum(per_class) / n_classes

def balanced_f1_score(y_true, y_pred):
    """F1 score: harmonic mean of balanced precision and balanced recall.

    Epsilon in the denominator keeps the result finite when both are zero.
    """
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Assemble the classifier end to end:
# raw string -> preprocessor -> encoder -> dropout -> softmax over the categories.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_outputs = encoder(preprocessor(text_input))
pooled = encoder_outputs['pooled_output']
regularized = tf.keras.layers.Dropout(0.2, name="dropout")(pooled)
class_probs = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(regularized)

model = tf.keras.Model(text_input, class_probs)

In [None]:
# Train the classifier, stopping early and checkpointing on validation results.
n_epochs = 30

# Reported for both the training and the validation data after every epoch.
METRICS = [
      tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
      balanced_recall,
      balanced_precision,
      balanced_f1_score
]

# Stop once validation loss has not improved for 3 epochs and roll the model
# back to the best weights seen.
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss",
                                                      patience = 3,
                                                      restore_best_weights = True)

# Checkpoints are ranked by the *validation* F1 score. Ranking by the training
# F1 (`balanced_f1_score`) would keep the weights that fit the training data
# best, i.e. favour overfit epochs, and would disagree with early stopping,
# which already watches a validation quantity.
filepath="n1-weights-improvement-{epoch:02d}-{val_balanced_f1_score:.2f}.hdf5"

checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_balanced_f1_score', verbose=1, save_best_only=True, mode='max')


model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = METRICS)

model_fit = model.fit(x_train,
                      y_train,
                      epochs = n_epochs,
                      validation_data = (x_test, y_test),
                      callbacks = [earlystop_callback, checkpoint])

In [None]:
# Plot every training metric next to its validation counterpart, one panel
# per metric.
history = model_fit.history

# Pair each training metric with its "val_" twin by name rather than by
# position in the history dict.
metric_list = [name for name in history if not name.startswith("val_")]
num_metrics = len(metric_list)

# squeeze=False keeps `ax` two-dimensional even when there is a single metric.
fig, ax = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5), squeeze=False)

for panel, name in zip(ax[0], metric_list):
  # Early stopping can end training before `n_epochs`, so the x axis is sized
  # from what was actually recorded; using `n_epochs` would make `plot` fail
  # on mismatched lengths.
  x = list(range(1, len(history[name]) + 1))
  panel.plot(x, history[name], marker="o", label=name.replace("_", " "))
  val_name = "val_" + name
  if val_name in history:
    panel.plot(x, history[val_name], marker="o", label=val_name.replace("_", " "))
  panel.set_xlabel("epochs",fontsize=14)
  panel.set_title(name.replace("_", " "),fontsize=20)
  panel.legend(loc="lower left")

In [None]:
# Sample descriptions used to spot-check predictions from the trained model.
test_descs = ["Other business services not elsewhere",
              "DIGITAL COMMUNICATIONS EQUIPMENT",
              "Non-public servant travel - Support core"]

In [None]:
def predict_class(test_descs):
  '''Return the most probable class index for each input description.

  Args:
    - test_descs (list of strings): descriptions to classify with the
      module-level `model`
  Output:
    - list with one class index per description (argmax of its prediction row)
  '''
  probabilities = model.predict(test_descs)
  winners = []
  for row in probabilities:
    winners.append(np.argmax(row))
  return winners

# Run the spot check on the sample descriptions and display the class indices.
predict_class(test_descs)

In [None]:
# Save the in-memory model to an HDF5 file.
# NOTE(review): the epoch and score in the file name are hard-coded, presumably
# copied from the best checkpoint of one particular run — confirm they describe
# the model being saved.
model.save("BEST-n1-weights-improvement-22-0.92.hdf5")


## Validation

In [None]:
# need to re-define the custom metric functions (they are metrics, not layers) so they can be passed to our model on load
# finetuning functions
from keras import backend as K

from keras.callbacks import ModelCheckpoint

def balanced_recall(y_true, y_pred):
    """Recall averaged with equal weight over every class column.

    recall = TP / (TP + FN)

    For each column of ``y_pred`` a true positive is counted where the rounded
    product of truth and prediction is 1; recall is that count divided by the
    number of actual positives plus epsilon. A class with no positives in
    ``y_true`` therefore contributes (approximately) zero to the average.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for col in range(n_classes):
        truth = y_true[:, col]
        hits = K.sum(K.round(K.clip(truth * y_pred[:, col], 0, 1)))
        actual = K.sum(K.round(K.clip(truth, 0, 1)))
        per_class.append(hits / (actual + K.epsilon()))
    # unweighted mean across classes
    return sum(per_class) / n_classes

def balanced_precision(y_true, y_pred):
    """Precision averaged with equal weight over every class column.

    precision = TP / (TP + FP)

    For each column of ``y_pred`` a true positive is counted where the rounded
    product of truth and prediction is 1; precision is that count divided by
    the number of rounded positive predictions plus epsilon. A class that is
    never predicted therefore contributes (approximately) zero to the average.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for col in range(n_classes):
        scores = y_pred[:, col]
        hits = K.sum(K.round(K.clip(y_true[:, col] * scores, 0, 1)))
        predicted = K.sum(K.round(K.clip(scores, 0, 1)))
        per_class.append(hits / (predicted + K.epsilon()))
    # unweighted mean across classes
    return sum(per_class) / n_classes

def balanced_f1_score(y_true, y_pred):
    """F1 score: harmonic mean of balanced precision and balanced recall.

    Epsilon in the denominator keeps the result finite when both are zero.
    """
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
from tensorflow import keras
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
import numpy as np

# Load the full dataset again; the test set is drawn from it.
og_data = pd.read_csv('rule_based_contracts_v1.csv', dtype='str')


# Names the saved model refers to that must be supplied explicitly on load:
# the TF-Hub layer wrapper and the three custom metric functions.
custom_objects = {
    'KerasLayer': hub.KerasLayer,
    'balanced_recall': balanced_recall,
    'balanced_precision': balanced_precision,
    'balanced_f1_score': balanced_f1_score,
}

# load model
model = keras.models.load_model("BEST-n1-weights-improvement-22-0.92.hdf5",
                                custom_objects=custom_objects)

In [None]:
# Category name -> integer label. A category's position in the list is its
# label, so the dict's key order matches the label order.
class_keys = {
    name: label
    for label, name in enumerate([
        '1_facilities_and_construction',
        '2_professional_services',
        '3_information_technology',
        '4_medical',
        '5_transportation_and_logistics',
        '6_industrial_products_and_services',
        '7_travel',
        '8_security_and_protection',
        '9_human_capital',
        '10_office_management',
        '11_defence',
    ])
}

In [None]:
# Build a small test set from rows that have neither a category nor an
# economic object code but do have an English description.
no_class = og_data.loc[og_data['category'].isna() & og_data['economic_object_code'].isna()]

described = no_class[no_class['description_en'].notna()]
# Seeded so the same rows are drawn on every run, and capped at the number of
# available rows so `sample` does not raise when fewer than 15 qualify.
test_set = described.sample(n=min(15, len(described)), random_state=0)


In [None]:
# Keep only the (still empty) category column and the description, renumber
# the rows from zero, and display the result.
test_set = test_set[['category', 'description_en']].copy().reset_index(drop=True)
test_set

In [None]:
# Classify every test description and record the best and second-best guess.
# Results are written column by column instead of one `.loc` cell at a time,
# and the per-row debug prints are gone (the frame is displayed afterwards).
labels_by_index = list(class_keys)  # position in class_keys == numeric label
probabilities = model.predict(test_set['description_en'])

best = np.argmax(probabilities, axis=1)
runner_up = np.argsort(probabilities, axis=1)[:, -2]  # index of 2nd-highest score
rows = np.arange(len(probabilities))

test_set['category'] = [labels_by_index[k] for k in best]
test_set['alt_category'] = [labels_by_index[k] for k in runner_up]
# Both confidence columns are percentages; previously only `confidence` was
# scaled by 100 while `alt_confidence` was left as a 0-1 fraction.
test_set['alt_confidence'] = probabilities[rows, runner_up] * 100
test_set['num_category'] = best
test_set['confidence'] = probabilities[rows, best] * 100


In [None]:
# Display the test set with the predicted categories and confidences.
test_set

In [None]:
# Summary statistics for the test set's columns.
test_set.describe()
