### BERT Model to Classify Downtimes by Subject/Comments - Multi-Class Classification
- kernel = env mypython
- text cleaning / preparation done before this step
- Resources: https://towardsdatascience.com/multi-label-text-classification-using-bert-and-tensorflow-d2e88d8f488d

In [0]:
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

In [0]:
#import libraries
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_text as text

from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout
from tensorflow.keras.layers import GlobalMaxPooling2D, MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model


# to make this notebook's output stable across runs
# (seeds NumPy's global RNG and TensorFlow's global RNG)
np.random.seed(42)
tf.random.set_seed(42)

#hide warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, including deprecation notices from TensorFlow/Keras.
import warnings
warnings.filterwarnings('ignore')

#### Load Data

In [0]:
# Read the recipe input dataset straight into a pandas DataFrame
data = dataiku.Dataset("DowntimeSubjects_prepared").get_dataframe()
data.head()

In [0]:
# number of rows in the loaded dataframe
len(data)

#### Setup Data

In [0]:
# Drop the old index first, then shuffle the rows
# (the shuffled frame keeps the freshly assigned index labels, now in random order)
data = shuffle(data.reset_index(drop=True))
data.head(5)

In [0]:
# count the distinct values found in the label column
num_classes = len(data["DOWNTIME_TYPE"].unique())
num_classes

In [0]:
# Encode the label words as integer class ids:
#   Technical = 0, Weather = 1, Scheduling = 2
LABEL_TO_ID = {"Technical": 0, "Weather": 1, "Scheduling": 2}

# Values that are not keys of the mapping (for example ids written by a previous
# run of this cell) pass through unchanged, so re-running the cell is harmless.
encoded_labels = data["DOWNTIME_TYPE"].map(lambda label: LABEL_TO_ID.get(label, label))

# Fail here, naming the offending values, instead of later inside to_categorical.
is_known_id = encoded_labels.isin(list(LABEL_TO_ID.values()))
unexpected_labels = sorted({repr(value) for value in encoded_labels[~is_known_id]})
if unexpected_labels:
    raise ValueError(f"Unmapped DOWNTIME_TYPE values: {unexpected_labels}")

# Store real integers rather than an object column holding Python ints
data["DOWNTIME_TYPE"] = encoded_labels.astype(int)

# Preview the result instead of printing the entire dataframe
data.head()

#### Split Data - Training & Testing

In [0]:
# one-hot encode the integer class ids to build the target matrix
label_ids = data["DOWNTIME_TYPE"].values
y = tf.keras.utils.to_categorical(label_ids, num_classes=num_classes)
#y

In [0]:
#split data into train (70%) and test (30%)
# random_state pins the partition: without it, re-running this cell draws a new
# split from NumPy's global RNG, so rows the model was already fitted on can end
# up in the "held-out" part.
x_train, x_test, y_train, y_test = train_test_split(
    data['SUBJECT_cleaned'],
    y,
    test_size=0.3,
    random_state=42,
)
len(x_train), len(x_test), len(y_train), len(y_test)

### Data Modeling
#### Load BERT with TensorFlow Hub
- repository of trained machine learning models
- use universal-sentence-encoder-cmlm/multilingual-base = universal sentence encoder (100+ languages)
    - uses conditional masked language model
- Goal: turn text into high-dim vectors that capture sentence-level semantics
    - get embeddings from input text with preprocessor and encoder

In [0]:
#load the text preprocessor and the sentence encoder from TensorFlow Hub
PREPROCESSOR_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

preprocessor = hub.KerasLayer(PREPROCESSOR_URL)
encoder = hub.KerasLayer(ENCODER_URL)

#function to get sentence embeddings
def get_embeddings(sentences):
  '''Embed each input sentence with the hub preprocessor followed by the encoder.

  Args:
    - sentences: list of strings
  Output:
    - the encoder's 'pooled_output' for the batch; per the original note this
      is a tf.Tensor of shape=(len(sentences), 768)
  '''
  encoder_inputs = preprocessor(sentences)
  encoder_outputs = encoder(encoder_inputs)
  return encoder_outputs['pooled_output']

#test function
#get_embeddings([
   # "This is a test para ver el futuro of the model."])

#### Create & Train Classification Model
- Observe different metrics during training: Precision, Recall, F1 Score

In [0]:
#resource: https://towardsdatascience.com/multi-label-text-classification-using-bert-and-tensorflow-d2e88d8f488d#98ee
from keras import backend as K

#function to find recall
def balanced_recall(y_true, y_pred):
    """Average of the per-class recall values (one class per column of y_pred).

    For every class:
        recall = TP / (TP + FN)
    where the counts are taken after clipping to [0, 1] and rounding, and
    K.epsilon() is added to the denominator to avoid dividing by zero.
    """
    n_classes = y_pred.shape[1]
    per_class_recall = []
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        scores = y_pred[:, class_idx]
        hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
        actual_positives = K.sum(K.round(K.clip(truth, 0, 1)))
        per_class_recall.append(hits / (actual_positives + K.epsilon()))
    # sum() starts from 0 and adds in class order, like a running total
    return sum(per_class_recall) / n_classes

#function to find precision
def balanced_precision(y_true, y_pred):
    """Average of the per-class precision values (one class per column of y_pred).

    For every class:
        precision = TP / (TP + FP)
    where the counts are taken after clipping to [0, 1] and rounding, and
    K.epsilon() is added to the denominator to avoid dividing by zero.
    """
    n_classes = y_pred.shape[1]
    per_class_precision = []
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        scores = y_pred[:, class_idx]
        hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
        flagged_positives = K.sum(K.round(K.clip(scores, 0, 1)))
        per_class_precision.append(hits / (flagged_positives + K.epsilon()))
    # sum() starts from 0 and adds in class order, like a running total
    return sum(per_class_precision) / n_classes

#function to find f1 score
def balanced_f1_score(y_true, y_pred):
    """Harmonic mean of balanced_precision and balanced_recall.

    K.epsilon() in the denominator keeps the result finite when both are zero.
    """
    avg_precision = balanced_precision(y_true, y_pred)
    avg_recall = balanced_recall(y_true, y_pred)
    harmonic_core = (avg_precision * avg_recall) / (avg_precision + avg_recall + K.epsilon())
    return 2 * harmonic_core

#### Define Model
- preprocessor & encoder layers
- dropout & dense layer with softmax activation function
- output space dimension = # of classes 

In [0]:
#define model: raw string -> preprocessor -> encoder -> dropout -> softmax
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='SUBJECT_cleaned')
encoder_outputs = encoder(preprocessor(text_input))
pooled_embedding = encoder_outputs['pooled_output']
regularized = tf.keras.layers.Dropout(0.2, name="dropout")(pooled_embedding)
class_probabilities = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(regularized)

bert_model = tf.keras.Model(text_input, class_probabilities)

#### Compile & Fit Model

In [0]:
#define number of epochs
n_epochs = 1

# metrics tracked during training and validation (list order kept as-is)
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.TrueNegatives(),
    tf.keras.metrics.FalsePositives(),
    tf.keras.metrics.AUC(),
    balanced_recall,
    balanced_precision,
    balanced_f1_score,
]

#EarlyStopping callback watching the validation loss:
#  - patience = 3: stop once it has not improved for 3 epochs
#  - restore_best_weights = True: go back to the weights of the epoch with the
#    best (lowest) validation loss
early_stopping_options = dict(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)
earlystop_callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

#compile model
bert_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=METRICS,
)

#fit model
# NOTE(review): `model` receives the return value of fit(); the following cells
# read it through `model.history`. The network itself stays in `bert_model`.
# NOTE(review): the test split doubles as validation data here.
model = bert_model.fit(
    x_train,
    y_train,
    epochs=n_epochs,
    validation_data=(x_test, y_test),
    callbacks=[earlystop_callback],
    verbose=1,
)

In [0]:
# values recorded by fit(), keyed by metric name
model.history

#### Plot Results

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# `model` is what bert_model.fit() returned; its .history dict maps each metric
# name to the list of values recorded per epoch.
history = model.history

# Pair every training metric with its validation counterpart by name ("val_" prefix)
# rather than by position in the key list.
train_metric_names = [name for name in history if not name.startswith("val_")]

# Use the number of epochs that actually ran: EarlyStopping can end training
# before n_epochs, and an x axis built from n_epochs would then be longer than
# the recorded values and make plot() fail.
epochs_run = len(history["loss"])
epochs = list(range(1, epochs_run + 1))

# squeeze=False always yields a 2-D array of axes, even with a single metric
fig, axes = plt.subplots(
    nrows=1,
    ncols=max(len(train_metric_names), 1),
    figsize=(30, 5),
    squeeze=False,
)

for ax, metric_name in zip(axes[0], train_metric_names):
    ax.plot(epochs, history[metric_name], marker="o", label=metric_name.replace("_", " "))
    val_name = "val_" + metric_name
    if val_name in history:
        ax.plot(epochs, history[val_name], marker="o", label=val_name.replace("_", " "))
    ax.set_xlabel("epochs", fontsize=14)
    ax.set_title(metric_name.replace("_", " "), fontsize=20)
    ax.legend(loc="lower left")

#### Predictions

##### Known results - text from dataset
- technical (0) - "pr1 bl aos sbex fail operation timeout" 
- technical (0) - "caimap setting add pms antenna" 
- weather (1) - "snow camera minute far" 
- weather (1) - "no observation pwr very inastable night" 
- scheduling (2) - "waiting min right time start time constrained project 2017.a.00035.s/3fgl_j04_a_06_tm1" 
- scheduling (2) - "nothing dsa observe aca7" 
- scheduling (2) - "no project observe"

##### Known results - easier - text from dataset
- weather (1) - "high wind aos"
- technical (0) - "bl"
- weather (1) - "poor weather condition observin"
- technical (0) - "pr bl aos acs no response container not tell no conn manager"
- scheduling (2) - "no project"
- scheduling (2) - "shift ending"
- weather (1) - "wvr mm high humidity"

##### Unknown results - text from JIRA 'summary' columns 
- "aos bl dv17 fe not lock band6"
- "qa0 runnig aos check aqua produce error"
- "aos archive subsytem went error"

##### Unknown results - text from JIRA 'description' columns 
- "nogo version --success-- start end project code 0000.0.00258.csv pi nphillip schedblock gonogo b3 execblock uid://a002 xaf4574 x3 sb uid uid://a002 x91a40d x5b qa0 status band alma_rb_03 alma build 201508-cycle3-on b-2016 array array001 array corr m]/64-antenna da42 axis go shutdown run sbex no apparent reason recovered send autonomous"
- "run interactive sb pm01 pm04 suddenly observation crash subscan not end scan checking log find acacorr cmd_transfer cppcontainer gl detected hardware failure"
- "no way create array antenna steve try create array include da45 container creation give timeout lose available antenna create new array"

In [0]:
#ANSWER: 0, 0, 1, 1, 2, 2, 2
# labelled examples taken from the dataset (0 = technical, 1 = weather, 2 = scheduling)
known_test_text = [
    "pr1 bl aos sbex fail operation timeout",
    "caimap setting add pms antenna",
    "snow camera minute far",
    "no observation pwr very inastable night",
    "waiting min right time start time constrained project 2017.a.00035.s/3fgl_j04_a_06_tm1",
    "nothing dsa observe aca7",
    "no project observe",
]

In [0]:
#predict class function
def predict_class(test_text):
  '''Return the most probable class index for each input text.

  Args:
    - test_text (list of strings): texts to classify with bert_model
  Output:
    - list with one class index per text (argmax over the model output row)
  '''
  class_scores = bert_model.predict(test_text)
  return [np.argmax(score_row) for score_row in class_scores]


In [0]:
# expected classes for these labelled examples: 0, 0, 1, 1, 2, 2, 2
predict_class(known_test_text)

In [0]:
#ANSWER: 1, 0, 1, 0, 2, 2, 1
# shorter labelled examples taken from the dataset (0 = technical, 1 = weather, 2 = scheduling)
easier_known_test = [
    "high wind aos",
    "bl",
    "poor weather condition observin",
    "pr bl aos acs no response container not tell no conn manager",
    "no project",
    "shift ending",
    "wvr mm high humidity",
]

In [0]:
# expected classes for these labelled examples: 1, 0, 1, 0, 2, 2, 1
predict_class(easier_known_test)

In [0]:
# unlabelled JIRA texts: the first three are 'summary' values, the last three 'description' values
unknown_test_text = [
    "aos bl dv17 fe not lock band6",
    "qa0 runnig aos check aqua produce error",
    "aos archive subsytem went error",
    "nogo version --success-- start end project code 0000.0.00258.csv pi nphillip schedblock gonogo b3 execblock uid://a002 xaf4574 x3 sb uid uid://a002 x91a40d x5b qa0 status band alma_rb_03 alma build 201508-cycle3-on b-2016 array array001 array corr m]/64-antenna da42 axis go shutdown run sbex no apparent reason recovered send autonomous",
    "run interactive sb pm01 pm04 suddenly observation crash subscan not end scan checking log find acacorr cmd_transfer cppcontainer gl detected hardware failure",
    "no way create array antenna steve try create array include da45 container creation give timeout lose available antenna create new array",
]

In [0]:
# these JIRA texts come without known labels, so there is no expected answer to compare with
predict_class(unknown_test_text)

In [0]:
# Make predictions using the trained model on the held-out split.
# `test_text` only exists as the parameter of predict_class, so referencing it at
# notebook level raised NameError on a fresh kernel; x_test is the test portion
# produced by train_test_split.
predictions = bert_model.predict(x_test)

In [0]:
# output of the softmax layer: one row per input text, one column per class
predictions

#### Save model in Dataiku

In [0]:
#bert_model.save('DowntimeTypes_BERT_Model.h5')  # `model` is the History returned by fit(); the Keras model is `bert_model`
