## BERT Classifier for Software & Hardware Jira Summary Tickets
- PRTSIR = Hardware = 1
- ICT = Software = 0

In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

In [0]:
#import libraries
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_text as text

from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout
from tensorflow.keras.layers import GlobalMaxPooling2D, MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model

# Silence library warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings("ignore")


# Fix the NumPy and TensorFlow global seeds so repeated runs start from the same state
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [0]:
# Read the recipe input dataset from Dataiku and materialise it as a dataframe
data = dataiku.Dataset("PRTSIR_TICKETS_dataset_stacked").get_dataframe()
# Show how many rows were loaded
len(data)

In [0]:
# Preview the first rows of the loaded dataframe
data.head()

In [0]:
# Drop every row that has a missing value in any column
data = data.dropna()
# Shuffle with a fixed seed so that re-running this cell always yields the same
# row order, then renumber the rows so the index follows the shuffled order
# (resetting before the shuffle would leave a scrambled index behind)
data = shuffle(data, random_state=42).reset_index(drop=True)
data.head(3)

In [0]:
# Collect the distinct values of the 'label' column; their count is the number of classes
label_values = data["label"].unique()
num_classes = len(label_values)
num_classes

In [0]:
# One-hot encode the class labels (one column per class) to match the
# softmax output / categorical cross-entropy loss used by the model
y = tf.keras.utils.to_categorical(data["label"].values, num_classes=num_classes)

# Hold out 30% of the rows for testing.
# - random_state pins the split, so re-running this cell (or the notebook)
#   always produces the same train/test partition.
# - stratify keeps the class proportions identical in both partitions, which
#   matters when one ticket type is rarer than the other.
x_train, x_test, y_train, y_test = train_test_split(
    data["text"],
    y,
    test_size=0.3,
    random_state=42,
    stratify=data["label"],
)
len(x_train), len(x_test), len(y_train), len(y_test)

In [0]:
# TensorFlow Hub handles of the multilingual universal-sentence-encoder (CMLM)
# preprocessing model and encoder
PREPROCESSOR_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

# Wrap both hub models as Keras layers so they can be called directly and
# used as building blocks of the classifier
preprocessor = hub.KerasLayer(PREPROCESSOR_URL)
encoder = hub.KerasLayer(ENCODER_URL)

#function to get sentence embeddings
def get_embeddings(sentences):
    '''Embed the input text with the hub preprocessor and encoder.

    Args:
      - sentences: list of strings
    Output:
      - the 'pooled_output' entry of the encoder result for the preprocessed
        sentences (expected to be a tf.Tensor of shape (len(sentences), 768)
        -- TODO confirm against the hub model documentation)
    '''
    encoder_outputs = encoder(preprocessor(sentences))
    return encoder_outputs['pooled_output']

#test function
#get_embeddings([
   # "This is a test para ver el futuro of the model."])

In [0]:
#resource: https://towardsdatascience.com/multi-label-text-classification-using-bert-and-tensorflow-d2e88d8f488d#98ee
from keras import backend as K

#function to compute the recall averaged over classes
def balanced_recall(y_true, y_pred):
    """Return the recall averaged over the classes (columns) of ``y_pred``.

    Per class: recall = TP / (TP + FN). Scores are turned into 0/1 decisions
    by clipping to [0, 1] and rounding.

    Args:
        y_true: ground-truth values, indexed as [:, class]
            (presumably one-hot rows -- confirm against the caller).
        y_pred: predicted scores with the same layout.
    Returns:
        Mean of the per-class recall values.

    NOTE(review): predictions are thresholded by rounding here, whereas
    ``predict_class`` in this notebook picks the arg-max class.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        scores = y_pred[:, class_idx]
        # correctly predicted positives of this class
        hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
        # all actual positives of this class
        actual = K.sum(K.round(K.clip(truth, 0, 1)))
        # epsilon guards against division by zero when the class is absent
        per_class.append(hits / (actual + K.epsilon()))
    # average the class-specific values
    return sum(per_class) / n_classes

#function to compute the precision averaged over classes
def balanced_precision(y_true, y_pred):
    """Return the precision averaged over the classes (columns) of ``y_pred``.

    Per class: precision = TP / (TP + FP). Scores are turned into 0/1
    decisions by clipping to [0, 1] and rounding.

    Args:
        y_true: ground-truth values, indexed as [:, class]
            (presumably one-hot rows -- confirm against the caller).
        y_pred: predicted scores with the same layout.
    Returns:
        Mean of the per-class precision values.
    """
    n_classes = y_pred.shape[1]

    def _class_precision(class_idx):
        # precision of a single class column
        truth = y_true[:, class_idx]
        scores = y_pred[:, class_idx]
        hits = K.sum(K.round(K.clip(truth * scores, 0, 1)))
        flagged = K.sum(K.round(K.clip(scores, 0, 1)))
        # epsilon guards against division by zero when nothing was flagged
        return hits / (flagged + K.epsilon())

    # average the class-specific values
    return sum(_class_precision(c) for c in range(n_classes)) / n_classes

#function to compute the f1 score from the balanced precision and recall
def balanced_f1_score(y_true, y_pred):
    """Return the F1 score: the harmonic mean of ``balanced_precision`` and
    ``balanced_recall`` for the given targets and predictions."""
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    # epsilon keeps the ratio defined when precision and recall are both zero
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [0]:
#define model: raw text -> preprocessor -> encoder -> dropout -> softmax classifier
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_outputs = encoder(preprocessor(text_input))
# use the encoder's pooled output as the feature vector for classification
pooled = encoder_outputs['pooled_output']
regularised = tf.keras.layers.Dropout(0.2, name="dropout")(pooled)
class_probs = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(regularised)

bert_model = tf.keras.Model(text_input, class_probs)

In [0]:
# Upper bound on the number of training epochs
n_epochs = 10

# Metrics tracked during training: three built-in Keras metrics plus AUC,
# followed by the class-averaged recall / precision / F1 functions of this notebook
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.TrueNegatives(),
    tf.keras.metrics.FalsePositives(),
    tf.keras.metrics.AUC(),
    balanced_recall,
    balanced_precision,
    balanced_f1_score,
]

# Early stopping watches the validation loss: once it has not improved for
# 3 epochs (patience=3) training is interrupted, and the weights of the epoch
# with the best (lowest) validation loss are restored (restore_best_weights=True)
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

In [0]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (targets are one-hot encoded) and the metric list defined for this notebook
bert_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=METRICS,
)

In [0]:
# Train the classifier and keep the returned History object for plotting.
# NOTE(review): x_test / y_test are passed as validation data, so early
# stopping picks the restored weights using them -- confirm that a separate
# hold-out set exists for the final evaluation.
model_fit = bert_model.fit(
    x=x_train,
    y=y_train,
    epochs=n_epochs,
    validation_data=(x_test, y_test),
    callbacks=[earlystop_callback],
    verbose=1,
)

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot every training metric next to its validation counterpart, one panel per metric.
history = model_fit.history

# EarlyStopping can end training before n_epochs is reached, so the x-axis
# must come from the number of epochs actually recorded; using n_epochs makes
# matplotlib fail with mismatched x/y lengths whenever training stops early.
epochs_run = len(history["loss"])
epoch_axis = list(range(1, epochs_run + 1))

# Training metrics are the keys without the "val_" prefix; each one is paired
# with its validation curve by name rather than by position in the dict.
metric_list = [name for name in history if not name.startswith("val_")]
num_metrics = len(metric_list)

# squeeze=False always returns a 2-D grid of axes, so indexing also works
# when only a single metric is present
fig, axes_grid = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5), squeeze=False)
ax = axes_grid[0]

for panel, name in zip(ax, metric_list):
    panel.plot(epoch_axis, history[name], marker="o", label=name.replace("_", " "))
    val_name = "val_" + name
    # the validation curve only exists when validation data was supplied to fit()
    if val_name in history:
        panel.plot(epoch_axis, history[val_name], marker="o", label=val_name.replace("_", " "))
    panel.set_xlabel("epochs", fontsize=14)
    panel.set_title(name.replace("_", " "), fontsize=20)
    panel.legend(loc="lower left")

In [0]:
# Display the raw per-epoch values recorded during training
model_fit.history

### Predictions

In [0]:
#Text from Description Column
ICT_text = ["aca control operations software support inform operator time restart bl correlator acacorr observation runinng new instance ict-4841 not find channel average datum inspect log last month corroborate little ict-4841 relate bl restart common point bl aca gns network indeed bl machine reinitializing get image gns:/tftpboot/ aca node show log slow cdp transmission aca master example code 21t09:45:10.427 acc mastercontainer corr_master_comp shutdownsubsyspass1 method call /var log message cob cpn-05 syslogd restart remote reception cob cpn-14 syslogd restart remote reception cob cpn-03 syslogd restart remote reception cob cpn-13 syslogd restart remote reception cob cc syslogd restart remote reception cob cpn-07 syslogd restart remote reception cob cpn-15 syslogd restart remote reception cob cpn-04 syslogd restart remote reception cob cpn-01 syslogd restart remote reception cob cpn-02 syslogd restart remote reception cob cpn-16 syslogd restart remote reception cob cpn-06 syslogd restart remote reception cob cpn-09 syslogd restart remote reception cob cpn-12 syslogd restart remote reception cob cpn-08 syslogd restart remote reception cob cpn-11 syslogd restart remote reception cob cpn-10 syslogd restart remote reception 21t09:51:44.433 acacorr cdpmif cppcontainer acacorr cdpmif master not find channel average datum key[obs/0000002799/2/1 nid[9 file[acacdpmchanaverdataprocthread.cpp line[357 21t09:52:46.999 acacorr cdpmif cppcontainer acacorr cdpmif master not find channel average datum key[obs/0000002818/3/5 nid[5 file[acacdpmchanaverdataprocthread.cpp line[357 21t09:52:47.389 acacorr cdpc_data_mgr n07 cppcontainer acacorr cdpc_data_mgr node_07 cdp datum transmisson long elaps[4.138645(sp=0.000001/0.063965,ch=4.138644/4.138635 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:47.665 acacorr cdpc_data_mgr n32 cppcontainer acacorr cdpc_data_mgr node_32 cdp datum transmisson long elaps[4.418394(sp=0.000001/0.082536,ch=4.418393/4.418384 
file[acacdpcdatapublisherthread.cpp line[258 21t09:52:47.849 acacorr cdpc_data_mgr n13 cppcontainer acacorr cdpc_data_mgr node_13 cdp datum transmisson long elaps[4.599679(sp=0.000001/0.086688,ch=4.599678/4.599669 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:48.045 acacorr cdpc_data_mgr n30 cppcontainer acacorr cdpc_data_mgr node_30 cdp datum transmisson long elaps[4.806524(sp=0.000001/0.083923,ch=4.806523/4.806515 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:49.003 acacorr cdpc_data_mgr n22 cppcontainer acacorr cdpc_data_mgr node_22 cdp datum transmisson long elaps[5.766346(sp=0.000001/0.060834,ch=5.766345/5.766336 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:49.003 acacorr cdpc_data_mgr n20 cppcontainer acacorr cdpc_data_mgr node_20 cdp datum transmisson long elaps[5.764070(sp=0.000001/0.039338,ch=5.764069/5.764060 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:49.003 acacorr cdpc_data_mgr n21 cppcontainer acacorr cdpc_data_mgr node_21 cdp datum transmisson long elaps[5.759761(sp=0.000001/0.086281,ch=5.759760/5.759751 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:49.027 acacorr cdpc_data_mgr n06 cppcontainer acacorr cdpc_data_mgr node_06 cdp datum transmisson long elaps[5.771451(sp=0.000001/0.061036,ch=5.771450/5.771441 file[acacdpcdatapublisherthread.cpp line[258 21t09:52:49.707 acacorr cdpc_data_mgr n29 cppcontainer acacorr cdpc_data_mgr node_29 cdp datum transmisson long elaps[5.457301(sp=0.000001/0.058710,ch=5.457300/5.457292 file[acacdpcdatapublisherthread.cpp line[258 code assign ticket initially softops order deeply understand relation apparently separate subsytems bl corr restart time aca cdp datum transmisson long"]
PRTSIR_text = ["Focus SBs BL correction Version 0 failed project started at 3 PM and ended at 3 06 PM project code 0000 0 00201 CSV PI nphillip SchedBlock focused on Band 3 Z ExecBlock has SB UID QA0 status system part of ALMA RB 03 band built on December 22 2014 list of devices connected to system including channel and node information some devices include LO Reference Receiver IF Processor Digitizer Clock and 2nd LO Synthesizer attempt made to turn on FEPS device for antenna DV22 but failed device state changed from Stop to Start but encountered error in process device could not go to Configure State possibly because not responding to SN CanBus request as result antenna out of array"]

In [0]:
#predict class function
def predict_class(test_text):
    '''Predict the class index of each input text.

    Args:
      - test_text (list of strings): texts to classify
    Output:
      - list with one class index per text, i.e. the position of the highest
        score in the corresponding row returned by bert_model.predict
    '''
    scores = bert_model.predict(test_text)
    # arg-max along the class axis gives the winning class for every row
    return list(np.argmax(scores, axis=1))

In [0]:
# Classify the ICT sample text (per the notebook header, ICT = Software = 0)
predict_class(ICT_text)

In [0]:
# Classify the PRTSIR sample text (per the notebook header, PRTSIR = Hardware = 1)
predict_class(PRTSIR_text)