# Data preprocessing

Load dataset

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Hugging Face transformers package into the runtime (no version is pinned here).
!pip install transformers
# Optional TensorFlow upgrade, currently disabled.
#!pip install --upgrade tensorflow

In [None]:
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
from tensorflow.keras.utils import to_categorical

import keras.backend as K
# And pandas for data import + sklearn because you always need sklearn
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Directory prefix of the input CSV (empty string = current working directory)
path = ''

# Dataset selection: CSV file name and the project it belongs to
filename, project = 'Moodle - Issues with labels.csv', 'Moodle'

# Alternative dataset:
#filename, project = 'Chrome - Issues with labels.csv', 'Chrome'

# Read the CSV, decoding it as ISO-8859-1 (Latin-1)
raw = pd.read_csv(f'{path}{filename}', encoding="ISO-8859-1")

In [None]:
# Shorten the CSV headers to the column names used by the rest of the notebook
column_aliases = {
    'Issue ID': 'ID',
    'Issue Summary': 'title',
    'Issue Description': 'desc',
}
raw = raw.rename(columns=column_aliases)

# Number of issues (rows) in the dataset
TOTAL_ISSUE = raw.shape[0]

TOTAL_ISSUE

In [None]:
# Data pre-processing
#
# Every pattern below is a regular expression, so regex=True is passed explicitly:
#   * Series.replace() without regex=True only swaps cells whose WHOLE value equals
#     the pattern text, so the clean-up silently did nothing;
#   * Series.str.replace() defaults to regex=False from pandas 2.0 onwards.
# URLs are removed before the punctuation pass, because afterwards "://" no longer
# exists and a URL pattern can never match. The URL pattern is not anchored to the
# start of the text, so URLs in the middle of a description are removed as well.
if project == 'Chrome':
  print('0')
  raw['desc'] = raw['desc'].str.replace(r'https?://\S+', ' ', regex=True)
  raw['desc'] = raw['desc'].replace(r'\n', ' ', regex=True)
  raw['desc'] = raw['desc'].replace(r'(?:Chrome Version( *): (\d+\.*)+\d+)', '', regex=True)
  # Drop non-ASCII characters and double quotes from both text columns
  raw['title'] = raw['title'].replace(r'[^\x00-\x7F]', '', regex=True)
  raw['desc'] = raw['desc'].replace(r'[^\x00-\x7F]', '', regex=True)
  raw['title'] = raw['title'].replace(r'\"', '', regex=True)
  raw['desc'] = raw['desc'].replace(r'\"', '', regex=True)
  # Collapse every run of non-alphanumeric characters into one space
  # (an empty replacement would glue neighbouring words together)
  raw['desc'] = raw['desc'].replace(r'[^A-Za-z0-9]+', ' ', regex=True)
  raw['desc'] = raw['desc'].str.strip()

elif project == 'Moodle':
  print('1')
  # fillna() returns a new Series; assign it back so missing descriptions
  # really become the string 'None' instead of staying NaN
  raw['desc'] = raw['desc'].fillna('None')
  raw['desc'] = raw['desc'].str.replace(r'https?://\S+', ' ', regex=True)
  raw['desc'] = raw['desc'].replace(r'\n', '', regex=True)
  # Collapse every run of non-alphanumeric characters into one space
  raw['desc'] = raw['desc'].replace(r'[^A-Za-z0-9]+', ' ', regex=True)
  raw['desc'] = raw['desc'].str.strip()

In [None]:
# Display the DataFrame as the cell output for a visual check.
raw

In [None]:
# Positional bounds (both inclusive) of the label columns inside `raw`
first_label_col, last_label_col = 3, 73
label_cols = raw.columns[slice(first_label_col, last_label_col + 1)]

label_cols

In [None]:
import scipy.sparse as sparse

# Label block of the dataset: the columns selected by `label_cols`
one_hot_labels = raw.loc[:, label_cols]

# Derive the label count from the selected columns instead of hard-coding 71,
# so changing the label column range (e.g. for another dataset) cannot make
# the declared matrix shape disagree with the data.
NUM_LABEL = len(label_cols)

# Sparse (issues x labels) matrix; the explicit shape doubles as a sanity check
arr = sparse.coo_matrix(one_hot_labels, shape=(TOTAL_ISSUE, NUM_LABEL))


In [None]:
# Working frame: the combined "title. description" text, the label vector of each
# issue as a plain Python list, followed by the individual label columns.
temp = pd.concat(
    [
        pd.DataFrame({
            'titledesc': raw['title'] + '. ' + raw['desc'],
            'one_hot': arr.toarray().tolist(),
        }),
        raw[label_cols],
    ],
    axis=1,
)

temp

In [None]:
# Row positions in `temp` that are copied into the training and the test split.
# NOTE(review): positions 217, 390, 338 and 392 occur in both lists and 217 occurs
# twice in the test list - presumably deliberate for labels that have only a single
# example; confirm.
one_freq_train_rows = [217,390,338,392,271,250,29,151,412,67,98,110,28,13,193,90,288,228,267,61]
one_freq_test_rows = [217,390,338,392,413,351,217,415,452,437,348,159,33,260,337,96,291,234,281,147]

one_freq_train = temp.iloc[one_freq_train_rows, :]
one_freq_test = temp.iloc[one_freq_test_rows, :]

In [None]:
# Take the hand-picked row positions out of the pool before the sequential split
hand_picked_rows = [217,390,338,392,271,250,29,151,412,67,98,110,28,13,193,90,288,413,351,217,415,452,437,348,159,33,260,337,96,291,228,267,61,234,281,147]
temp = temp.drop(index=temp.index[hand_picked_rows])

In [None]:
# Train / test proportions in percent
training_size = 80
test_size = 20

# The training share is reduced by 36, which matches the number of hand-picked
# row positions listed in the drop() call on `temp`.
numTrain = int((TOTAL_ISSUE * training_size)/100) - 36
numTest = int((TOTAL_ISSUE * test_size)/100)

data = temp[:numTrain]
# Everything after the training rows is test data. The slice is open-ended:
# the former `[numTrain:-1]` silently discarded the last row of `temp`.
data_test = temp[numTrain:]

In [None]:
# Put the hand-picked `one_freq_train` rows IN FRONT of the sequential training rows.
# model.fit(validation_split=0.2) takes its validation samples from the end of the
# data (before shuffling), so rows appended at the tail would only ever be
# validated on and never trained on.
data = pd.concat([one_freq_train, data], ignore_index=True)

data

In [None]:
# Add the hand-picked test rows to the held-out split and renumber the index from 0
data_test = pd.concat(objs=(data_test, one_freq_test), axis=0, ignore_index=True)

data_test

In [None]:
# Pretrained checkpoint to use and the token length the inputs are cut to
model_name, max_length = 'bert-base-uncased', 120

# Checkpoint configuration, with the per-layer hidden states switched off
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Tokenizer that belongs to the checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# Pretrained TensorFlow BERT weights
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

In [None]:
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Model inputs: token ids plus the attention mask produced by the tokenizer
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Element 1 of the BERT main layer output is used as the pooled sentence representation
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# NOTE(review): training=False keeps this dropout layer inactive even while fitting -
# confirm that this is intended.
pooled_output = dropout(bert_model, training=False)

# One classification head per label column, named after the column (R1 ... R71).
# Each head has one unit per distinct value of its label in the training data.
# The sigmoid activation makes every head emit probabilities: the model is compiled
# with BinaryCrossentropy(from_logits=False) and evaluated with metrics that round
# the outputs, both of which require values in [0, 1] rather than raw logits.
# argmax-based decoding of the predictions is unaffected by the activation.
outputs = {
    label: Dense(
        units=data[label].nunique(),
        activation='sigmoid',
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name=label,
    )(pooled_output)
    for label in label_cols
}

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Take a look at the model
model.summary()


In [None]:
def recall_m(y_true, y_pred):
    """Recall: true positives divided by actual positives, using Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so an entry counts
    as positive once it rounds to 1. K.epsilon() is added to the denominator to
    avoid a division by zero when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: true positives divided by predicted positives, using Keras backend ops.

    Values are clipped to [0, 1] and rounded before counting, so a prediction
    counts as positive once it rounds to 1. K.epsilon() is added to the
    denominator to avoid a division by zero when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m.

    K.epsilon() in the denominator keeps the result finite when both precision
    and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

In [None]:
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Compile the model
# from_logits=False tells the loss to treat the model outputs as probabilities in [0, 1].
model.compile(
    optimizer = optimizer,
    loss = BinaryCrossentropy(from_logits = False),
    metrics = [BinaryAccuracy(), precision_m, recall_m, f1_m])

# Ready output data for the model: one one-hot encoded target array per label
# column R1 ... R71. The y_R<n> names are kept because the fit cell reads them.
(y_R1, y_R2, y_R3, y_R4, y_R5, y_R6, y_R7, y_R8, y_R9, y_R10,
 y_R11, y_R12, y_R13, y_R14, y_R15, y_R16, y_R17, y_R18, y_R19, y_R20,
 y_R21, y_R22, y_R23, y_R24, y_R25, y_R26, y_R27, y_R28, y_R29, y_R30,
 y_R31, y_R32, y_R33, y_R34, y_R35, y_R36, y_R37, y_R38, y_R39, y_R40,
 y_R41, y_R42, y_R43, y_R44, y_R45, y_R46, y_R47, y_R48, y_R49, y_R50,
 y_R51, y_R52, y_R53, y_R54, y_R55, y_R56, y_R57, y_R58, y_R59, y_R60,
 y_R61, y_R62, y_R63, y_R64, y_R65, y_R66, y_R67, y_R68, y_R69, y_R70,
 y_R71) = [to_categorical(data[f'R{i}']) for i in range(1, 72)]

# Tokenize the input (takes some time)
# padding='max_length' always yields exactly `max_length` tokens per row, matching
# the fixed Input(shape=(max_length,)) of the model. padding=True would only pad
# to the longest sequence in the data, which can be shorter than max_length.
x = tokenizer(
    text=data['titledesc'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


In [None]:
# Fit the model
train_inputs = {'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']}

# One one-hot target array per output head, keyed by the head name (R1 ... R71).
# Rebuilt from `data` here; the values equal the y_R1 ... y_R71 arrays prepared
# together with the tokenized input.
train_targets = {f'R{i}': to_categorical(data[f'R{i}']) for i in range(1, 72)}

# validation_split=0.2 holds out the final 20% of the rows for validation
history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_split=0.2,
    batch_size=64,
    epochs=60)


In [None]:
#save training results
# Turn the history object returned by model.fit into a DataFrame
# (presumably one column per tracked metric and one row per epoch - confirm).
history_df = pd.DataFrame(history.history)

#set file name
history_df.to_csv('Moodle-req-history.csv')

#save file - need to specify output path
# NOTE(review): 'path+filename' is a quoted literal placeholder, so as written the
# shell copies to a file with exactly that name; replace it with the real destination.
!cp Moodle-req-history.csv 'path+filename'

In [None]:
# Ready test data: one one-hot encoded target array per label column / output head
test_targets = {label: to_categorical(data_test[label]) for label in label_cols}

# padding='max_length' always yields exactly `max_length` tokens per row, matching
# the fixed Input(shape=(max_length,)) of the model. padding=True would only pad
# to the longest sequence in the test data, which can be shorter than max_length.
test_x = tokenizer(
    text=data_test['titledesc'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids'], 'attention_mask': test_x['attention_mask']},
    y=test_targets
)

In [None]:
#save evaluation results
# The values returned by model.evaluate are stored as they are, without metric
# names (presumably model.metrics_names lists the matching labels - confirm).
model_eval_df = pd.DataFrame(model_eval)

#set file name
model_eval_df.to_csv('Moodle-req-model_eval.csv')

#save file - need to specify output path
# NOTE(review): 'path+filename' is a quoted literal placeholder, so as written the
# shell copies to a file with exactly that name; replace it with the real destination.
!cp Moodle-req-model_eval.csv 'path+filename'

Working on y_pred_onehot

In [None]:
# Run the model on the tokenized test set; the result is indexed by output head
# name in the following cell (pred_model[req]).
pred_model = model.predict({'input_ids': test_x['input_ids'], 'attention_mask': test_x['attention_mask']})

In [None]:
column_names = label_cols.to_list()

# Predicted class of every test issue for every label head: the argmax over the
# head's output scores. The frame is built column-wise in one vectorised pass
# instead of being grown one row at a time with .loc inside a nested Python loop.
pred_by_issue = pd.DataFrame(
    {req: np.argmax(pred_model[req], axis=1) for req in column_names},
    columns=column_names,
)

In [None]:
# Display the per-issue predicted classes.
pred_by_issue

In [None]:
# Integer matrix of predicted class indices, one row per test issue and one
# column per label (the values are argmax results, not model scores).
y_pred_onehot = pred_by_issue.to_numpy(dtype=np.int64)

In [None]:
# Predicted label vector of the first test issue.
y_pred_onehot[0]

Working on MRR

In [None]:
def evaluate_mrr(data_act, data_est, verbose=False):
    """Mean reciprocal rank (MRR) of the best-ranked relevant label per issue.

    Args:
        data_act: per-issue ground truth - any sequence (list, pandas Series,
            2-D array) of 1-D label vectors in which a non-zero entry marks a
            relevant label. Rows are read by position, so a Series with a
            non-default index works as well.
        data_est: per-issue scores, indexable by position; every row must
            provide ``argsort()`` (e.g. a 2-D NumPy array).
        verbose: when True, print the relevant indices and the ranking of
            every issue (debug aid; off by default to keep the output short).

    Returns:
        The sum of 1/rank of the first relevant label over all issues divided
        by the number of issues. Issues without any relevant label contribute
        0. Returns 0.0 for empty input instead of dividing by zero.
    """
    actual = list(data_act)
    m = len(actual)
    if m == 0:
        return 0.0

    sum_rr = 0.0
    for j, truth in enumerate(actual):
        relevant = set(np.flatnonzero(np.asarray(truth)).tolist())
        # Label indices ordered from the highest to the lowest score
        ranking = data_est[j].argsort()[::-1]
        if verbose:
            print('y_true - y_pred: ', sorted(relevant), ranking)
        for rank, label_idx in enumerate(ranking, start=1):
            if label_idx in relevant:
                # Only the first (best-ranked) relevant label counts
                sum_rr += 1.0 / rank
                break

    return sum_rr / m

In [None]:
# MRR of the test split.
# NOTE(review): y_pred_onehot holds predicted class indices rather than scores, so
# the ranking inside evaluate_mrr is an argsort over tied values - confirm this
# is the intended ranking signal.
mrr = evaluate_mrr(data_test['one_hot'], y_pred_onehot)

In [None]:
# Display the MRR score.
mrr

In [None]:
mrr_data_df = pd.DataFrame(data=None)

mrr_data_df['mrr'] = mrr

mrr_data_df.to_csv('Moodle-req-mrr3.csv')
!cp Moodle-req-mrr3.csv '/content/drive/MyDrive/UOW/Kookai - PhD Resources/Colab Notebooks/Issue classification - BERT/Data/'

Working on Recall@k

In [None]:
def evaluate_prepare_y(actual, estimate, k):
  """Return the non-zero positions of `actual` and the k best-scored positions of `estimate`.

  The first element is the np.argwhere result (an (n, 1) column for a 1-D input).
  The second is the tail of an ascending argsort, so the highest-scored index comes last.
  """
  top_k = estimate.argsort()[-k:]
  true_positions = np.argwhere(actual)
  return true_positions, top_k

In [None]:
def evaluate_recall_at_k(data_act, data_est):
  """Mean recall@k over all issues, for every k from 1 to NUM_LABEL.

  Args:
    data_act: per-issue ground truth - any sequence (list, pandas Series,
      2-D array) of 1-D label vectors in which a non-zero entry marks a
      relevant label. Rows are read by position.
    data_est: per-issue scores, indexable by position; every row must
      provide ``argsort()`` (e.g. a 2-D NumPy array).

  Returns:
    A list of NUM_LABEL strings formatted with '%.4f'; element k-1 is the
    mean recall@k. Issues without any relevant label count as recall 0
    (instead of raising ZeroDivisionError), and empty input yields '0.0000'
    for every k.
  """
  recall_at_k = []
  actual = list(data_act)
  estimate = data_est
  num_issues = len(actual)

  for k in range(1, NUM_LABEL + 1):
    print(k)
    sum_recall = 0.0
    for j in range(num_issues):
      y_true, y_pred = evaluate_prepare_y(actual[j], estimate[j], k)
      total_relevant = len(y_true)
      if total_relevant == 0:
        # Recall is undefined without ground-truth labels; count the issue as 0
        continue
      # y_true is a column and y_pred a row, so the comparison broadcasts to a
      # (relevant x k) grid whose True cells are the hits
      relevant = (y_true == y_pred).sum()
      sum_recall += float(relevant) / total_relevant

    mean_recall = sum_recall / num_issues if num_issues else 0.0
    recall_at_k.append('%.4f' % mean_recall)
    print('Recall@K: {:.4f}'.format(mean_recall))

  return recall_at_k


In [None]:
# Recall@k of the test split for every k from 1 to NUM_LABEL (list of formatted strings).
recall_val = evaluate_recall_at_k(data_test['one_hot'], y_pred_onehot)

In [None]:
recall_df = pd.DataFrame(data=None)

recall_df['mrr'] = recall_val

#set file name
recall_df.to_csv('Moodle-req-recall.csv')

#save file - need to specify output path
!cp Moodle-req-recall.csv 'path+filename'