# Fine-tuning BERT model for classifying bug ticket severity

In [2]:
# torch
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

# BERT
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

# progress bar displayed while training
from tqdm import tqdm

import pandas as pd
import numpy as np

# confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

1
NVIDIA GeForce RTX 3050 Ti Laptop GPU
cuda


In [4]:
file_path = "bug_tickets.csv"
data = pd.read_csv(file_path)

data.shape
data.head()

Unnamed: 0.1,Unnamed: 0,summary,severity,keywords,status,description,days_distance,created_year,updated_year,created_month,...,created_trimester,updated_trimester,created_weekday,updated_weekday,created_day_of_trimester,updated_day_of_trimester,assignee_freq,reporter_freq,product_freq,component_freq
0,0,remove INTL_ConvertCharset because it is unused,2,defect,0,Created attachment 384671 remove INTL_ConvertC...,58,9,9,6,...,2,3,5,4,84,51,0.001626,0.002184,0.005565,0.000268
1,1,fix compiler warnings in c-sdk/ldap,2,defect,0,Created attachment 384672 changes comm-central...,103,9,9,6,...,2,4,5,3,84,4,0.001626,0.002184,0.000131,0.000113
2,2,Existing tab is overwritten when opening new m...,2,defect,1,User-Agent: Mozilla/5.0 (Windows; U; Windows N...,4857,9,22,6,...,2,4,5,1,84,10,0.179916,2.9e-05,0.026742,0.001595
3,3,"Compose window will not open when using ""send ...",3,defect,0,User-Agent: Mozilla/5.0 (Windows; U; Windows N...,0,9,9,6,...,2,2,5,5,84,84,0.179916,1.4e-05,0.026742,0.002188
4,4,drag of message from Search Messages fails wit...,3,"regression, defect",0,Error: GetSelectedMessages is not defined Sour...,82,9,9,6,...,2,3,5,3,84,75,0.000446,0.001049,0.005565,0.00353


In [5]:
label_mapping = {
    1: 'feature',
    2: 'minor',
    3: 'normal',
    0: 'critical',
}
data['severity'] = data['severity'].map(label_mapping)

data['text'] = (
    data['summary'].fillna('') + ' ' +
    data['keywords'].fillna('') + ' ' +
    data['description'].fillna('')
)
data[['text', 'severity']]

Unnamed: 0,text,severity
0,remove INTL_ConvertCharset because it is unuse...,minor
1,fix compiler warnings in c-sdk/ldap defect Cre...,minor
2,Existing tab is overwritten when opening new m...,minor
3,"Compose window will not open when using ""send ...",normal
4,drag of message from Search Messages fails wit...,normal
...,...,...
70805,Unable to download programmatically from the T...,normal
70806,Exception in diff view defect As I now grep th...,normal
70807,Focused item of autocomplete menu in diff view...,normal
70808,_libpq_pathname in postgres_backend.py should ...,normal


In [6]:
data['severity'].value_counts()

severity
normal      40783
minor       13048
feature     11169
critical     5810
Name: count, dtype: int64

In [7]:
possible_labels = data['severity'].unique();
label_dict = {'feature':0, 'minor':1, 'normal':2, 'critical':3}

data['label'] = data['severity'].replace(label_dict)

X_train, X_val, y_train, y_val = train_test_split(data.index.values,
                                                  data['label'].values,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=data['label'].values)

data['data_type'] = ['not_set']*data.shape[0]

data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

data = data[['severity','label','data_type','text']]
data.groupby(['severity', 'label', 'data_type']).count()

  data['label'] = data['severity'].replace(label_dict)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
severity,label,data_type,Unnamed: 3_level_1
critical,3,train,4648
critical,3,val,1162
feature,0,train,8935
feature,0,val,2234
minor,1,train,10438
minor,1,val,2610
normal,2,train,32627
normal,2,val,8156


In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    data[data['data_type']=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    data[data['data_type']=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
from torch.utils.data import TensorDataset

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(data[data['data_type']=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(data[data['data_type']=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [10]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 

batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [11]:
from transformers import get_linear_schedule_with_warmup

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=len(label_dict),output_attentions=False,output_hidden_states=False)
model.to(device)

epochs = 10

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total / len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return loss_val_avg, predictions, true_vals

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    print(classification_report(preds_flat, labels_flat, target_names=label_dict))
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
    return np.sum(preds_flat == labels_flat) / len(labels_flat)

import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
from sklearn.metrics import accuracy_score, precision_score

seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

import time
total_t0 = time.time()

for epoch in range(epochs):
    print("")
    print(f'======== Epoch {epoch+1} / {epochs} ========')
    print('Training...')
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch+1), leave=False, disable=False)
    t0 = time.time()
    total_train_loss = 0
    all_preds = []
    all_labels = []
    model.train()
    for batch in progress_bar:
        
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        model.zero_grad()
     
        outputs = model(**inputs)

        loss = outputs[0]
        total_train_loss += loss.item()

        logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()

        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        loss.backward()
        optimizer.step()
        scheduler.step()

########################################

        labels = inputs['labels'].cpu().numpy()
        
        # Update progress bar
        progress_bar.set_postfix({
            'training_loss': '{:.3f}'.format(loss.item() / len(batch))
        })

    avg_train_loss = total_train_loss / len(dataloader_train)
    print("")
    print(f'Average training loss: {avg_train_loss}')
    print(f'Training epoch took: {format_time(time.time()-t0)}')
    print("")
    print('Running Validation...')
    t0 = time.time()
    
    loss_val, predictions, true_vals = evaluate(dataloader_validation)
    val_acc = flat_accuracy(predictions, true_vals)
    print(f'Validation loss: {loss_val}')
    print(f'Validation Accuracy: {val_acc}')
    print(f'Validation epoch took: {format_time(time.time()-t0)}')
    progress_bar.update(1)

    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    # save training stats at last
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': loss_val,
            'Valid. Accur.': val_acc,
            'Training Time': format_time(time.time()-t0),
            'Validation Time': format_time(time.time()-t0)
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


                                                                                   


Average training loss: 0.6687769489174924
Training epoch took: 0:58:11

Running Validation...
              precision    recall  f1-score   support

     feature       0.73      0.78      0.76      2089
       minor       0.53      0.60      0.56      2323
      normal       0.91      0.82      0.86      9104
    critical       0.45      0.80      0.57       646

    accuracy                           0.78     14162
   macro avg       0.66      0.75      0.69     14162
weighted avg       0.80      0.78      0.79     14162

Class: feature
Accuracy: 1639/2234

Class: minor
Accuracy: 1390/2610

Class: normal
Accuracy: 7456/8156

Class: critical
Accuracy: 518/1162

Validation loss: 0.6314546233577683
Validation Accuracy: 0.776938285552888
Validation epoch took: 0:03:26

Training...


                                                                                   


Average training loss: 0.5630321384233528
Training epoch took: 0:57:31

Running Validation...
              precision    recall  f1-score   support

     feature       0.70      0.84      0.76      1868
       minor       0.56      0.60      0.58      2430
      normal       0.92      0.82      0.87      9136
    critical       0.49      0.79      0.61       728

    accuracy                           0.78     14162
   macro avg       0.67      0.76      0.70     14162
weighted avg       0.81      0.78      0.79     14162

Class: feature
Accuracy: 1560/2234

Class: minor
Accuracy: 1467/2610

Class: normal
Accuracy: 7494/8156

Class: critical
Accuracy: 573/1162

Validation loss: 0.5997036988422823
Validation Accuracy: 0.7833639316480723
Validation epoch took: 0:03:35

Training...


                                                                                   


Average training loss: 0.4470862317265109
Training epoch took: 0:59:15

Running Validation...
              precision    recall  f1-score   support

     feature       0.70      0.84      0.76      1855
       minor       0.53      0.62      0.57      2220
      normal       0.92      0.81      0.86      9244
    critical       0.53      0.73      0.62       843

    accuracy                           0.78     14162
   macro avg       0.67      0.75      0.70     14162
weighted avg       0.81      0.78      0.79     14162

Class: feature
Accuracy: 1560/2234

Class: minor
Accuracy: 1379/2610

Class: normal
Accuracy: 7519/8156

Class: critical
Accuracy: 617/1162

Validation loss: 0.6625351852854152
Validation Accuracy: 0.7820223132325943
Validation epoch took: 0:03:48

Training...


                                                                                    


Average training loss: 0.31130739742903957
Training epoch took: 0:59:39

Running Validation...
              precision    recall  f1-score   support

     feature       0.68      0.82      0.75      1856
       minor       0.58      0.57      0.57      2663
      normal       0.88      0.83      0.85      8679
    critical       0.56      0.67      0.61       964

    accuracy                           0.77     14162
   macro avg       0.68      0.72      0.70     14162
weighted avg       0.78      0.77      0.77     14162

Class: feature
Accuracy: 1529/2234

Class: minor
Accuracy: 1510/2610

Class: normal
Accuracy: 7178/8156

Class: critical
Accuracy: 650/1162

Validation loss: 0.7604679271788982
Validation Accuracy: 0.7673351221578874
Validation epoch took: 0:03:38

Training...


                                                                                   


Average training loss: 0.19815438093094218
Training epoch took: 0:57:50

Running Validation...
              precision    recall  f1-score   support

     feature       0.70      0.78      0.74      2007
       minor       0.57      0.57      0.57      2590
      normal       0.87      0.83      0.85      8595
    critical       0.57      0.69      0.62       970

    accuracy                           0.77     14162
   macro avg       0.68      0.72      0.70     14162
weighted avg       0.77      0.77      0.77     14162

Class: feature
Accuracy: 1573/2234

Class: minor
Accuracy: 1488/2610

Class: normal
Accuracy: 7128/8156

Class: critical
Accuracy: 665/1162

Validation loss: 0.897299155552194
Validation Accuracy: 0.7664171727157181
Validation epoch took: 0:03:30

Training...


                                                                                   


Average training loss: 0.12852609626176312
Training epoch took: 0:58:54

Running Validation...
              precision    recall  f1-score   support

     feature       0.69      0.79      0.74      1955
       minor       0.56      0.57      0.57      2587
      normal       0.88      0.82      0.85      8674
    critical       0.56      0.69      0.62       946

    accuracy                           0.76     14162
   macro avg       0.67      0.72      0.69     14162
weighted avg       0.77      0.76      0.77     14162

Class: feature
Accuracy: 1549/2234

Class: minor
Accuracy: 1474/2610

Class: normal
Accuracy: 7153/8156

Class: critical
Accuracy: 651/1162

Validation loss: 1.0148873300301535
Validation Accuracy: 0.7645106623358283
Validation epoch took: 0:03:34

Training...


                                                                                   


Average training loss: 0.0837817540198867
Training epoch took: 0:58:26

Running Validation...
              precision    recall  f1-score   support

     feature       0.71      0.76      0.74      2103
       minor       0.55      0.58      0.57      2470
      normal       0.88      0.83      0.85      8666
    critical       0.55      0.69      0.61       923

    accuracy                           0.77     14162
   macro avg       0.67      0.72      0.69     14162
weighted avg       0.78      0.77      0.77     14162

Class: feature
Accuracy: 1595/2234

Class: minor
Accuracy: 1440/2610

Class: normal
Accuracy: 7165/8156

Class: critical
Accuracy: 640/1162

Validation loss: 1.2204877968783578
Validation Accuracy: 0.7654286117779975
Validation epoch took: 0:03:32

Training...


                                                                                     


Average training loss: 0.054053249243607764
Training epoch took: 1:00:10

Running Validation...
              precision    recall  f1-score   support

     feature       0.72      0.76      0.74      2096
       minor       0.54      0.58      0.56      2426
      normal       0.88      0.82      0.85      8787
    critical       0.53      0.72      0.61       853

    accuracy                           0.76     14162
   macro avg       0.67      0.72      0.69     14162
weighted avg       0.78      0.76      0.77     14162

Class: feature
Accuracy: 1600/2234

Class: minor
Accuracy: 1400/2610

Class: normal
Accuracy: 7210/8156

Class: critical
Accuracy: 615/1162

Validation loss: 1.3087178790483873
Validation Accuracy: 0.7643694393447253
Validation epoch took: 0:03:27

Training...


                                                                                    


Average training loss: 0.03450054061748316
Training epoch took: 0:57:19

Running Validation...
              precision    recall  f1-score   support

     feature       0.71      0.76      0.73      2069
       minor       0.53      0.59      0.56      2325
      normal       0.88      0.82      0.85      8770
    critical       0.56      0.66      0.61       998

    accuracy                           0.76     14162
   macro avg       0.67      0.71      0.69     14162
weighted avg       0.78      0.76      0.77     14162

Class: feature
Accuracy: 1580/2234

Class: minor
Accuracy: 1373/2610

Class: normal
Accuracy: 7214/8156

Class: critical
Accuracy: 656/1162

Validation loss: 1.4417113799380665
Validation Accuracy: 0.7642282163536224
Validation epoch took: 0:03:23

Training...


                                                                                    


Average training loss: 0.023557862836267943
Training epoch took: 0:56:51

Running Validation...
              precision    recall  f1-score   support

     feature       0.71      0.78      0.74      2018
       minor       0.55      0.58      0.57      2458
      normal       0.88      0.82      0.85      8734
    critical       0.56      0.68      0.61       952

    accuracy                           0.77     14162
   macro avg       0.67      0.72      0.69     14162
weighted avg       0.78      0.77      0.77     14162

Class: feature
Accuracy: 1575/2234

Class: minor
Accuracy: 1433/2610

Class: normal
Accuracy: 7191/8156

Class: critical
Accuracy: 646/1162

Validation loss: 1.4985755314940652
Validation Accuracy: 0.7657816692557549
Validation epoch took: 0:03:26

Training complete!
Total training took 10:19:32 (h:mm:ss)
