In [1]:
import sys
import os
import time
import importlib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
from torch.utils.data.distributed import DistributedSampler
from torch.utils import data
from tqdm import tqdm, trange
import collections

In [2]:
cuda_yes = torch.cuda.is_available()
# Keep the original preference for the second GPU (index 1), but fall back to
# GPU 0 when the host exposes a single CUDA device; "cuda:1" would otherwise
# only fail later, when the first tensor or module is moved to it.
if cuda_yes:
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device("cuda:{}".format(gpu_index))
else:
    device = torch.device("cpu")
print('Device:', device)

Device: cuda:1


In [3]:
from transformers import AdamW
from random import shuffle

In [4]:
class InputExample:
    """One sentence of an NER data set: its words and their tag sequence."""

    def __init__(self, guid, words, labels):
        """Store the identifier, words and labels of one example.

        Args:
          guid: Identifier of the example.
          words: List of the words of the sentence, for instance
            [EU, rejects, German, call, to, boycott, British, lamb, .].
          labels: List holding the label sequence of the sentence, for
            instance [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O].
        """
        self.guid, self.words, self.labels = guid, words, labels

In [5]:
class InputFeatures:
    """Container for the feature lists built from one ``InputExample``.

    The constructor keeps its five arguments unchanged as attributes of the
    same name.
    """

    def __init__(self, input_ids, input_mask, segment_ids,  predict_mask, label_ids):
        # Model inputs.
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids, input_mask, segment_ids)
        # Tagging targets and the mask stored alongside them.
        self.predict_mask, self.label_ids = predict_mask, label_ids

In [6]:
from sklearn.model_selection import train_test_split

class CoNLLDataProcessor():
    '''
    Splits ``[words, ..., labels]`` records into train/valid/test sets and
    exposes the BIO label vocabulary used for tagging.

    Despite the class name, the label set below is not the CoNLL-2003 one: it
    holds nine entity types (B-/I- variants) plus '[CLS]', '[SEP]' and 'O'.
    '''

    def __init__(self, out_lists):
        """Build the label vocabulary and split ``out_lists``.

        Args:
          out_lists: Sequence of records; the first item of a record is the
            word list and the last item is the label list.

        The split is deterministic (``random_state=1``): 20% of the records
        become the test set, and 25% of the remaining 80% (i.e. 20% of the
        total) become the validation set, leaving 60% for training.
        """
        self.data = out_lists
        # The position of a label in this list is its integer id, so the
        # order must not change.
        self._label_types = ['B-Command', 'B-Error', 'B-Extension', 'B-Software_Component', 'B-Peripheral',
       'B-OS', 'B-Package', 'B-Architecture', 'B-Organization', 'I-Command', 'I-Error', 'I-Extension',
       'I-Software_Component', 'I-Peripheral', 'I-OS', 'I-Package', 'I-Architecture', 'I-Organization',
                             '[CLS]' , '[SEP]' , 'O']
        self._num_labels = len(self._label_types)
        self._label_map = {label: i for i,
                           label in enumerate(self._label_types)}
        self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=1)
        self.train_data, self.valid_data = train_test_split(self.train_data, test_size=0.25, random_state=1)

    def get_train_examples(self):
        """Return the training split as a list of ``InputExample``."""
        return self._create_examples(self.train_data)

    def get_valid_examples(self):
        """Return the validation split as a list of ``InputExample``."""
        return self._create_examples(self.valid_data)

    def get_test_examples(self):
        """Return the test split as a list of ``InputExample``."""
        return self._create_examples(self.test_data)

    def get_labels(self):
        """Return the list of label strings, ordered by label id."""
        return self._label_types

    def get_num_labels(self):
        """Return the number of labels, including '[CLS]', '[SEP]' and 'O'."""
        return self._num_labels

    def get_label_map(self):
        """Return the dict mapping each label string to its integer id."""
        return self._label_map

    def get_start_label_id(self):
        """Return the id of the '[CLS]' label."""
        return self._label_map['[CLS]']

    def get_stop_label_id(self):
        """Return the id of the '[SEP]' label."""
        return self._label_map['[SEP]']

    def _create_examples(self, all_lists):
        """Wrap every record in an ``InputExample``.

        The guid of an example is its position inside ``all_lists``; the
        words are the first item of the record and the labels the last one.
        """
        return [
            InputExample(guid=i, words=one_lists[0], labels=one_lists[-1])
            for i, one_lists in enumerate(all_lists)
        ]

    def _create_examples2(self, lines):
        """Alias of ``_create_examples`` kept for backward compatibility.

        The previous implementation passed ``text_a=``/``labels_a=`` to
        ``InputExample``, whose constructor only accepts ``words`` and
        ``labels``, so every call raised ``TypeError``.
        """
        return self._create_examples(lines)

In [7]:
import pandas as pd
def example2feature(example, tokenizer, label_map, max_seq_length):
    """Convert one example into (unpadded) sub-word level features.

    Args:
      example: Object with ``words`` and ``labels`` sequences of equal length.
      tokenizer: Object providing ``tokenize(str)`` and
        ``convert_tokens_to_ids(list)``.
      label_map: Dict mapping label strings (including '[CLS]' and '[SEP]')
        to integer ids.
      max_seq_length: Upper bound on the number of tokens, counting the
        leading '[CLS]' and the trailing '[SEP]'.

    Returns:
      InputFeatures whose five lists all have the same length; no padding
      is applied here.

    Raises:
      KeyError: If a label of the example is missing from ``label_map``.
      IndexError: If ``example.labels`` is shorter than ``example.words``.
    """
    tokens = ['[CLS]']
    predict_mask = [0]
    label_ids = [label_map['[CLS]']]

    for i, w in enumerate(example.words):
        # use the tokenizer to split words into sub-words, e.g.
        # 1996-08-22 => 1996 - 08 - 22
        # sheepmeat => sheep ##me ##at
        if not isinstance(w, str) and pd.isna(w):
            # Missing cell (NaN/None) in the source table.
            sub_words = ['[UNK]']
        else:
            # Non-string cells (e.g. numbers) are tokenized via their text
            # form instead of being handed to the tokenizer as-is.
            sub_words = tokenizer.tokenize(w if isinstance(w, str) else str(w))
        if not sub_words:
            sub_words = ['[UNK]']

        tokens.extend(sub_words)
        label_id = label_map[example.labels[i]]
        # Only the first sub-word of a word is flagged in predict_mask; every
        # sub-word carries the label id of its word.
        predict_mask.extend([1] + [0] * (len(sub_words) - 1))
        label_ids.extend([label_id] * len(sub_words))

    # truncate, keeping one slot free for the closing '[SEP]'
    if len(tokens) > max_seq_length - 1:
        tokens = tokens[0:(max_seq_length - 1)]
        predict_mask = predict_mask[0:(max_seq_length - 1)]
        label_ids = label_ids[0:(max_seq_length - 1)]
    tokens.append('[SEP]')
    predict_mask.append(0)
    label_ids.append(label_map['[SEP]'])

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(input_ids)
    input_mask = [1] * len(input_ids)

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        predict_mask=predict_mask,
        label_ids=label_ids)

In [8]:
class NerDataset(data.Dataset):
    """Dataset that turns stored examples into feature lists on access."""

    def __init__(self, examples, tokenizer, label_map, max_seq_length):
        self.examples = examples
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Featurize example ``idx`` and return its five lists as a tuple."""
        features = example2feature(
            self.examples[idx], self.tokenizer, self.label_map, self.max_seq_length)
        return (features.input_ids, features.input_mask, features.segment_ids,
                features.predict_mask, features.label_ids)

    @classmethod
    def pad(cls, batch):
        """Collate ``batch`` into five LongTensors of equal row length.

        Every field of every sample is right-padded with 0 up to the length
        of the longest ``input_ids`` list in the batch. The returned tuple is
        ordered as (input_ids, input_mask, segment_ids, predict_mask,
        label_ids).
        """
        maxlen = np.array([len(sample[0]) for sample in batch]).max()

        def padded_tensor(field):
            # 0 is the padding value for all five fields
            rows = [sample[field] + [0] * (maxlen - len(sample[field]))
                    for sample in batch]
            return torch.LongTensor(rows)

        return tuple(padded_tensor(field) for field in range(5))

In [9]:
import pandas as pd
# Annotation table; the displayed slice shows one row per token with, among
# others, the columns Bug Id, Entity, POS Tag and BIO Tag.
annotations_csv = '~/sr_drive/Avik/Annotations/results/updated_merged_BIO_tagging_removing_verb_adjacent_pos_tags.csv'
data_df = pd.read_csv(annotations_csv)
data_df.iloc[10:20]

Unnamed: 0,Bug Id,Start Index,End Index,Entity,POS Tag,BIO Tag,year
10,143211,67,72,which,WDT,O,2004
11,143211,73,75,is,VBZ,O,2004
12,143211,76,82,called,VBN,O,2004
13,143211,83,85,on,IN,O,2004
14,143211,86,89,dav,NN,O,2004
15,143211,89,91,'s,POS,O,2004
16,143211,92,99,PROPGET,NNP,O,2004
17,143211,100,102,to,TO,O,2004
18,143211,103,111,retrieve,VB,O,2004
19,143211,112,115,the,DT,O,2004


In [10]:
# Collect, per bug report, the token / POS-tag / BIO-tag sequences as lists
# (row order inside a group follows the order of rows in data_df).
# The column subset is selected with a list: indexing a GroupBy with a bare
# tuple of keys (['Entity', 'POS Tag', 'BIO Tag'] without the extra brackets)
# is deprecated in pandas 1.x and no longer selects multiple columns in 2.x.
data_group = (
    data_df
    .groupby(['Bug Id'], as_index=False)[['Entity', 'POS Tag', 'BIO Tag']]
    .agg(list)
)

data_group.head()

  data_group = data_df.groupby(


Unnamed: 0,Bug Id,Entity,POS Tag,BIO Tag
0,501,"[Like, subscribing, to, a, bug, ,, I, 'd, like...","[IN, VBG, TO, DT, NN, ,, PRP, MD, VB, TO, VB, ...","[O, O, O, O, B-Package, O, O, O, O, O, O, O, O..."
1,3165,"[Symptoms, ========, Launchpad, sends, notific...","[NNS, VBP, NNP, VBZ, NNS, TO, NNS, IN, JJ, NNS...","[O, O, B-Organization, O, O, O, O, O, O, O, O,..."
2,3651,"[I, 'm, looking, at, bzr, baz-import, ,, which...","[PRP, VBP, VBG, IN, JJ, NN, ,, WDT, VBZ, IN, P...","[O, O, O, O, B-Package, B-Command, O, O, O, O,..."
3,3666,"['Track, Artist, ', is, a, great, concept, and...","[CD, NNP, '', VBZ, DT, JJ, NN, CC, PRP, VBP, P...","[O, B-Package, O, O, O, O, O, O, O, O, O, O, O..."
4,3718,"[While, installing, packages, ,, one, may, exp...","[IN, VBG, NNS, ,, CD, MD, VB, DT, NN, TO, VB, ...","[O, O, O, O, O, O, O, O, B-Package, O, O, O, B..."


In [11]:
# Alternative location used when running on Colab:
# folder_path = '/content/drive/MyDrive/BTP/bugdataset/'
folder_path = '~/sr_drive/Avik/Annotations/'
bug_description_path = folder_path

bug_description_df = pd.read_csv(bug_description_path + 'descriptions_merged.csv')

# Print the description of every row that belongs to bug 18886.
is_bug_18886 = bug_description_df['Bug Id'] == 18886
for description in bug_description_df.loc[is_bug_18886, 'Description']:
    print(description)

If I insert in my CD-RW drive a CD labelled "1", then click on the diskmounter
applet, the 2 choices I get will be "Open CD-RW(null) Drive" and "Eject
CD-RW(null) Drive".  In Hoary, the choices would be the nicer "Open 1" and
"Eject 1".

http://bugzilla.gnome.org/show_bug.cgi?id=310300: http://bugzilla.gnome.org/show_bug.cgi?id=310300


In [12]:
import pandas as pd
# One [words, BIO tags] record per bug report, in the row order of data_group.
out_lists = []
flag = 0  # not read anywhere in this cell
out_lists.extend(
    [entities, bio_tags]
    for entities, bio_tags in zip(data_group['Entity'], data_group['BIO Tag'])
)

In [13]:
def printDistribution(examples):
    """Print a dict with the number of occurrences of every label.

    ``examples`` is an iterable of objects exposing a ``labels`` sequence.
    Keys appear in the order in which the labels are first encountered.
    """
    label_frequency = {}
    for example in examples:
        for label in example.labels:
            # start at zero for labels seen for the first time
            label_frequency[label] = label_frequency.get(label, 0) + 1

    print(label_frequency)

In [14]:
# Build the processor once, then read the label vocabulary and the three
# example splits from it.
conllProcessor = CoNLLDataProcessor(out_lists)
label_list, label_map = conllProcessor.get_labels(), conllProcessor.get_label_map()
train_examples, valid_examples, test_examples = (
    conllProcessor.get_train_examples(),
    conllProcessor.get_valid_examples(),
    conllProcessor.get_test_examples(),
)

In [19]:
from transformers import AutoModel, AutoTokenizer
# Load the tokenizer published under the same 'SpanBERT/spanbert-base-cased'
# identifier that SpanBERT_CRF uses for its encoder.
tokenizer = AutoTokenizer.from_pretrained('SpanBERT/spanbert-base-cased')

In [20]:
# Maximum number of tokens per example, '[CLS]' and '[SEP]' included; longer
# examples are truncated by example2feature.
# NOTE(review): 512 presumably matches the encoder's maximum input length;
# confirm against the model configuration.
MAX_SEQ_LENGTH = 512

train_dataset = NerDataset(train_examples, tokenizer, label_map, MAX_SEQ_LENGTH)
valid_dataset = NerDataset(valid_examples, tokenizer, label_map, MAX_SEQ_LENGTH)
test_dataset = NerDataset(test_examples, tokenizer, label_map, MAX_SEQ_LENGTH)

In [22]:
# Number of examples per mini-batch; shared by the train, validation and test
# DataLoaders.
batch_size = 4

In [23]:
def _make_loader(dataset, reshuffle):
    """Wrap ``dataset`` in a DataLoader that pads batches with NerDataset.pad."""
    return data.DataLoader(dataset=dataset,
                           batch_size=batch_size,
                           shuffle=reshuffle,
                           num_workers=1,
                           collate_fn=NerDataset.pad)


# Only the training data is reshuffled; validation and test keep their order.
train_dataloader = _make_loader(train_dataset, reshuffle=True)
valid_dataloader = _make_loader(valid_dataset, reshuffle=False)
test_dataloader = _make_loader(test_dataset, reshuffle=False)

In [24]:
class SpanBERT_CRF(nn.Module):
    """Pretrained encoder + dropout + linear emission layer + CRF layer."""

    def __init__(self, num_labels, dropout=0.1,
                 model_name='SpanBERT/spanbert-base-cased', crf_cls=None):
        """Build the tagger.

        Args:
          num_labels: Number of tags; size of the emission layer output.
          dropout: Dropout probability applied to the encoder output.
          model_name: Identifier handed to ``AutoModel.from_pretrained``.
          crf_cls: Class used to build the CRF layer; it is called as
            ``crf_cls(num_labels, batch_first=True)``. When omitted,
            ``torch.nn.CRF`` is looked up as before.
            NOTE(review): the call pattern used here matches ``torchcrf.CRF``
            from the pytorch-crf package; confirm which implementation was
            intended.

        Raises:
          AttributeError: If no ``crf_cls`` is given and ``torch.nn`` has no
            ``CRF`` attribute. This is checked before the encoder weights are
            loaded so the failure is immediate.
        """
        super().__init__()
        if crf_cls is None:
            crf_cls = getattr(nn, 'CRF', None)
        if crf_cls is None:
            raise AttributeError(
                "module 'torch.nn' has no attribute 'CRF'; pass crf_cls=<CRF class> "
                "(for example torchcrf.CRF from the pytorch-crf package)")

        self.num_labels = num_labels
        self.spanbert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(self.spanbert.config.hidden_size, num_labels)
        self.crf = crf_cls(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the training loss or the decoded tag sequences.

        Args:
          input_ids: Token ids passed to the encoder.
          attention_mask: Mask passed to the encoder and, converted with
            ``.byte()``, to the CRF layer.
          labels: Optional gold label ids.

        Returns:
          With ``labels``: the negated result of calling the CRF layer on the
          emissions and labels. Without ``labels``: the result of
          ``crf.decode`` on the emissions.
        """
        outputs = self.spanbert(input_ids, attention_mask=attention_mask)
        # First element of the encoder output, fed through dropout and the
        # linear layer to obtain per-token emission scores.
        sequence_output = self.dropout(outputs[0])
        emissions = self.classifier(sequence_output)
        crf_mask = attention_mask.byte()

        if labels is not None:
            return -self.crf(emissions, labels, mask=crf_mask)
        return self.crf.decode(emissions, mask=crf_mask)

In [None]:
# `num_labels` is not assigned in any earlier cell of this notebook, so take
# it from the processor's label vocabulary to keep a fresh
# "Restart & Run All" working.
num_labels = conllProcessor.get_num_labels()
model = SpanBERT_CRF(num_labels=num_labels)