In [None]:
# 1. change BERT model from multilingual to uncased
# 2. analyse reason for 'nan'

In [None]:
# reference: https://www.kaggle.com/code/realdeo/ner-crf-pytorch

In [1]:
import sys
import os
import time
import importlib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
from torch.utils.data.distributed import DistributedSampler
from torch.utils import data
from tqdm import tqdm, trange
import collections

In [2]:
cuda_yes = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_yes else "cpu")
print('Device:', device)

Device: cuda:0


In [3]:
from transformers import AdamW

In [4]:
from random import shuffle

In [5]:
class InputExample(object):
    """A single training/test example for NER."""

    def __init__(self, guid, words, labels):
        """Constructs a InputExample.
        Args:
          guid: Unique id for the example(a sentence or a pair of sentences).
          words: list of words of sentence
          labels_a/labels_b: (Optional) string. The label seqence of the text_a/text_b. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        # list of words of the sentence,example: [EU, rejects, German, call, to, boycott, British, lamb .]
        self.words = words
        # list of label sequence of the sentence,like: [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]
        self.labels = labels

In [6]:
class InputFeatures(object):
    """A single set of features of data.
    result of convert_examples_to_features(InputExample)
    """

    def __init__(self, input_ids, input_mask, segment_ids,  predict_mask, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.predict_mask = predict_mask
        self.label_ids = label_ids

In [7]:
from sklearn.model_selection import train_test_split

class CoNLLDataProcessor():
    '''
    CoNLL-2003
    '''

    def __init__(self, out_lists):
        self.data = out_lists
#         shuffle(self.data)
        self._label_types = ['B-Command', 'B-Error', 'B-Extension', 'B-Software_Component', 'B-Peripheral',
       'B-OS', 'B-Package', 'B-Architecture', 'B-Organization', 'I-Command', 'I-Error', 'I-Extension',
       'I-Software_Component', 'I-Peripheral', 'I-OS', 'I-Package', 'I-Architecture', 'I-Organization', '[CLS]', '[SEP]','O']
        self._num_labels = len(self._label_types)
        self._label_map = {label: i for i,
                           label in enumerate(self._label_types)}
        self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=1)
        self.train_data, self.valid_data = train_test_split(self.train_data, test_size=0.25, random_state=1)

    def get_train_examples(self):
#         return self._create_examples(self.data[:len(self.data) * 3 // 5 ])
        return self._create_examples(self.train_data)
    
    def get_valid_examples(self):
#         return self._create_examples(self.data[len(self.data) * 3 // 5:len(self.data) * 4 // 5])
        return self._create_examples(self.valid_data)
    
    def get_test_examples(self):
#         return self._create_examples(self.data[len(self.data) * 4 // 5:])
        return self._create_examples(self.test_data)

    def get_labels(self):
        return self._label_types

    def get_num_labels(self):
        return self._num_labels
    
    def get_label_map(self):
        return self._label_map
    
    def get_start_label_id(self):
        return self._label_map['[CLS]']

    def get_stop_label_id(self):
        return self._label_map['[SEP]']

    def _create_examples(self, all_lists):
        examples = []
        for (i, one_lists) in enumerate(all_lists):
            guid = i
            words = one_lists[0]
            labels = one_lists[-1]
            examples.append(InputExample(
                guid=guid, words=words, labels=labels))
        return examples

    def _create_examples2(self, lines):
        examples = []
        for (i, line) in enumerate(lines):
            guid = i
            text = line[0]
            ner_label = line[-1]
            examples.append(InputExample(
                guid=guid, text_a=text, labels_a=ner_label))
        return examples

In [8]:
import pandas as pd
def example2feature(example, tokenizer, label_map, max_seq_length):

    add_label = 'X'
    # tokenize_count = []
    tokens = ['[CLS]']
    predict_mask = [0]
    label_ids = [label_map['[CLS]']]
    for i, w in enumerate(example.words):
        # use bertTokenizer to split words
        # 1996-08-22 => 1996 - 08 - 22
        # sheepmeat => sheep ##me ##at
        try:
            if(pd.isna(w)):
                sub_words = ['[UNK]']
            else:
                sub_words = tokenizer.tokenize(w)
        except:
            print(pd.isna(w))
            print(type(w))
            sub_words = tokenizer.tokenize(w)
        if not sub_words:
            sub_words = ['[UNK]']
        # tokenize_count.append(len(sub_words))
        tokens.extend(sub_words)
        for j in range(len(sub_words)):
            if j == 0:
                predict_mask.append(1)
                label_ids.append(label_map[example.labels[i]])
            else:
                # '##xxx' -> 'X' (see bert paper)
                predict_mask.append(0)
                label_ids.append(label_map[example.labels[i]])

    # truncate
    if len(tokens) > max_seq_length - 1:
        #print('Example No.{} is too long, length is {}, truncated to {}!'.format(example.guid, len(tokens), max_seq_length))
        tokens = tokens[0:(max_seq_length - 1)]
        predict_mask = predict_mask[0:(max_seq_length - 1)]
        label_ids = label_ids[0:(max_seq_length - 1)]
    tokens.append('[SEP]')
    predict_mask.append(0)
    label_ids.append(label_map['[SEP]'])

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * len(input_ids)
    input_mask = [1] * len(input_ids)

    feat=InputFeatures(
                # guid=example.guid,
                # tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                predict_mask=predict_mask,
                label_ids=label_ids)

    return feat

In [9]:
class NerDataset(data.Dataset):
    def __init__(self, examples, tokenizer, label_map, max_seq_length):
        self.examples=examples
        self.tokenizer=tokenizer
        self.label_map=label_map
        self.max_seq_length=max_seq_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        feat=example2feature(self.examples[idx], self.tokenizer, 
                             self.label_map, self.max_seq_length)
        return feat.input_ids, feat.input_mask, feat.segment_ids, feat.predict_mask, feat.label_ids

    @classmethod
    def pad(cls, batch):

        seqlen_list = [len(sample[0]) for sample in batch]
        maxlen = np.array(seqlen_list).max()

        f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: X for padding
        input_ids_list = torch.LongTensor(f(0, maxlen))
        input_mask_list = torch.LongTensor(f(1, maxlen))
        segment_ids_list = torch.LongTensor(f(2, maxlen))
        predict_mask_list = torch.LongTensor(f(3, maxlen))
        label_ids_list = torch.LongTensor(f(4, maxlen))

        return input_ids_list, input_mask_list, segment_ids_list, predict_mask_list, label_ids_list

In [10]:
import pandas as pd
data_df = pd.read_csv('~/sr_drive/Avik/Annotations/results/updated_merged_BIO_tagging_removing_verb_adjacent_pos_tags.csv')
data_df.iloc[10:20]

Unnamed: 0,Bug Id,Start Index,End Index,Entity,POS Tag,BIO Tag,year
10,143211,67,72,which,WDT,O,2004
11,143211,73,75,is,VBZ,O,2004
12,143211,76,82,called,VBN,O,2004
13,143211,83,85,on,IN,O,2004
14,143211,86,89,dav,NN,O,2004
15,143211,89,91,'s,POS,O,2004
16,143211,92,99,PROPGET,NNP,O,2004
17,143211,100,102,to,TO,O,2004
18,143211,103,111,retrieve,VB,O,2004
19,143211,112,115,the,DT,O,2004


In [11]:
data_group = data_df.groupby(
['Bug Id'],as_index=False
)['Entity', 'POS Tag', 'BIO Tag'].agg(lambda x: list(x))

data_group.head()

  data_group = data_df.groupby(


Unnamed: 0,Bug Id,Entity,POS Tag,BIO Tag
0,501,"[Like, subscribing, to, a, bug, ,, I, 'd, like...","[IN, VBG, TO, DT, NN, ,, PRP, MD, VB, TO, VB, ...","[O, O, O, O, B-Package, O, O, O, O, O, O, O, O..."
1,3165,"[Symptoms, ========, Launchpad, sends, notific...","[NNS, VBP, NNP, VBZ, NNS, TO, NNS, IN, JJ, NNS...","[O, O, B-Organization, O, O, O, O, O, O, O, O,..."
2,3651,"[I, 'm, looking, at, bzr, baz-import, ,, which...","[PRP, VBP, VBG, IN, JJ, NN, ,, WDT, VBZ, IN, P...","[O, O, O, O, B-Package, B-Command, O, O, O, O,..."
3,3666,"['Track, Artist, ', is, a, great, concept, and...","[CD, NNP, '', VBZ, DT, JJ, NN, CC, PRP, VBP, P...","[O, B-Package, O, O, O, O, O, O, O, O, O, O, O..."
4,3718,"[While, installing, packages, ,, one, may, exp...","[IN, VBG, NNS, ,, CD, MD, VB, DT, NN, TO, VB, ...","[O, O, O, O, O, O, O, O, B-Package, O, O, O, B..."


In [12]:
# folder_path = '/content/drive/MyDrive/BTP/bugdataset/'
folder_path = '~/sr_drive/Avik/Annotations/'
bug_description_path = folder_path

bug_description_df = pd.read_csv(bug_description_path + 'descriptions_merged.csv')

for _, row in bug_description_df.iterrows():
    if(row['Bug Id'] == 18886):
        print(row['Description'])

If I insert in my CD-RW drive a CD labelled "1", then click on the diskmounter
applet, the 2 choices I get will be "Open CD-RW(null) Drive" and "Eject
CD-RW(null) Drive".  In Hoary, the choices would be the nicer "Open 1" and
"Eject 1".

http://bugzilla.gnome.org/show_bug.cgi?id=310300: http://bugzilla.gnome.org/show_bug.cgi?id=310300


In [13]:
for _, row in bug_description_df.iterrows():
    if(row['Bug Id'] == 75641):
        print(row['Description'])

I installed the lkl 0.1.1-1 package from the universe repository. I'm running edgy on a Delll Inspron 6000. I setup lkl to run as a daemon normally started by init, but I can stop and start it with /etc/init.d/lkl stop/start.

When tailing the outfile (/var/log/lkl/lkl.log) Everything starts out correctly, but almost immediately the time/date stamp disappears and an array of other characters begin to appear:

root@paule2:~ # tail -F /var/log/lkl/lkl.log 
Wed Dec 13 12:28:18 2006
wow<Ret>
Wed Dec 13 12:28:28 2006
sure<Ret>
Wed Dec 13 12:28:36 2006
<Ctrl>t<Tab><Ret>
Wed Dec 13 12:29:14 2006
<Tab>kk<Ret>
Wed Dec 13 12:29:41 2006
<Alt>NULL<Alt>NULLNULL���NULLNULLNULLNULL<Alt>NULL<Alt>NULL<Alt> MSD<Tab><Tab>DYNULL��NULLNULLEuroNULL�NULLNULLNULL���NULL ���NULLNULLNULL�EuroEuroNULL�NULL�<Shift>"}}<Shift>{M 


In [14]:
import pandas as pd
out_lists = []
flag = 0
for idx, row in data_group.iterrows():
#     for w_idx, w in enumerate(row['Entity']):
#         if(pd.isna(w)):
#             print(w_idx)
#             print(row['Entity'])
#             print(row['Bug Id'])
#             print(row['BIO Tag'][w_idx])
#             flag += 1
#             break
#     if flag == 2:
#         break
    out_lists.append([row['Entity'], row['BIO Tag']])

In [15]:
def printDistribution(examples):
    labels = []
    for example in examples:
        labels.extend(example.labels)
    label_frequency = {}

    # iterating over the list
    for label in labels:
       # checking the element in dictionary
        if label in label_frequency:
          # incrementing the counr
            label_frequency[label] += 1
        else:
          # initializing the count
            label_frequency[label] = 1

    # printing the frequency
    print(label_frequency)

In [16]:
conllProcessor = CoNLLDataProcessor(out_lists)
label_list = conllProcessor.get_labels()
label_map = conllProcessor.get_label_map()
train_examples = conllProcessor.get_train_examples()
valid_examples = conllProcessor.get_valid_examples()
test_examples = conllProcessor.get_test_examples()

In [17]:
print(f'Number of training examples: {len(train_examples)}')
printDistribution(train_examples)

Number of training examples: 959
{'O': 207233, 'B-OS': 2461, 'I-OS': 67, 'B-Command': 13304, 'B-Package': 12050, 'B-Peripheral': 803, 'B-Architecture': 794, 'B-Software_Component': 496, 'B-Extension': 505, 'I-Command': 583, 'B-Error': 92, 'I-Error': 257, 'B-Organization': 97, 'I-Package': 29}


In [18]:
print(f'Number of validation examples: {len(valid_examples)}')
printDistribution(valid_examples)

Number of validation examples: 320
{'O': 70665, 'B-Command': 4549, 'B-Package': 4262, 'B-OS': 826, 'B-Architecture': 270, 'B-Peripheral': 262, 'B-Software_Component': 196, 'B-Extension': 360, 'I-Command': 222, 'B-Organization': 24, 'B-Error': 17, 'I-Error': 29, 'I-OS': 24, 'I-Package': 21, 'I-Peripheral': 2, 'I-Architecture': 1}


In [19]:
print(f'Number of test examples: {len(test_examples)}')
printDistribution(test_examples)

Number of test examples: 320
{'O': 75852, 'B-Package': 4477, 'B-OS': 816, 'B-Command': 4924, 'B-Architecture': 290, 'B-Peripheral': 310, 'B-Software_Component': 185, 'B-Error': 34, 'I-Error': 76, 'B-Organization': 30, 'B-Extension': 246, 'I-Command': 44, 'I-OS': 9, 'I-Package': 2, 'I-Peripheral': 1}


In [20]:
len(train_examples)

959

In [21]:
# label_list.remove('[CLS]')
# label_list.remove('[SEP]')

In [22]:
label_list

['B-Command',
 'B-Error',
 'B-Extension',
 'B-Software_Component',
 'B-Peripheral',
 'B-OS',
 'B-Package',
 'B-Architecture',
 'B-Organization',
 'I-Command',
 'I-Error',
 'I-Extension',
 'I-Software_Component',
 'I-Peripheral',
 'I-OS',
 'I-Package',
 'I-Architecture',
 'I-Organization',
 '[CLS]',
 '[SEP]',
 'O']

In [23]:
from transformers import AutoTokenizer

#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")

train_dataset = NerDataset(train_examples, tokenizer, label_map, 512)
valid_dataset = NerDataset(valid_examples, tokenizer, label_map, 512)
test_dataset = NerDataset(test_examples, tokenizer, label_map, 512)

In [24]:
batch_size = 4

In [25]:
train_dataloader = data.DataLoader(dataset=train_dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=1,
                                collate_fn=NerDataset.pad)

valid_dataloader = data.DataLoader(dataset=valid_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=NerDataset.pad)

test_dataloader = data.DataLoader(dataset=test_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=NerDataset.pad)

In [26]:
from transformers import AutoModel

#Change to bert-base-uncased
#bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("SpanBERT/spanbert-base-cased")

# bert_model.config.max_position_embeddings = 1024
# bert_model.embeddings.position_ids = 1024
# bert_model.embeddings.position_embeddings.weight.data = 1024

Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def log_sum_exp_1vec(vec):  # shape(1,m)
    max_score = vec[0, np.argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

def log_sum_exp_mat(log_M, axis=-1):  # shape(n,m)
    return torch.max(log_M, axis)[0]+torch.log(torch.exp(log_M-torch.max(log_M, axis)[0][:, None]).sum(axis))

def log_sum_exp_batch(log_Tensor, axis=-1): # shape (batch_size,n,m)
    return torch.max(log_Tensor, axis)[0]+torch.log(torch.exp(log_Tensor-torch.max(log_Tensor, axis)[0].view(log_Tensor.shape[0],-1,1)).sum(axis))

In [28]:
class BERT_CRF_NER(nn.Module):

    def __init__(self, bert_model, start_label_id, stop_label_id, num_labels, max_seq_length, batch_size, device):
        super(BERT_CRF_NER, self).__init__()
        self.hidden_size = 768
        self.start_label_id = start_label_id
        self.stop_label_id = stop_label_id
        self.num_labels = num_labels
        #self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.device=device

        # use pretrainded BertModel 
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.2)
        # Maps the output of the bert into label space.
        self.hidden2label = nn.Linear(self.hidden_size, self.num_labels)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.num_labels, self.num_labels))

        # These two statements enforce the constraint that we never transfer *to* the start tag(or label),
        # and we never transfer *from* the stop label (the model would probably learn this anyway,
        # so this enforcement is likely unimportant)
        self.transitions.data[start_label_id, :] = -10000
        self.transitions.data[:, stop_label_id] = -10000

        nn.init.xavier_uniform_(self.hidden2label.weight)
        nn.init.constant_(self.hidden2label.bias, 0.0)
        # self.apply(self.init_bert_weights)

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)): 
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _forward_alg(self, feats):
        '''
        this also called alpha-recursion or forward recursion, to calculate log_prob of all barX 
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]
        
        # alpha_recursion,forward, alpha(zt)=p(zt,bar_x_1:t)
        log_alpha = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        # normal_alpha_0 : alpha[0]=Ot[0]*self.PIs
        # self.start_label has all of the score. it is log,0 is p=1
        log_alpha[:, 0, self.start_label_id] = 0
        
        # feats: sentances -> word embedding -> lstm -> MLP -> feats
        # feats is the probability of emission, feat.shape=(1,tag_size)
        for t in range(1, T):
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        # log_prob of all barX
        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _get_bert_features(self, input_ids, segment_ids, input_mask):
        '''
        sentances -> word embedding -> lstm -> MLP -> feats
        '''
        #bert_seq_out = self.bert(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)[0]
        bert_seq_out = self.bert(input_ids, attention_mask=input_mask)[0] # for spanBert
        bert_seq_out = self.dropout(bert_seq_out)
        bert_feats = self.hidden2label(bert_seq_out)
        return bert_feats

    def _score_sentence(self, feats, label_ids):
        ''' 
        Gives the score of a provided label sequence
        p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        batch_transitions = self.transitions.expand(batch_size,self.num_labels,self.num_labels)
        batch_transitions = batch_transitions.flatten(1)

        score = torch.zeros((feats.shape[0],1)).to(device)
        # the 0th node is start_label->start_word,the probability of them=1. so t begin with 1.
        for t in range(1, T):
            score = score + \
                batch_transitions.gather(-1, (label_ids[:, t]*self.num_labels+label_ids[:, t-1]).view(-1,1)) \
                    + feats[:, t].gather(-1, label_ids[:, t].view(-1,1)).view(-1,1)
        return score

    def _viterbi_decode(self, feats):
        '''
        Max-Product Algorithm or viterbi algorithm, argmax(p(z_0:t|x_0:t))
        '''
        
        # T = self.max_seq_length
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # batch_transitions=self.transitions.expand(batch_size,self.num_labels,self.num_labels)

        log_delta = torch.Tensor(batch_size, 1, self.num_labels).fill_(-10000.).to(self.device)
        log_delta[:, 0, self.start_label_id] = 0
        
        # psi is for the vaule of the last latent that make P(this_latent) maximum.
        psi = torch.zeros((batch_size, T, self.num_labels), dtype=torch.long).to(self.device)  # psi[0]=0000 useless
        for t in range(1, T):
            # delta[t][k]=max_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # delta[t] is the max prob of the path from  z_t-1 to z_t[k]
            log_delta, psi[:, t] = torch.max(self.transitions + log_delta, -1)
            # psi[t][k]=argmax_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            # psi[t][k] is the path choosed from z_t-1 to z_t[k],the value is the z_state(is k) index of z_t-1
            log_delta = (log_delta + feats[:, t]).unsqueeze(1)

        # trace back
        path = torch.zeros((batch_size, T), dtype=torch.long).to(self.device)

        # max p(z1:t,all_x|theta)
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(), -1)

        for t in range(T-2, -1, -1):
            # choose the state of z_t according the state choosed of z_t+1.
            path[:, t] = psi[:, t+1].gather(-1,path[:, t+1].view(-1,1)).squeeze()

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, input_ids, segment_ids, input_mask, label_ids):
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)
        forward_score = self._forward_alg(bert_feats)
        # p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        gold_score = self._score_sentence(bert_feats, label_ids)
        # - log[ p(X=w1:t,Zt=tag1:t)/p(X=w1:t) ] = - log[ p(Zt=tag1:t|X=w1:t) ]
        return torch.mean(forward_score - gold_score)

    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        # Get the emission scores from the BiLSTM
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        score, label_seq_ids = self._viterbi_decode(bert_feats)
        return score, label_seq_ids


In [29]:
start_label_id = conllProcessor.get_start_label_id()
stop_label_id = conllProcessor.get_stop_label_id()

model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id, 
                     len(label_list), 512 , batch_size, device)

In [30]:
start_epoch = 0
valid_acc_prev = 0
valid_f1_prev = 0

model.to(device)
print(1)

1


In [31]:
param_optimizer = list(model.named_parameters())
weight_decay_finetune = 1e-5 #0.01
weight_decay_crf_fc = 5e-6 #0.005
lr0_crf_fc = 8e-5
learning_rate0 = 5e-5

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
new_param = ['transitions', 'hidden2label.weight', 'hidden2label.bias']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': 0.0},
    {'params': [p for n, p in param_optimizer if n in ('transitions','hidden2label.weight')] \
        , 'lr':lr0_crf_fc, 'weight_decay': weight_decay_crf_fc},
    {'params': [p for n, p in param_optimizer if n == 'hidden2label.bias'] \
        , 'lr':lr0_crf_fc, 'weight_decay': 0.0}
]

optimizer = AdamW(model.parameters(), lr=learning_rate0, correct_bias=False)



In [32]:
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

In [33]:
#from sklearn.metrics import classification_report
from seqeval.metrics import classification_report

In [34]:
def save_report(report_dict):
    df = pd.DataFrame(report_dict).transpose()
    df.to_csv('BERT_CRF_analysis.csv')

In [38]:
def evaluate(model, predict_dataloader, batch_size):
    # print("***** Running prediction *****")
    global label_list
    labels = label_list.copy()
    labels.remove('[CLS]')
    labels.remove('[SEP]')
    model.eval()
    all_preds = []
    all_labels = []
    total=0
    correct=0
    start = time.time()
    
    inverted_map = {}
    
    for I in label_map:
        inverted_map[label_map[I]] = I
    
    with torch.no_grad():
        for batch in predict_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
            score, label_seq_ids = model(input_ids, segment_ids, input_mask)
            
            predicted = np.array(label_seq_ids.cpu())
            mask = np.array(predict_mask.cpu())
            truth = np.array(label_ids.cpu())
            
            for M in range(len(mask)):
                all_preds.append([inverted_map[I] for I in predicted[M][mask[M]]])
                all_labels.append([inverted_map[I] for I in truth[M][mask[M]]])
    #print(set(all_labels))
    
    report = classification_report(all_labels , all_preds)
    #print(all_preds)
    # Save report in a file
    #report_dict = classification_report(all_labels , all_preds, output_dict=True, zero_division = 0)
    #save_report(report_dict)
    print(report)


In [39]:
from tqdm import tqdm_notebook as tqdm

In [37]:
total_train_epochs = 5
gradient_accumulation_steps = 1
total_train_steps = int(len(train_examples) / batch_size / gradient_accumulation_steps * total_train_epochs)
global_step_th = 0
warmup_proportion = 0.1
for epoch in range(total_train_epochs):
        tr_loss = 0
        train_start = time.time()
        model.train()
        optimizer.zero_grad()
        # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        for step, batch in  tqdm(enumerate(train_dataloader)):

            if step % 100 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = time.time() - train_start

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {}.'.format(step, len(train_dataloader), elapsed))
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, predict_mask, label_ids = batch

            neg_log_likelihood = model.neg_log_likelihood(input_ids, segment_ids, input_mask, label_ids)

            if gradient_accumulation_steps > 1:
                neg_log_likelihood = neg_log_likelihood / gradient_accumulation_steps

            neg_log_likelihood.backward()

            tr_loss += neg_log_likelihood.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = learning_rate0 * warmup_linear(global_step_th/total_train_steps, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step_th += 1

        print('--------------------------------------------------------------')
        print("Epoch:{} completed, Total training's Loss: {}, Spend: {}m".format(epoch, tr_loss, (time.time() - train_start)/60.0))
        evaluate(model, valid_dataloader, batch_size)
# try:
    
# except Exception as e:
#     print(e)
#     print(batch)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in  tqdm(enumerate(train_dataloader)):


0it [00:00, ?it/s]

  Batch   100  of    240.    Elapsed: 136.80719828605652.
  Batch   200  of    240.    Elapsed: 264.281676530838.
--------------------------------------------------------------
Epoch:0 completed, Total training's Loss: 1528196.6522216797, Spend: 5.191502026716868m


TypeError: classification_report() got an unexpected keyword argument 'labels'

In [40]:
evaluate(model, test_dataloader, batch_size)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        CLS]       1.00      1.00      1.00     17592
     Command       1.00      0.18      0.31       904
   Extension       0.00      0.00      0.00       180
          OS       1.00      0.30      0.47       841
     Package       0.30      0.58      0.40      4726

   micro avg       0.77      0.86      0.81     24243
   macro avg       0.66      0.41      0.43     24243
weighted avg       0.86      0.86      0.83     24243



In [None]:
PATH = 'BERT_model/BERT_CRF.model'
torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, PATH)

In [None]:
crf_df = pd.read_csv('BERT_CRF_analysis.csv')
crf_df.head(15)