In [2]:
#!pip install seqeval

In [45]:
import numpy as np
import pandas as pd
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForTokenClassification, BertTokenizer, BertConfig, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import json
import os
from torch.nn.parallel import DataParallel
from sklearn.metrics import f1_score
from collections import defaultdict
from torch import cuda
from pytorch_lightning import Trainer

In [46]:
def def_value():
    return 'O'


def read_data(path):
    with open(path) as f:
        sent_dict = {}
        label_dict = {}
        count = 0
        for line in f:
            if line.isspace():
                continue
            if '|' in line and len(line.split('|')) == 3 and (line.split('|')[1] == 'a' or line.split('|')[1] == 't'):
                idx, _, sentence = line.split('|')
                sent_dict[idx] = sent_dict.get(idx, '') + ' ' + sentence
            else:
                idx, start_pos, end_pos, word, label, _ = line.split('\t')
                if idx not in label_dict:
                    label_dict[idx] = defaultdict(def_value)
                    for i in range(int(start_pos), int(end_pos)):
                        label_dict[idx][i] = label  
                else:
                    for i in range(int(start_pos), int(end_pos)):
                        label_dict[idx][i] = label
                        
    idx_col, word_col, label_col = [], [], []
    for idx in sent_dict:
        sentence = sent_dict[idx].replace('\n', '')
        
        char_seq = 0
        for word in sentence.split(' ')[1:]:
            label = label_dict[idx][char_seq]
            if word and word[0] == '(':
                label = label_dict[idx][char_seq + 1]
            char_seq += len(word) + 1
            
            idx_col.append(idx)
            word_col.append(word)
            label_col.append(label)
    
    df = pd.DataFrame(list(zip(idx_col, word_col, label_col)),
               columns =['sentence_id', 'word', 'label'])
    return df


class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda x: [(w, t) for w, t in zip(x["word"].values.tolist(),
                                                        x["label"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            sentence = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return sentence
        except:
            return None
        
        
# Creating new lists and dicts that will be used at a later stage for reference and processing
def get_data(df, label_vals):
    getter = SentenceGetter(df)
    label2idx = {value: key for key, value in enumerate(label_vals)}
    sentences = [' '.join([s[0] for s in sentence]) for sentence in getter.sentences]
    labels = [[s[1] for s in sentence] for sentence in getter.sentences]
    labels = [[label2idx.get(l) for l in label] for label in labels]
    return sentences, labels

In [47]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        sentence = str(self.sentences[index])
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        label = self.labels[index]
        label.extend([4]*200)
        label=label[:200]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(label, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

class BERTClass(torch.nn.Module):
    def __init__(self, model_path):
        super(BERTClass, self).__init__()
        self.bert = transformers.BertForTokenClassification.from_pretrained(model_path, 
                                                                            num_labels=18,
                                                                            )

    
    def forward(self, ids, mask, labels):
        output = self.bert(ids, mask, labels = labels)

        return output

def train(epoch):
    model.train()
    for step, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['tags'].to(device, dtype = torch.long)

        loss = model(ids, mask, labels = targets)[0]
        
        optimizer.zero_grad()
        
        loss.sum().backward()
        optimizer.step()
        
        if step % 5==0:
            print(f'Epoch: {epoch}  Step: {step}  Loss: {loss.sum()}')
            
def valid(model, testing_loader, label_vals):
    model.eval()
    eval_loss = 0
    predictions , true_labels = [], []
    nb_eval_steps = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['tags'].to(device, dtype = torch.long)

            output = model(ids, mask, labels=targets)
            loss, logits = output[:2]
            logits = logits.detach().cpu().numpy()
            label_ids = targets.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.append(label_ids)
            eval_loss += loss.mean().item()
            nb_eval_steps += 1
        eval_loss = eval_loss/nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        pred_tags = [label_vals[p_i] for p in predictions for p_i in p]
        valid_tags = [label_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags, average='micro')))
    return pred_tags, valid_tags

In [48]:
device = 'cuda' if cuda.is_available() else 'cpu'

df_train = read_data('./NCBI-disease/NCBItrainset_corpus.txt')
df_valid = read_data('./NCBI-disease/NCBIdevelopset_corpus.txt')

label_vals = list(df_train["label"].value_counts().keys())
label2idx = {value: key for key, value in enumerate(label_vals)}

train_sentences, train_labels = get_data(df_train, label_vals)
valid_sentences, valid_labels = get_data(df_valid, label_vals)

In [None]:
label2idx

In [8]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
EPOCHS = 6
LEARNING_RATE = 5e-05
tokenizer = BertTokenizer.from_pretrained('../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')

training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
valid_set = CustomDataset(tokenizer, valid_sentences, valid_labels, MAX_LEN)

training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=VALID_BATCH_SIZE, shuffle=True)



optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



In [9]:
for epoch in range(EPOCHS):
    train(epoch)
pred_tags, valid_tags = valid(model, valid_loader, label_vals)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0  Step: 0  Loss: 5.617720603942871
Epoch: 0  Step: 5  Loss: 1.8589186668395996
Epoch: 0  Step: 10  Loss: 1.5983073711395264
Epoch: 0  Step: 15  Loss: 1.4750643968582153
Epoch: 1  Step: 0  Loss: 1.3763785362243652
Epoch: 1  Step: 5  Loss: 1.1906495094299316
Epoch: 1  Step: 10  Loss: 1.222059965133667
Epoch: 1  Step: 15  Loss: 1.1399253606796265
Epoch: 2  Step: 0  Loss: 1.0465832948684692
Epoch: 2  Step: 5  Loss: 1.012534499168396
Epoch: 2  Step: 10  Loss: 1.1112782955169678
Epoch: 2  Step: 15  Loss: 0.898760974407196
Epoch: 3  Step: 0  Loss: 0.8962532877922058
Epoch: 3  Step: 5  Loss: 0.9761052131652832
Epoch: 3  Step: 10  Loss: 1.060213327407837
Epoch: 3  Step: 15  Loss: 0.9729636907577515
Epoch: 4  Step: 0  Loss: 0.824504554271698
Epoch: 4  Step: 5  Loss: 1.0439521074295044
Epoch: 4  Step: 10  Loss: 0.9217320084571838
Epoch: 4  Step: 15  Loss: 0.7990142107009888
Epoch: 5  Step: 0  Loss: 0.7839782238006592
Epoch: 5  Step: 5  Loss: 0.8049127459526062
Epoch: 5  Step: 10  Loss: 0.

In [None]:
'../../Modeling/checkpoints/model-trained-36-130647.pt/'