In [1]:
import os
import itertools
import pandas as pd
import numpy as np
# from datasets import Dataset
# from datasets import load_metric
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, BertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
task = "ner" 
model_checkpoint = "bert-base-cased"
batch_size = 16

tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

In [3]:
abstracts = {}
entities = {}
tags = set()

import csv

with open("chemdner_corpus/training.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        abstracts[row[0]] = row[2]

with open("chemdner_corpus/training.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities:
            entities[row[0]][row[4]] = row[5]
            tags.add(row[5])
        else:
            entities[row[0]] = {row[4]:row[5]}
            tags.add(row[5])

with open("chemdner_corpus/development.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in abstracts.keys():
            print("IN IN IN")
        abstracts[row[0]] = row[2]

with open("chemdner_corpus/development.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities:
            entities[row[0]][row[4]] = row[5]
            tags.add(row[5])
        else:
            entities[row[0]] = {row[4]:row[5]}
            tags.add(row[5])

abstracts_test = {}
entities_test = {}

with open("chemdner_corpus/evaluation.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        abstracts_test[row[0]] = row[2]

with open("chemdner_corpus/evaluation.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities_test:
            entities_test[row[0]][row[4]] = row[5]
        else:
            entities_test[row[0]] = {row[4]:row[5]}



In [4]:
label2id = {k:v for v, k in enumerate(tags)}
label2id['O'] = 8

id2label = {v:k for k,v in label2id.items()}
id2label

{0: 'IDENTIFIER',
 1: 'MULTIPLE',
 2: 'NO CLASS',
 3: 'FORMULA',
 4: 'SYSTEMATIC',
 5: 'ABBREVIATION',
 6: 'FAMILY',
 7: 'TRIVIAL',
 8: 'O'}

In [5]:
from nltk.tokenize import word_tokenize
import pandas as pd
def get_tokens(abstract):
    text = abstracts[abstract]
    if abstract not in entities.keys():
        return
    tokenized = word_tokenize(text)
    entities_temp = []
    for token in tokenized:
        if token in entities[abstract].keys():
            entities_temp.append(entities[abstract][token])
        else:
            entities_temp.append('O')
    return pd.DataFrame({'tokens':[tokenized], 'ner_tags': [entities_temp]})

def get_tokens_test(abstract):
    text = abstracts_test[abstract]
    if abstract not in entities_test.keys():
        return
    tokenized = word_tokenize(text)
    entities_temp = []
    for token in tokenized:
        if token in entities_test[abstract].keys():
            entities_temp.append(entities_test[abstract][token])
        else:
            entities_temp.append('O')
    return pd.DataFrame({'tokens':[tokenized], 'ner_tags': [entities_temp]})

def get_all_tokens_and_ner_tags():
    return pd.concat([get_tokens(abstract) for abstract in abstracts.keys()]).reset_index().drop('index', axis=1), pd.concat([get_tokens_test(abstract) for abstract in abstracts_test.keys()]).reset_index().drop('index', axis=1)

all_data, test_data = get_all_tokens_and_ner_tags()
all_data["sentence"] = all_data["tokens"].apply(lambda x: " ".join(x))
all_data["word_labels"] = all_data["ner_tags"].apply(lambda x: ",".join(x))
all_data = all_data[["sentence", "word_labels"]]
test_data["sentence"] = test_data["tokens"].apply(lambda x: " ".join(x))
test_data["word_labels"] = test_data["ner_tags"].apply(lambda x: ",".join(x))
test_data = test_data[["sentence", "word_labels"]]

display(all_data.head())
display(test_data.head())

Unnamed: 0,sentence,word_labels
0,We implemented a two-step approach to detect p...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Aflatoxicosis is a cause of economic losses in...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,The aim of this study was to investigate the e...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Nuclear factor-κB ( NF-κB ) is a transcription...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Chromium is widely used in the leather industr...,"SYSTEMATIC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


Unnamed: 0,sentence,word_labels
0,"In this work , bioconjugation techniques are d...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,ABBREVIA..."
1,Abstract 1 . Organic anion transporting polype...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Mucin 1 ( MUC1 ) is a heterodimeric protein fo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,The potential-controlled incorporation of DOPC...,"O,O,O,O,ABBREVIATION,O,O,O,O,O,O,O,O,O,SYSTEMA..."
4,Grape seed phenolic extract ( GSE ) is predict...,"O,O,FAMILY,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


In [6]:

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [7]:
cuda = torch.device("cuda")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long, device=cuda),
              'mask': torch.tensor(attn_mask, dtype=torch.long, device=cuda),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long, device=cuda)
        } 
    
    def __len__(self):
        return self.len

In [8]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 20
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

train_dataset = all_data
test_dataset = test_data

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
model = BertForTokenClassification.from_pretrained('bert-base-cased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

model.to(cuda)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

RuntimeError: No CUDA GPUs are available