In [1]:
import os
import itertools
import pandas as pd
import numpy as np
# from datasets import Dataset
# from datasets import load_metric
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, BertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
task = "ner" 
model_checkpoint = "bert-base-cased"
batch_size = 16

tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 18.7kB/s]
Downloading vocab.txt: 100%|██████████| 208k/208k [00:00<00:00, 3.84MB/s]
Downloading tokenizer.json: 100%|██████████| 426k/426k [00:00<00:00, 5.69MB/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 471kB/s]


In [3]:
abstracts = {}
entities = {}
tags = set()

import csv

with open("chemdner_corpus/training.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        abstracts[row[0]] = row[2]

with open("chemdner_corpus/training.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities:
            entities[row[0]][row[4]] = row[5]
            tags.add(row[5])
        else:
            entities[row[0]] = {row[4]:row[5]}
            tags.add(row[5])

with open("chemdner_corpus/development.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in abstracts.keys():
            print("IN IN IN")
        abstracts[row[0]] = row[2]

with open("chemdner_corpus/development.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities:
            entities[row[0]][row[4]] = row[5]
            tags.add(row[5])
        else:
            entities[row[0]] = {row[4]:row[5]}
            tags.add(row[5])

abstracts_test = {}
entities_test = {}

with open("chemdner_corpus/evaluation.abstracts.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        abstracts_test[row[0]] = row[2]

with open("chemdner_corpus/evaluation.annotations.txt", "r") as csvfile:
    csvreader = csv.reader(csvfile, delimiter="\t")
    for row in csvreader:
        if row[0] in entities_test:
            entities_test[row[0]][row[4]] = row[5]
        else:
            entities_test[row[0]] = {row[4]:row[5]}



In [4]:
label2id = {k:v for v, k in enumerate(tags)}
label2id['O'] = 8

id2label = {v:k for k,v in label2id.items()}
id2label

{0: 'NO CLASS',
 1: 'ABBREVIATION',
 2: 'MULTIPLE',
 3: 'IDENTIFIER',
 4: 'FORMULA',
 5: 'FAMILY',
 6: 'TRIVIAL',
 7: 'SYSTEMATIC',
 8: 'O'}

In [7]:
from nltk.tokenize import word_tokenize
import pandas as pd
def get_tokens(abstract):
    text = abstracts[abstract]
    if abstract not in entities.keys():
        return
    tokenized = word_tokenize(text)
    entities_temp = []
    for token in tokenized:
        if token in entities[abstract].keys():
            entities_temp.append(entities[abstract][token])
        else:
            entities_temp.append('O')
    return pd.DataFrame({'tokens':[tokenized], 'ner_tags': [entities_temp]})

def get_tokens_test(abstract):
    text = abstracts_test[abstract]
    if abstract not in entities_test.keys():
        return
    tokenized = word_tokenize(text)
    entities_temp = []
    for token in tokenized:
        if token in entities_test[abstract].keys():
            entities_temp.append(entities_test[abstract][token])
        else:
            entities_temp.append('O')
    return pd.DataFrame({'tokens':[tokenized], 'ner_tags': [entities_temp]})

def get_all_tokens_and_ner_tags():
    return pd.concat([get_tokens(abstract) for abstract in abstracts.keys()]).reset_index().drop('index', axis=1), pd.concat([get_tokens_test(abstract) for abstract in abstracts_test.keys()]).reset_index().drop('index', axis=1)

all_data, test_data = get_all_tokens_and_ner_tags()
all_data["sentence"] = all_data["tokens"].apply(lambda x: " ".join(x))
all_data["word_labels"] = all_data["ner_tags"].apply(lambda x: ",".join(x))
all_data = all_data[["sentence", "word_labels"]]
test_data["sentence"] = test_data["tokens"].apply(lambda x: " ".join(x))
test_data["word_labels"] = test_data["ner_tags"].apply(lambda x: ",".join(x))
test_data = test_data[["sentence", "word_labels"]]

display(all_data.head())
display(test_data.head())

Unnamed: 0,sentence,word_labels
0,We implemented a two-step approach to detect p...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Aflatoxicosis is a cause of economic losses in...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,The aim of this study was to investigate the e...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Nuclear factor-κB ( NF-κB ) is a transcription...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Chromium is widely used in the leather industr...,"SYSTEMATIC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


Unnamed: 0,sentence,word_labels
0,"In this work , bioconjugation techniques are d...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,ABBREVIA..."
1,Abstract 1 . Organic anion transporting polype...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Mucin 1 ( MUC1 ) is a heterodimeric protein fo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,The potential-controlled incorporation of DOPC...,"O,O,O,O,ABBREVIATION,O,O,O,O,O,O,O,O,O,SYSTEMA..."
4,Grape seed phenolic extract ( GSE ) is predict...,"O,O,FAMILY,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


In [8]:

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [9]:
cuda = torch.device("cuda")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long, device=cuda),
              'mask': torch.tensor(attn_mask, dtype=torch.long, device=cuda),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long, device=cuda)
        } 
    
    def __len__(self):
        return self.len

In [10]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 20
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

train_dataset = all_data
test_dataset = test_data

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [11]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [12]:
model = BertForTokenClassification.from_pretrained('bert-base-cased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

model.to(cuda)

Downloading pytorch_model.bin: 100%|██████████| 416M/416M [00:10<00:00, 41.1MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertFor

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [13]:
from sklearn.metrics import accuracy_score

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [14]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.1830191612243652
Training loss per 100 training steps: 0.37859700966884596
Training loss per 100 training steps: 0.2735012875712333
Training loss per 100 training steps: 0.22399659811262276
Training loss per 100 training steps: 0.19492416736899765
Training loss per 100 training steps: 0.17800685683796982
Training loss per 100 training steps: 0.16333800518324293
Training loss per 100 training steps: 0.15203170280388095
Training loss per 100 training steps: 0.1428989600250672
Training loss per 100 training steps: 0.13461627772534793
Training loss per 100 training steps: 0.12766187958285347
Training loss per 100 training steps: 0.12219696822445113
Training loss per 100 training steps: 0.11724404280665614
Training loss per 100 training steps: 0.1133395359306657
Training loss per 100 training steps: 0.10958981677639876
Training loss epoch: 0.10757131044302586
Training accuracy epoch: 0.952765001052807
Training epoch: 2
Training loss 

In [15]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['ids']
            mask = batch['mask']
            targets = batch['targets']
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [16]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.037723347544670105
Validation loss per 100 evaluation steps: 0.07156972740232401
Validation loss per 100 evaluation steps: 0.0596951966621106
Validation loss per 100 evaluation steps: 0.06275576063589568
Validation loss per 100 evaluation steps: 0.05890676725271407
Validation loss per 100 evaluation steps: 0.06032970593179322
Validation loss per 100 evaluation steps: 0.05834420383165217
Validation loss per 100 evaluation steps: 0.057707627551374305
Validation loss per 100 evaluation steps: 0.058281365073936
Validation loss per 100 evaluation steps: 0.057181655127420045
Validation loss per 100 evaluation steps: 0.05673453858639346
Validation loss per 100 evaluation steps: 0.05672770538586122
Validation loss per 100 evaluation steps: 0.05667013730295195
Validation Loss: 0.05699603785483859
Validation Accuracy: 0.9839012716252663


In [17]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="first")
txt = "'Propane () is a three-carbon alkane with the molecular formula C3H8. It is a gas at standard temperature and pressure, but compressible to a transportable liquid. A by-product of natural gas processing and petroleum refining, it is commonly used as a fuel in domestic and industrial applications and in low-emissions public transportation. Discovered in 1857 by the French chemist Marcellin Berthelot, it became commercially available in the US by 1911. Propane is one of a group of liquefied petroleum gases (LP gases). The others include butane, propylene, butadiene, butylene, isobutylene, and mixtures thereof. Propane has lower volumetric energy density, but higher gravimetric energy density and burns more cleanly than gasoline and coal.Propane gas has become a popular choice for barbecues and portable stoves because its low boiling point makes it vaporize as soon as it is released from its pressurized container. Propane powers buses, forklifts, taxis, outboard boat motors, and ice resurfacing machines and is used for heat and cooking in recreational vehicles and campers.'"
pipe(txt)

[{'entity_group': 'TRIVIAL',
  'score': 0.7321303,
  'word': 'Propane',
  'start': 1,
  'end': 8},
 {'entity_group': 'FORMULA',
  'score': 0.7086718,
  'word': 'C3H8',
  'start': 64,
  'end': 68},
 {'entity_group': 'TRIVIAL',
  'score': 0.9166991,
  'word': 'Propane',
  'start': 455,
  'end': 462},
 {'entity_group': 'SYSTEMATIC',
  'score': 0.9988894,
  'word': 'butane',
  'start': 541,
  'end': 547},
 {'entity_group': 'SYSTEMATIC',
  'score': 0.9974094,
  'word': 'propylene',
  'start': 549,
  'end': 558},
 {'entity_group': 'SYSTEMATIC',
  'score': 0.99758244,
  'word': 'butadiene',
  'start': 560,
  'end': 569},
 {'entity_group': 'SYSTEMATIC',
  'score': 0.99358565,
  'word': 'butylene',
  'start': 571,
  'end': 579},
 {'entity_group': 'TRIVIAL',
  'score': 0.9250084,
  'word': 'isobutylene',
  'start': 581,
  'end': 592},
 {'entity_group': 'TRIVIAL',
  'score': 0.8556896,
  'word': 'Propane',
  'start': 616,
  'end': 623},
 {'entity_group': 'TRIVIAL',
  'score': 0.86377996,
  'word'

In [18]:
model.save_pretrained("chemical_NER_model")
tokenizer.save_pretrained("chemical_tokenizer")

('chemical_tokenizer/tokenizer_config.json',
 'chemical_tokenizer/special_tokens_map.json',
 'chemical_tokenizer/vocab.txt',
 'chemical_tokenizer/added_tokens.json',
 'chemical_tokenizer/tokenizer.json')