In [1]:
import os
import numpy as np
import pandas as pd
import evaluate
from transformers import ElectraForPreTraining, ElectraTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification
import torch
from preprocessing import create_datasets, read_alldocs, read_individual_docs


In [2]:
# Glob patterns for the EBM-NLP 2.0 token files and the gold annotation files
# (participants, outcomes, interventions). In the cells shown here they are
# only consumed by the commented-out read_alldocs call at the bottom.
path1 = "../ebm_nlp_2_00/documents/*.tokens"
path2 = "../ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/participants/train_test_gold/*.ann"
path3 = "../ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/outcomes/train_test_gold/*.ann"
path4 = "../ebm_nlp_2_00/annotations/aggregated/hierarchical_labels/interventions/train_test_gold/*.ann"

# Tag set for the participants-only task.
participant_labels = ['O', 'PAR_AGE', 'PAR_SEX', 'PAR_SAMP_SIZE', 'PAR_COND']

# Combined tag set: the participant tags followed by intervention and outcome tags.
new_label = participant_labels + [
    'INT_SURG', 'INT_PHYS', 'INT_DRUG', 'INT_EDU', 'INT_PSY', 'INT_OTHER',
    'OUT_PHYS', 'OUT_PAIN', 'OUT_MORT', 'OUT_ADV_EFF', 'OUT_MENT', 'OUT_OTHER',
]

partcipant_data = read_individual_docs('participants')
# all_doc = read_alldocs(path1, path2, path3, path4)





Reading 0 files for participants


In [3]:
# Build the participants dataset; the features printed further down show ner_tags
# encoded as a ClassLabel whose names are exactly participant_labels.
partcipant_ds = create_datasets(partcipant_data, participant_labels)
# all_doc_ds = create_datasets(all_doc, new_label)

Casting the dataset:   0%|          | 0/4798 [00:00<?, ? examples/s]

In [4]:
partcipant_ds.features

{'pmid': Value(dtype='string', id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'PAR_AGE', 'PAR_SEX', 'PAR_SAMP_SIZE', 'PAR_COND'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [5]:
# NOTE(review): this train_test_split is the project helper from preprocessing,
# not scikit-learn's function of the same name.
# The DatasetDict displayed after this cell has 4318 train / 240 test / 240 valid rows,
# so presumably 10% is held out and then halved into test and valid - confirm in preprocessing.
from preprocessing import train_test_split
part_train_test = train_test_split(partcipant_ds, train_test_size=0.1, validation_size=0.5) 
# all_doc_train_test = train_test_split(all_doc_ds, train_test_size=0.1, validation_size=0.5) 


    

In [6]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 4318
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 240
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 240
    })
})

In [8]:
# NOTE(review): in the cells shown here all_doc_train_test is only assigned on a
# commented-out line, so on a fresh kernel this cell would raise NameError; the
# output recorded below appears to come from an earlier session.
all_doc_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'tokens', 'ner_tags'],
        num_rows: 4176
    })
    test: Dataset({
        features: ['pmid', 'tokens', 'ner_tags'],
        num_rows: 233
    })
    valid: Dataset({
        features: ['pmid', 'tokens', 'ner_tags'],
        num_rows: 232
    })
})

In [7]:
part_train_test['train'].features['ner_tags'].feature.names


['O', 'PAR_AGE', 'PAR_SEX', 'PAR_SAMP_SIZE', 'PAR_COND']

In [8]:
# Fast tokenizer for the BioELECTRA-PICO checkpoint. tokenize_align_labels calls it
# with truncation=True and no explicit max_length, so model_max_length is presumably
# the limit that truncation falls back on - confirm against the tokenizer docs.
tokenizer = ElectraTokenizerFast.from_pretrained("kamalkraj/BioELECTRA-PICO", model_max_length = 512)

In [9]:
def tokenize_align_labels(example, label_all_tokens = True):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer``.

    Args:
        example: batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'ner_tags'`` (one list of tag ids per sentence,
            parallel to ``'tokens'``).
        label_all_tokens: if True, every sub-token of a word receives that
            word's tag; if False, only the first sub-token does and the
            remaining sub-tokens get -100.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one list
        of label ids per sentence. Positions with no source word (word id
        ``None``) are labelled -100.
    """
    tokenized_input = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(example['ner_tags']):
        word_ids = tokenized_input.word_ids(batch_index=i)
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position that does not map back to any input word.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word. The previous code compared the
                # whole word_ids list here, which is never equal to an index,
                # so the continuation branch below was unreachable.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token of the same word.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_input['labels'] = labels
    return tokenized_input

In [2]:
# Load the 'ncbi_disease' dataset via datasets.load_dataset; the later tokenization,
# label mapping and training cells all run on this dataset rather than on part_train_test.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from datasets import load_dataset
ncbi_disease = load_dataset('ncbi_disease')

In [10]:
ncbi_disease['train'][0]

{'id': '0',
 'tokens': ['Identification',
  'of',
  'APC2',
  ',',
  'a',
  'homologue',
  'of',
  'the',
  'adenomatous',
  'polyposis',
  'coli',
  'tumour',
  'suppressor',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}

In [44]:
part_train_test

DatasetDict({
    train: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 4318
    })
    test: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 240
    })
    valid: Dataset({
        features: ['pmid', 'ner_tags', 'tokens'],
        num_rows: 240
    })
})

In [27]:
# Apply tokenize_align_labels to every split in batches and drop the original columns,
# leaving only the tokenizer outputs plus 'labels' (see the features displayed below).
tokenized_dataset = ncbi_disease.map(tokenize_align_labels, batched = True, remove_columns=ncbi_disease['train'].column_names)

Map:   0%|          | 0/5433 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Map:   0%|          | 0/941 [00:00<?, ? examples/s]

In [28]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 924
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 941
    })
})

In [29]:
tokenized_dataset['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [32]:
# Tag names of the NCBI disease train split, in class-id order; used to build id2label / label2id.
dis_list = ncbi_disease['train'].features['ner_tags'].feature.names

In [12]:
# Two-way mapping between class ids and tag names, derived from dis_list.
# NOTE(review): the NameError recorded under this cell comes from a run in which this
# cell (In [12]) was executed without dis_list being defined; on a top-to-bottom run
# dis_list is assigned in the cell just before this one.
id2label = {i:label for i, label in enumerate(dis_list) }
label2id = {label:i for i, label in enumerate(dis_list) }

NameError: name 'dis_list' is not defined

In [34]:
id2label

{0: 'O', 1: 'B-Disease', 2: 'I-Disease'}

In [35]:
# Start from the BioELECTRA-PICO checkpoint but size the classification head from id2label.
# ignore_mismatched_sizes=True lets the checkpoint's 4-way classifier be replaced by a
# freshly initialised 3-way one (see the warning printed below).
# label2id is passed together with id2label so both mappings stored in the model config
# describe the same tag set; otherwise label2id is presumably left as saved in the checkpoint.
model = AutoModelForTokenClassification.from_pretrained(
    "kamalkraj/BioELECTRA-PICO",
    ignore_mismatched_sizes=True,
    id2label=id2label,
    label2id=label2id,
)

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/BioELECTRA-PICO and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
model.config.num_labels

3

In [37]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the single-epoch NCBI benchmark run.
# Evaluation and checkpointing both happen once per epoch.
args = TrainingArguments(
    output_dir='ncbi_bencmark',
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [38]:
# Collator built from the same tokenizer, handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric; compute_metrics reads the 'overall_*' entries of its result.
metric = evaluate.load('seqeval')

In [39]:
# Tag names in class-id order, read by compute_metrics as a module-level name.
# Same expression as the one assigned to dis_list earlier in the notebook.
label_list = ncbi_disease['train'].features['ner_tags'].feature.names

def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall sequence-labelling scores.

    ``eval_preds`` is unpacked as ``(logits, label_ids)``. The arg-max over
    axis 2 of the logits gives one predicted class id per position. Positions
    whose reference label is -100 are dropped from both sequences, the
    remaining ids are mapped to tag names through the module-level
    ``label_list``, and the result is scored with the module-level ``metric``.

    Returns:
        dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the ``overall_*`` entries of the metric result.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    pred_names = []
    gold_names = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real reference label.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        pred_names.append([label_list[p] for p, _ in kept])
        gold_names.append([label_list[g] for _, g in kept])

    scores = metric.compute(predictions=pred_names, references=gold_names)
    return {
        'precision': scores['overall_precision'],
        'recall': scores['overall_recall'],
        'f1': scores['overall_f1'],
        'accuracy': scores['overall_accuracy'],
    }



In [40]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 924
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 941
    })
})

In [42]:
# Wire model, hyper-parameters, tokenized NCBI splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [43]:
# Run the fine-tuning loop configured on the trainer.
trainer.train()

# Persist the fine-tuned weights and the tokenizer to local directories.
# NOTE(review): the directories are named 'EBM_*', but the trainer above was built on
# tokenized_dataset, which is derived from ncbi_disease - confirm the naming is intended.
model.save_pretrained('EBM_ner_model_2.0')
tokenizer.save_pretrained('EBM_tokenizer_2.0')

  0%|          | 0/680 [00:00<?, ?it/s]

{'loss': 0.1194, 'learning_rate': 5.294117647058824e-06, 'epoch': 0.74}


  0%|          | 0/116 [00:00<?, ?it/s]

{'eval_loss': 0.04829097166657448, 'eval_precision': 0.8051001821493625, 'eval_recall': 0.8557599225556631, 'eval_f1': 0.8296574378226186, 'eval_accuracy': 0.9837176690557806, 'eval_runtime': 25.4964, 'eval_samples_per_second': 36.24, 'eval_steps_per_second': 4.55, 'epoch': 1.0}
{'train_runtime': 641.6782, 'train_samples_per_second': 8.467, 'train_steps_per_second': 1.06, 'train_loss': 0.10320049173691694, 'epoch': 1.0}


('EBM_tokenizer_2.0\\tokenizer_config.json',
 'EBM_tokenizer_2.0\\special_tokens_map.json',
 'EBM_tokenizer_2.0\\vocab.txt',
 'EBM_tokenizer_2.0\\added_tokens.json',
 'EBM_tokenizer_2.0\\tokenizer.json')

In [25]:
# NOTE(review): PyTorch documents PYTORCH_CUDA_ALLOC_CONF as a list of option:value
# settings (for example max_split_size_mb:128); "caching_allocator" does not look like
# one - confirm the intended setting. The variable presumably also has to be set before
# CUDA is first initialised to have any effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "caching_allocator"

In [26]:
# gc.collect()
torch.cuda.empty_cache()

In [27]:
torch.cuda.reset_accumulated_memory_stats()

In [None]:
# NOTE(review): this cell has no execution count (never run in the saved session), and the
# neighbouring outputs report current device 0 on a single 'NVIDIA GeForce GTX 1050', so
# device index 1 presumably does not exist on this machine - confirm before running.
torch.cuda.set_device(1)

In [243]:
torch.cuda.current_device()

0

In [273]:
torch.cuda.get_device_name()


'NVIDIA GeForce GTX 1050'

In [274]:
torch.cuda.memory_allocated()


4930233344

In [275]:
torch.cuda.memory_reserved()


5289017344

In [276]:
# torch.cuda.memory_cached() is a deprecated alias; memory_reserved() is the
# replacement and reports the same figure (bytes held by the caching allocator).
torch.cuda.memory_reserved()




5289017344

In [312]:
from transformers import pipeline

# Location of the fine-tuned checkpoint used for inference. It can be overridden with the
# EBM_NER_CHECKPOINT environment variable so the notebook is not tied to one machine;
# the default is the original absolute path.
# NOTE(review): this checkpoint directory is not written by any cell shown in this
# notebook - presumably it comes from a separate EBM training run.
checkpoint = os.environ.get(
    'EBM_NER_CHECKPOINT',
    'C:/Users/Gbadamosi/Documents/Nerd Corner/Master in ds and AI/MSC project/workspace/workspace/mycode/ebm_test_ner/checkpoint-1887',
)
# aggregation_strategy='simple' makes the pipeline return grouped entities
# (entity_group / word / start / end), as seen in the output below.
token_classifer = pipeline('ner', model = checkpoint, aggregation_strategy = 'simple')


token_classifer("We aimed to determine prospectively whether rivastigmine, an inhibitor of acetylcholinesterase and butyrylcholinesterase, provided benefits in patients with and without visual hallucinations in a population with dementia associated with Parkinson's disease (PDD)")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'PAR_COND',
  'score': 0.6910038,
  'word': 'patients with and without visual hallucinations',
  'start': 143,
  'end': 190},
 {'entity_group': 'PAR_COND',
  'score': 0.7989797,
  'word': "dementia associated with parkinson ' s disease ( pdd )",
  'start': 212,
  'end': 262}]