# HuggingFace Installations

In [52]:
!pip install datasets
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=e41778fc9bbdece68dd3c5a4f5a0b8ad49487be45bf610b0ef22f64391d00ed1
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [67]:
import pandas as pd
import numpy as np
import spacy
import tqdm
import sys
from datasets import Dataset, DatasetDict
from transformers import Trainer
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from transformers import pipeline

# Loading Data

In [4]:
entities = pd.read_csv("data/entities.tsv", delimiter="\t")
entities.head()

Unnamed: 0,filename,mark,label,offset1,offset2,span,code
0,es-S0212-71992007000100007-1,T1,ENFERMEDAD,40,61,arterial hypertension,38341003
1,es-S0212-71992007000100007-1,T2,ENFERMEDAD,66,79,polyarthrosis,36186002
2,es-S0212-71992007000100007-1,T3,ENFERMEDAD,1682,1698,pleural effusion,60046008
3,es-S0212-71992007000100007-1,T4,ENFERMEDAD,1859,1875,pleural effusion,60046008
4,es-S0212-71992007000100007-1,T5,ENFERMEDAD,1626,1648,lower lobe atelectasis,46621007


In [5]:
list_off0 = list(entities['offset1'])
list_off1 = list(entities['offset2'])

In [6]:
text_files_path = "data/text"

In [7]:
f = open(text_files_path + "/" + entities.iloc[1,0] + ".txt", "r", encoding="UTF-8")
for l in f:
  print(l)

A 73-year-old patient with a history of arterial hypertension and polyarthrosis presented to the emergency department with abdominal distension and pain associated with constipation and febrile fever. The symptoms had started three weeks earlier and worsened during the four days prior to admission. During this period, an upper gastrointestinal fibroendoscopy (oesophagus, stomach and duodenum) and a colonoscopy (up to the splenic angle) were performed, but no abnormalities were found.

Physical examination revealed a low-grade fever (37.6º C), a distended abdomen, diffusely painful on palpation, tympanised on percussion, with scant borborygmi but no evidence of peritonism, pulmonary auscultation with decreased ventilation in the lower half of the right hemithorax and the onset of intense pain on palpation and percussion of the last three dorsal spinous processes.

Analyses showed 8.2 x 109 leukocytes / L, haemoglobin 136 g / L, platelets 186 x 109 / L. Except for glycaemia (123 mg/dl), 

In [8]:
#Clinical cases
HCs = {}
for fid in tqdm.tqdm(range(len(entities["filename"]))):
  fname = entities["filename"][fid]
  with open(text_files_path + "/" + fname + ".txt", "r", encoding="UTF-8") as f:
    HCs.update({fname: f.read()})

100%|██████████| 6650/6650 [00:42<00:00, 155.75it/s]


In [9]:
#Diseases
ENF = {}
enfermedades = []
fn = entities["filename"][0]
for fname, enf in zip(entities["filename"], entities["span"]):
    if fname!=fn:
      enfermedades = []
    enfermedades.append(enf)
    ENF.update({fname: enfermedades})
    fn = fname

In [10]:
len(ENF)

741

# Preprocessing

In [11]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
nlp = spacy.load("en_core_web_sm")

In [13]:
HCs_tokenized = []
for hc in HCs:
    hl = []
    tokens = nlp(HCs[hc])
    #tokens = HCs[hc].split(" ") #The simplest option
    for t in tokens:
        hl.append(str(t))
    HCs_tokenized.append(hl)

In [14]:
len(HCs_tokenized)

741

In [15]:
Ent_tokenized = []
for enf in ENF:
    Tks = []
    for e in ENF[enf]:
      sl = []
      tokens = nlp(e)
      #tokens = e.split(" ")
      for t in tokens:
          sl.append(str(t))
      Tks.append(sl)
    Ent_tokenized.append(Tks)

In [16]:
len(Ent_tokenized)

741

In [17]:
Ent_tokenized[0]

[['arterial', 'hypertension'],
 ['polyarthrosis'],
 ['pleural', 'effusion'],
 ['pleural', 'effusion'],
 ['lower', 'lobe', 'atelectasis'],
 ['infectious', 'spondylodiscitis', 'D10', '-', 'D11'],
 ['pleural', 'effusion']]

# Tagging Data with BIO scheme

In [18]:
def find_idx(list_to_check, item_to_find):
    indices = []
    for idx, value in enumerate(list_to_check):
        if value == item_to_find:
            indices.append(idx)
    return indices

In [19]:
labels_tokenized = []
idx =-1
for hct, et in zip(HCs_tokenized, Ent_tokenized):
    idx+=1
    labels = []
    for i in range(len(hct)):
        #Labels: 0->'O'; 1->'B'; 2->'I'
        #labels.append('O')
        labels.append(0)

    #For Entities (Diseases|Enfermedades)
    for enf in et:
      first = True
      for e in enf:
          if first == True:
              try:
                #labels[hct.index(e)] = 'B'
                #labels[posLab] = 'B'
                indices = find_idx(hct, e)
                if len(indices) > 1:
                  for id in indices:
                      labels[id] = 1
                else:
                  labels[hct.index(e)] = 1
                
                first = False
              except:
                first = False
                if e == "sarcoma+carcinoma" or e == "carcinoma+sarcoma":
                  continue
                print(hct)
                print(et)
                print(enf)
                print(e)
                print(idx)
          else:
              try:
                #labels[hct.index(e)] = 'I'
                #labels[posLab] = 'I'
                indices = find_idx(hct, e)
                if len(indices) > 1:
                  for id in indices:
                      if labels[id-1] != 0:
                        labels[id] = 2
                else:
                  labels[hct.index(e)] = 2
              except:
                if e == "sarcoma+carcinoma" or e == "carcinoma+sarcoma":
                  continue
                print(hct)
                print(et)
                print(enf)
                print(e)
                print(idx)

    labels_tokenized.append(labels)

['A', '43', '-', 'year', '-', 'old', 'man', 'was', 'admitted', 'to', 'the', 'emergency', 'department', 'due', 'to', 'sudden', 'left', 'lumbar', 'pain', ',', 'continuous', 'and', 'incapacitating', ',', 'without', 'antalgic', 'position', 'or', 'aggravating', 'factors', ',', 'without', 'irradiation', ',', 'with', 'approximately', '23', 'hours', 'of', 'evolution', '.', 'No', 'nausea', 'or', 'vomiting', ',', 'no', 'macroscopic', 'haematuria', 'or', 'lower', 'urinary', 'tract', 'discomfort', '.', 'Absence', 'of', 'precordial', 'pain', '.', 'Hypertension', 'controlled', 'with', 'verapamil', '.', 'He', 'reported', 'an', 'episode', ',', 'interpreted', 'as', 'a', 'transient', 'ischaemic', 'attack', ',', 'approximately', 'eight', 'weeks', 'earlier', '(', 'not', 'confirmed', ')', '.', 'No', 'history', 'of', 'cardiac', 'arrhythmia', 'or', 'valvular', 'heart', 'disease', '.', 'No', 'other', 'previous', 'thromboembolic', 'episodes', '.', 'No', 'known', 'history', 'of', 'urinary', 'lithiasis', '.', 'N

In [20]:
j = 0
for i in range(len(HCs_tokenized[j])):
  print(str(HCs_tokenized[j][i]) + "\t" + str(labels_tokenized[j][i]))

A	0
73	0
-	0
year	0
-	0
old	0
patient	0
with	0
a	0
history	0
of	0
arterial	1
hypertension	2
and	0
polyarthrosis	1
presented	0
to	0
the	0
emergency	0
department	0
with	0
abdominal	0
distension	0
and	0
pain	0
associated	0
with	0
constipation	0
and	0
febrile	0
fever	0
.	0
The	0
symptoms	0
had	0
started	0
three	0
weeks	0
earlier	0
and	0
worsened	0
during	0
the	0
four	0
days	0
prior	0
to	0
admission	0
.	0
During	0
this	0
period	0
,	0
an	0
upper	0
gastrointestinal	0
fibroendoscopy	0
(	0
oesophagus	0
,	0
stomach	0
and	0
duodenum	0
)	0
and	0
a	0
colonoscopy	0
(	0
up	0
to	0
the	0
splenic	0
angle	0
)	0
were	0
performed	0
,	0
but	0
no	0
abnormalities	0
were	0
found	0
.	0

	0
Physical	0
examination	0
revealed	0
a	0
low	0
-	0
grade	0
fever	0
(	0
37.6º	0
C	0
)	0
,	0
a	0
distended	0
abdomen	0
,	0
diffusely	0
painful	0
on	0
palpation	0
,	0
tympanised	0
on	0
percussion	0
,	0
with	0
scant	0
borborygmi	0
but	0
no	0
evidence	0
of	0
peritonism	0
,	0
pulmonary	0
auscultation	0
with	0
decreased	0
ventilation

# Validating tokenization and alignment with the BIO tags.

In [21]:
flag = 0
for st, lt in zip(HCs_tokenized, labels_tokenized):
    if len(st) != len(lt):
        print(st)
        print(lt)
        flag = 1
if flag==0:
    print("Everything is aligned!")

Everything is aligned!


# Sentence tokenization

In [22]:
sent_tokenized = []
label_sent_tokenized = []
for ht, lht in zip(HCs_tokenized, labels_tokenized):
  st = []; lbst = []
  for h, l in zip(ht,lht):
    if h != ".":
      st.append(h)
      lbst.append(l)
    else:
      st.append(".")
      lbst.append(0)
      sent_tokenized.append(st)
      label_sent_tokenized.append(lbst)
      st = []; lbst = []

In [23]:
len(sent_tokenized)

11668

In [24]:
sent_tokenized[0]

['A',
 '73',
 '-',
 'year',
 '-',
 'old',
 'patient',
 'with',
 'a',
 'history',
 'of',
 'arterial',
 'hypertension',
 'and',
 'polyarthrosis',
 'presented',
 'to',
 'the',
 'emergency',
 'department',
 'with',
 'abdominal',
 'distension',
 'and',
 'pain',
 'associated',
 'with',
 'constipation',
 'and',
 'febrile',
 'fever',
 '.']

In [25]:
len(label_sent_tokenized)

11668

In [26]:
label_sent_tokenized[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

# Disease mentions identification as a Token classification problem

# Building the Dataset

## Case as a whole is given as input

In [27]:
dic = {"tokens": HCs_tokenized, "ner_tags": labels_tokenized} #For the whole clinical case. We used this option for our paper.
#dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [28]:
dataset = Dataset.from_dict(dic)

In [29]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 741
})

In [30]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [31]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 555
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 186
    })
})

In [32]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [33]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 555
})

In [34]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [35]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

This is an 18 year old female patient , diagnosed ( DSM - IV - TR ) with social phobia and dependent personality disorder , referred from the Child and Adolescent Mental Health Service ( CSMIJ ) for follow - up in the adult service , who comes accompanied by her mother . 
 She is the eldest of two sisters . Parents separated . Lives with her mother and sister . During her childhood she shows attention deficit and poor academic performance without repeating a grade . She is currently studying a higher module and is working in the family business . Medical history of interest : isolated growth retardation that required treatment with growth hormone ( GH ) from the age of 12 to 17 years . Fructose intolerance associated with digestive bleeding and lactose intolerance . In childhood , genetic examination was carried out , and no genetic alteration , neither numerical nor structural , was observed . She denies the use of toxic substances . Among the family psychiatric antecedents , the moth

# Loading BERT as a pre-trained model

In [36]:
model_checkpoint = "d4data/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [37]:
tokenizer.is_fast

True

In [38]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

Token indices sequence length is longer than the specified maximum sequence length for this model (1250 > 512). Running this sequence through the model will result in indexing errors


['[CLS]',
 'this',
 'is',
 'an',
 '18',
 'year',
 'old',
 'female',
 'patient',
 ',',
 'diagnosed',
 '(',
 'ds',
 '##m',
 '-',
 'iv',
 '-',
 'tr',
 ')',
 'with',
 'social',
 'ph',
 '##ob',
 '##ia',
 'and',
 'dependent',
 'personality',
 'disorder',
 ',',
 'referred',
 'from',
 'the',
 'child',
 'and',
 'adolescent',
 'mental',
 'health',
 'service',
 '(',
 'cs',
 '##mi',
 '##j',
 ')',
 'for',
 'follow',
 '-',
 'up',
 'in',
 'the',
 'adult',
 'service',
 ',',
 'who',
 'comes',
 'accompanied',
 'by',
 'her',
 'mother',
 '.',
 'she',
 'is',
 'the',
 'eldest',
 'of',
 'two',
 'sisters',
 '.',
 'parents',
 'separated',
 '.',
 'lives',
 'with',
 'her',
 'mother',
 'and',
 'sister',
 '.',
 'during',
 'her',
 'childhood',
 'she',
 'shows',
 'attention',
 'deficit',
 'and',
 'poor',
 'academic',
 'performance',
 'without',
 'repeating',
 'a',
 'grade',
 '.',
 'she',
 'is',
 'currently',
 'studying',
 'a',
 'higher',
 'module',
 'and',
 'is',
 'working',
 'in',
 'the',
 'family',
 'business',
 '

In [39]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [40]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 

In [41]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [42]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [43]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [44]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,  ...,    0,    0, -100],
        [-100,    0,    0,  ...,    0,    0, -100]])

In [45]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [46]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

Downloading:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/266M [00:00<?, ?B/s]

In [47]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=7,
    weight_decay=0.01
)

In [62]:
metric = load_metric("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    try:
      true_predictions = [
          [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
          for prediction, label in zip(predictions, labels)
      ]
    except:
      print(predictions)
      print(label)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Training BERT for Disease mentions identification

In [63]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 555
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 490


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.30969,0.433993,0.469093,0.450861,0.945976
2,No log,0.314895,0.404505,0.497978,0.446401,0.943576
3,No log,0.291618,0.427013,0.47487,0.449672,0.946642
4,No log,0.303323,0.446686,0.447718,0.447201,0.946742
5,No log,0.316931,0.432964,0.496245,0.46245,0.946114
6,No log,0.321666,0.462073,0.461005,0.461538,0.947747
7,No log,0.327836,0.45584,0.462161,0.458979,0.948225


***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-70
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-70/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-70/pytorch_model.bin
tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-70/tokenizer_config.json
Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-70/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 186
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-140
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased/checkpoint-140/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_b

TrainOutput(global_step=490, training_loss=0.005486757414681571, metrics={'train_runtime': 100.574, 'train_samples_per_second': 38.628, 'train_steps_per_second': 4.872, 'total_flos': 508316106100968.0, 'train_loss': 0.005486757414681571, 'epoch': 7.0})

In [64]:
trainer.save_model('model/model-1')

Saving model checkpoint to model/model-1
Configuration saved in model/model-1/config.json
Model weights saved in model/model-1/pytorch_model.bin
tokenizer config file saved in model/model-1/tokenizer_config.json
Special tokens file saved in model/model-1/special_tokens_map.json


In [65]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 186
  Batch size = 8


(186, 512, 84) (186, 512)


In [66]:
i=0
print(raw_datasets["validation"][i]['tokens'])
for j in range(len(preds[i])):
  print(raw_datasets["validation"][i]['ner_tags'][j], "\t", preds[i][j])
print(' '.join(raw_datasets["validation"][i]['tokens']))

['We', 'present', 'the', 'case', 'of', 'a', '29', '-', 'year', '-', 'old', 'male', '(', '165', 'cm', ',', '68', 'kg', ')', ',', 'Afghan', ',', 'police', 'officer', ',', 'who', 'suffered', 'an', 'open', 'traumatism', 'to', 'the', 'right', 'elbow', 'during', 'an', 'attack', 'secondary', 'to', 'the', 'impact', 'of', 'a', 'firearm', '(', 'probably', 'AK-74', '7.62', 'mm', ')', '.', 'Following', 'the', 'incident', ',', 'a', 'tourniquet', 'was', 'placed', 'at', 'humeral', 'level', ',', 'a', 'granulated', 'haemostatic', 'was', 'applied', 'topically', '(', 'Celox', '®', 'SAM', 'Medical', 'Products', ',', 'Newport', ',', 'Oregon', ',', 'USA', ')', 'and', 'tranexamic', 'acid', '(', '1', 'g', 'iv', ')', 'was', 'administered', '.', 'He', 'was', 'evacuated', 'by', 'medical', 'helicopter', 'to', 'the', 'Spanish', 'Role', '2E', 'in', 'Herat', '(', 'Afghanistan', ')', 'arriving', 'at', 'the', 'triage', 'room', '70', 'minutes', 'after', 'sustaining', 'the', 'injury', '.', '\n\n', 'On', 'primary', 'asse

In [68]:
#Replace this with your own checkpoint. If you have run all the previous cells successfully, the model should be available at your hugging face account with the name: NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased
model_checkpoint = 'model/model-1'
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

loading configuration file model/model-1/config.json
Model config DistilBertConfig {
  "_name_or_path": "model/model-1",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-Activity",
    "2": "B-Administration",
    "3": "B-Age",
    "4": "B-Area",
    "5": "B-Biological_attribute",
    "6": "B-Biological_structure",
    "7": "B-Clinical_event",
    "8": "B-Color",
    "9": "B-Coreference",
    "10": "B-Date",
    "11": "B-Detailed_description",
    "12": "B-Diagnostic_procedure",
    "13": "B-Disease_disorder",
    "14": "B-Distance",
    "15": "B-Dosage",
    "16": "B-Duration",
    "17": "B-Family_history",
    "18": "B-Frequency",
    "19": "B-Height",
    "20": "B-History",
    "21": "B-Lab_value",
    "22": "B-Mass",
    "23": "B-Medication",
    "24": "B-Non[biological](Detailed_description",
    "25": "B-Nonbiological_

In [71]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 186
  Batch size = 8


(186, 512, 84) (186, 512)


In [75]:
preds

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

# Sentence Based Modelling

In [76]:
dic = {"tokens": sent_tokenized, "ner_tags": label_sent_tokenized} #Use this option if you want to check the model performance with sentences tokenized by ". " b

In [77]:
dataset = Dataset.from_dict(dic)

In [78]:
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 11668
})

In [79]:
#For training, validation, and test partitions
"""
#Train, val, test partitions
train_test = dataset.train_test_split()
test_val = train_test['test'].train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': test_val['train'],
    'test': test_val['test']
    })
"""

#Just for training and validation partitions
train_test = dataset.train_test_split()
raw_datasets = DatasetDict({
    'train': train_test['train'],
    'validation': train_test['test']
    })

In [80]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8751
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2917
    })
})

In [81]:
raw_datasets["train"][0]["ner_tags"]
#raw_datasets["train"][0]["pos_tags"]
#raw_datasets["train"][0]["chunk_tags"]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [82]:
raw_datasets['train']

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 8751
})

In [83]:
label_names = ['O','B','I']
label_names

['O', 'B', 'I']

In [84]:
words = raw_datasets["train"][0]["tokens"]
labels = [int(n) for n in raw_datasets["train"][0]["ner_tags"]]
#labels = raw_datasets["train"][0]["pos_tags"]
#labels = raw_datasets["train"][0]["chunk_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Since 2006 she had tried several times to lose weight , without success . 
O     O    O   O   O     O       O     O  O    O      O O       O       O 


In [85]:
model_checkpoint = "d4data/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/tokenizer_config.json


In [86]:
tokenizer.is_fast

True

In [87]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'since',
 '2006',
 'she',
 'had',
 'tried',
 'several',
 'times',
 'to',
 'lose',
 'weight',
 ',',
 'without',
 'success',
 '.',
 '[SEP]']

In [88]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [89]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [90]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [91]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [92]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [93]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    2,    2,    2,    2,    2,    2,    2,    0, -100]])

In [94]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [95]:
model = AutoModelForTokenClassification.from_pretrained(    
    model_checkpoint
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--d4data--biomedical-ner-all/snapshots/e87917020da1384aed6e93b1b46d68771f65ddab/config.json
Model config DistilBertConfig {
  "_name_or_path": "d4data/biomedical-ner-all",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-Activity",
    "2": "B-Administration",
    "3": "B-Age",
    "4": "B-Area",
    "5": "B-Biological_attribute",
    "6": "B-Biological_structure",
    "7": "B-Clinical_event",
    "8": "B-Color",
    "9": "B-Coreference",
    "10": "B-Date",
    "11": "B-Detailed_description",
    "12": "B-Diagnostic_procedure",
    "13": "B-Disease_disorder",
    "14": "B-Distance",
    "15": "B-Dosage",
    "16": "B-Duration",
    "17": "B-Family_history",
    "18": "B-Frequency",
    "19": "B-Height",
    "20": "B-History",
    "21": "

In [97]:
args = TrainingArguments(
    "NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=7,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [98]:
metric = load_metric("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    try:
      true_predictions = [
          [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
          for prediction, label in zip(predictions, labels)
      ]
    except:
      print(predictions)
      print(label)
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [99]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 8751
  Num Epochs = 7
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7658


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1525,0.155313,0.470948,0.318676,0.38013,0.944582
2,0.1027,0.153319,0.478896,0.457838,0.46813,0.95006
3,0.0616,0.164578,0.485082,0.487843,0.486459,0.949786
4,0.0382,0.225899,0.490498,0.494051,0.492268,0.951071
5,0.0204,0.255147,0.452337,0.510605,0.479708,0.949585


***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-1094
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-1094/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-1094/pytorch_model.bin
tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-1094/tokenizer_config.json
Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-1094/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-2188
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoi

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1525,0.155313,0.470948,0.318676,0.38013,0.944582
2,0.1027,0.153319,0.478896,0.457838,0.46813,0.95006
3,0.0616,0.164578,0.485082,0.487843,0.486459,0.949786
4,0.0382,0.225899,0.490498,0.494051,0.492268,0.951071
5,0.0204,0.255147,0.452337,0.510605,0.479708,0.949585
6,0.0136,0.268314,0.480081,0.492499,0.48621,0.951123
7,0.0067,0.314802,0.485128,0.489395,0.487252,0.951366


***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-6564
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-6564/config.json
Model weights saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-6564/pytorch_model.bin
tokenizer config file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-6564/tokenizer_config.json
Special tokens file saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-6564/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoint-7658
Configuration saved in NLP-CIC-WFU_DisTEMIST_fine_tuned_bert-base-multilingual-cased-sentence/checkpoi

TrainOutput(global_step=7658, training_loss=0.060400033921789456, metrics={'train_runtime': 321.1595, 'train_samples_per_second': 190.737, 'train_steps_per_second': 23.845, 'total_flos': 1133303869684008.0, 'train_loss': 0.060400033921789456, 'epoch': 7.0})

In [100]:
trainer.save_model('model/model-2-sentence')

Saving model checkpoint to model/model-2-sentence
Configuration saved in model/model-2-sentence/config.json
Model weights saved in model/model-2-sentence/pytorch_model.bin
tokenizer config file saved in model/model-2-sentence/tokenizer_config.json
Special tokens file saved in model/model-2-sentence/special_tokens_map.json
