In [57]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import os, torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained("kamalkraj/BioELECTRA-PICO")
model = AutoModelForTokenClassification.from_pretrained("kamalkraj/BioELECTRA-PICO")

In [3]:
# Set CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda")
device

device(type='cpu')

In [4]:
model.to(device)

ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [5]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

No CUDA runtime is found, using CUDA_HOME='/usr'


In [6]:

example = '''Recent evidence suggests that critically ill patients are able to tolerate lower levels of
haemoglobin than was previously believed. It is our goal to show that transfusing to a level of 100
g/l does not improve mortality and other clinically important outcomes in a critical care setting.
Although many questions remain, many laboratory and clinical studies, including a recent
randomized controlled trial (RCT), have established that transfusing to normal haemoglobin
concentrations does not improve organ failure and mortality in the critically ill patient. In addition,
a restrictive transfusion strategy will reduce exposure to allogeneic transfusions, result in more
e cient use of red blood cells (RBCs), save blood overall, and decrease health care costs.'''

ner_results = nlp(example)
print(ner_results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'I-Participants', 'score': 0.8316009, 'index': 5, 'word': 'critically', 'start': 30, 'end': 40}, {'entity': 'I-Participants', 'score': 0.79285794, 'index': 6, 'word': 'ill', 'start': 41, 'end': 44}, {'entity': 'I-Participants', 'score': 0.77887094, 'index': 7, 'word': 'patients', 'start': 45, 'end': 53}, {'entity': 'I-Outcome', 'score': 0.52259195, 'index': 41, 'word': 'mortality', 'start': 212, 'end': 221}, {'entity': 'I-Participants', 'score': 0.65704894, 'index': 92, 'word': 'critically', 'start': 537, 'end': 547}, {'entity': 'I-Participants', 'score': 0.6281377, 'index': 93, 'word': 'ill', 'start': 548, 'end': 551}, {'entity': 'I-Outcome', 'score': 0.58772475, 'index': 107, 'word': 'allogeneic', 'start': 633, 'end': 643}, {'entity': 'I-Outcome', 'score': 0.6532099, 'index': 131, 'word': 'health', 'start': 745, 'end': 751}, {'entity': 'I-Outcome', 'score': 0.70205945, 'index': 132, 'word': 'care', 'start': 752, 'end': 756}, {'entity': 'I-Outcome', 'score': 0.6884015, 'in

#### Training

In [9]:
import torch
import os
import pandas as pd
import numpy as np
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

In [2]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [3]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=512
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [4]:
df_cols = ["pmcid","tokens",'ner_tags']
data_df = pd.DataFrame(columns=df_cols)
data_df

Unnamed: 0,pmcid,tokens,ner_tags


In [5]:
id2label = {0: 'O',
            1: 'B-Patient',
            2: 'I-Patient',
            3: 'B-Intervention',
            4: 'I-Intervention',
            5: 'B-Comparison',
            6: 'I-Comparison',
            7: 'B-Outcome',
            8: 'I-Outcome'}

In [6]:
label2id = {'O': 0,
            'B-Patient': 1,
            'I-Patient': 2,
            'B-Intervention': 3,
            'I-Intervention': 4,
            'B-Comparison': 5,
            'I-Comparison': 6,
            'B-Outcome': 7,
            'I-Outcome': 8}

In [7]:
data_path = '/home/maaz-lfd/Maaz/Thesis/Thesis/dataset/conll_format'
files = os.listdir(data_path)

In [8]:
for i in range(len(files)):
    lines = open(os.path.join(data_path,files[i])).readlines()[1:]
    tokens = []
    ner_tags = []
    for line in lines:
        try:
            tokens.append(line.split()[0])
            ner_tags.append(label2id[line.split()[-1]])
        except IndexError:
            pass
        data_df.loc[i] = [files[i].split('.')[0],tokens,ner_tags]
    


  arr_value = np.asarray(value)


In [9]:
data_df

Unnamed: 0,pmcid,tokens,ner_tags
0,PMC3303828,"[The, prevalence, of, preexisting, immunity, t...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,PMC2875942,"[OBJECTIVES, :, To, determine, the, frequency,...","[0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0, 1, 2, ..."
2,PMC4336463,"[OBJECTIVES, :, Skin, biopsies, from, local, s...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,PMC1481583,"[BACKGROUND, :, Patients, with, chronic, obstr...","[0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, ..."
4,PMC3438601,"[Antisense, oligonucleotides, (, AOs, ), are, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, ..."
...,...,...,...
474,PMC2747378,"[Recombinant, vesicular, stomatitis, viruses, ...","[3, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 3, 4, 0, ..."
475,PMC3331808,"[BACKGROUND, :, In, 2009, ,, there, was, an, i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
476,PMC5809586,"[Cold-inducible, RNA-binding, protein, (, CIRP...","[1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, ..."
477,PMC3488323,"[BACKGROUND, :, Despite, bronchiectasis, being...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
dataset = Dataset.from_pandas(data_df)

In [11]:
dataset

Dataset({
    features: ['pmcid', 'tokens', 'ner_tags', '__index_level_0__'],
    num_rows: 479
})

In [12]:
label_names = ['O', 'B-Patient', 'I-Patient', 'B-Intervention', 'I-Intervention', 'B-Comparison', 'I-Comparison', 'B-Outcome', 'I-Outcome']

In [13]:
model_checkpoint = "kamalkraj/BioELECTRA-PICO"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
inputs = tokenizer(dataset[0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'the',
 'prevalence',
 'of',
 'preexisting',
 'immunity',
 'to',
 'adenovirus',
 '##es',
 'in',
 'the',
 'majority',
 'of',
 'the',
 'human',
 'population',
 'might',
 'adversely',
 'impact',
 'the',
 'development',
 'of',
 'adaptive',
 'immune',
 'responses',
 'against',
 'adenovirus',
 'vector',
 '-',
 'based',
 'vaccines',
 '.',
 'to',
 'address',
 'this',
 'issue',
 ',',
 'we',
 'primed',
 'balb',
 '/',
 'c',
 'mice',
 'either',
 'intranasal',
 '##ly',
 '(',
 'i',
 '.',
 'n',
 '.',
 ')',
 'or',
 'intramuscularly',
 '(',
 'i',
 '.',
 'm',
 '.',
 ')',
 'with',
 'varying',
 'doses',
 'of',
 'wild',
 'type',
 '(',
 'wt',
 ')',
 'human',
 'adenovirus',
 'subtype',
 '5',
 '(',
 'had',
 '##5',
 ')',
 '.',
 'following',
 'the',
 'development',
 'of',
 'immunity',
 'against',
 'had',
 '##5',
 ',',
 'we',
 'immunized',
 'animals',
 'via',
 'the',
 'i',
 '.',
 'n',
 '.',
 'or',
 'i',
 '.',
 'm',
 '.',
 'route',
 'of',
 'inoculation',
 'with',
 'a',
 'had',
 'vector',
 '(',
 'had',


In [15]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 25,
 25,
 26,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 39,
 40,
 41,
 41,
 41,
 41,
 42,
 43,
 44,
 45,
 46,
 46,
 46,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 78,
 78,
 78,
 79,
 80,
 80,
 80,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 89,
 89,
 89,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 130,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 148,
 148,
 148,
 148,
 148,
 149,
 150,
 151,
 151,
 151,
 152,


In [16]:
labels = data_df["ner_tags"][0]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0,

In [17]:
dataset.column_names

['pmcid', 'tokens', 'ner_tags', '__index_level_0__']

In [18]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names,
    
)

Map:   0%|          | 0/479 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 479
})

In [20]:
tokenized_datasets['attention_mask']

[[1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [22]:
data_collator

DataCollatorForTokenClassification(tokenizer=ElectraTokenizerFast(name_or_path='kamalkraj/BioELECTRA-PICO', vocab_size=28895, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [23]:
# batch = data_collator([tokenized_datasets[i] for i in range(2)])
# batch["labels"]
batch = data_collator([tokenized_datasets[300]])
batch["labels"]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    3,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    2,    2,
            2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    3,    4,
            4,    4,    4,    4,    4,    4,    4,    3,    4,    4,    4,    4,
            4,    4,    4,    4,    4,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [24]:
print(tokenized_datasets[300]["labels"])
# for i in range(2):
#     print(tokenized_datasets[i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [25]:
metric = evaluate.load("seqeval")

In [26]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [27]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/BioELECTRA-PICO and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model.config.num_labels

9

In [29]:
# Set CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda")
device

device(type='cuda')

In [30]:
model.to(device)

ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [48]:
args = TrainingArguments(
    "BioELECTRA-PICO",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    auto_find_batch_size=True
)

In [49]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 479
})

In [50]:
tokenized_datasets['token_type_ids'][0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [51]:
for i in tokenized_datasets:
    print(len(i['input_ids']),'\t',len(i['labels']))

300 	 300
329 	 329
344 	 344
400 	 400
269 	 269
119 	 119
342 	 342
512 	 512
386 	 386
452 	 452
256 	 256
323 	 323
363 	 363
160 	 160
134 	 134
459 	 459
229 	 229
179 	 179
205 	 205
254 	 254
295 	 295
182 	 182
264 	 264
512 	 512
483 	 483
194 	 194
226 	 226
261 	 261
83 	 83
432 	 432
332 	 332
179 	 179
339 	 339
302 	 302
255 	 255
248 	 248
262 	 262
309 	 309
170 	 170
336 	 336
246 	 246
512 	 512
157 	 157
201 	 201
318 	 318
301 	 301
242 	 242
304 	 304
339 	 339
47 	 47
280 	 280
147 	 147
335 	 335
135 	 135
328 	 328
296 	 296
210 	 210
330 	 330
445 	 445
291 	 291
269 	 269
247 	 247
376 	 376
204 	 204
444 	 444
247 	 247
410 	 410
375 	 375
327 	 327
382 	 382
275 	 275
512 	 512
140 	 140
331 	 331
198 	 198
169 	 169
161 	 161
203 	 203
287 	 287
150 	 150
360 	 360
273 	 273
99 	 99
317 	 317
124 	 124
252 	 252
412 	 412
234 	 234
365 	 365
512 	 512
191 	 191
219 	 219
363 	 363
185 	 185
256 	 256
512 	 512
193 	 193
293 	 293
424 	 424
388 	 388
295 	 

In [52]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()



  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/1200 [00:00<?, ?it/s]

  0%|          | 0/2400 [00:00<?, ?it/s]

  0%|          | 0/4790 [00:00<?, ?it/s]

{'loss': 0.3644, 'learning_rate': 1.791231732776618e-05, 'epoch': 1.04}
{'loss': 0.2696, 'learning_rate': 1.582463465553236e-05, 'epoch': 2.09}
{'loss': 0.1831, 'learning_rate': 1.373695198329854e-05, 'epoch': 3.13}
{'loss': 0.1416, 'learning_rate': 1.1649269311064719e-05, 'epoch': 4.18}
{'loss': 0.1023, 'learning_rate': 9.561586638830899e-06, 'epoch': 5.22}
{'loss': 0.0819, 'learning_rate': 7.473903966597078e-06, 'epoch': 6.26}
{'loss': 0.0624, 'learning_rate': 5.3862212943632574e-06, 'epoch': 7.31}
{'loss': 0.0499, 'learning_rate': 3.2985386221294363e-06, 'epoch': 8.35}
{'loss': 0.0354, 'learning_rate': 1.210855949895616e-06, 'epoch': 9.39}
{'train_runtime': 3422.6036, 'train_samples_per_second': 1.4, 'train_steps_per_second': 1.4, 'train_loss': 0.13722140087215287, 'epoch': 10.0}


TrainOutput(global_step=4790, training_loss=0.13722140087215287, metrics={'train_runtime': 3422.6036, 'train_samples_per_second': 1.4, 'train_steps_per_second': 1.4, 'train_loss': 0.13722140087215287, 'epoch': 10.0})

In [53]:
trainer.save_model('/home/maaz-lfd/Maaz/Thesis/Thesis/save_models/pico-ner')

In [46]:
res = trainer.predict(tokenized_datasets)

  0%|          | 0/60 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
res

PredictionOutput(predictions=array([[[ 2.7236292 , -1.1061649 , -1.031594  , ..., -1.1722089 ,
         -0.02199265,  2.2092273 ],
        [ 5.749409  , -0.89272594, -0.5913928 , ..., -1.4684955 ,
         -0.9917571 , -0.01220852],
        [ 5.6747622 , -0.7985093 , -0.5939327 , ..., -1.460507  ,
         -0.92879856, -0.07738012],
        ...,
        [ 1.8968238 , -1.3324113 , -1.0375075 , ..., -1.4756284 ,
         -0.17471163,  4.559887  ],
        [ 2.4041777 , -1.1916469 , -1.0613022 , ..., -1.3754526 ,
          0.14743075,  3.6156957 ],
        [ 1.4191749 , -1.138825  , -1.0084865 , ..., -1.4619763 ,
         -0.28672287,  4.643552  ]],

       [[ 2.9786217 , -0.91554725, -0.64481723, ..., -1.0924237 ,
         -0.32712197,  1.8466179 ],
        [ 5.5884166 , -1.0554539 , -0.6437091 , ..., -1.1428392 ,
         -1.0165137 , -0.38863176],
        [ 5.5788426 , -1.204653  , -0.5672829 , ..., -1.0133213 ,
         -0.9139675 , -0.400725  ],
        ...,
        [ 0.8327971 , -1.

In [109]:
torch.cuda.is_available()

True

#### Inferencing

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import spacy


2023-11-09 23:46:57.609337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-09 23:47:08.771694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-11-09 23:47:08.772183: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2023-11-09 23:47:24.748392: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but

In [2]:
tokenizer = AutoTokenizer.from_pretrained('/home/maaz-lfd/Maaz/Thesis/Thesis/save_models/pico-ner')
model = AutoModelForTokenClassification.from_pretrained('/home/maaz-lfd/Maaz/Thesis/Thesis/save_models/pico-ner')

In [3]:
# Set CUDA device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda")
device

device(type='cuda')

In [4]:
model.to(device)

ElectraForTokenClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [5]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer,device=device)

In [6]:
def get_entities_html(text, ner_result, title=None):
  """Visualize NER with the help of SpaCy"""
  ents = []
  for ent in ner_result:
    e = {}
    # add the start and end positions of the entity
    e["start"] = ent["start"]
    e["end"] = ent["end"]
    # add the score if you want in the label
    # e["label"] = f"{ent["entity"]}-{ent['score']:.2f}"
    e["label"] = ent["entity"]
    if ents and -1 <= ent["start"] - ents[-1]["end"] <= 1 and ents[-1]["label"] == e["label"]:
      # if the current entity is shared with previous entity
      # simply extend the entity end position instead of adding a new one
      ents[-1]["end"] = e["end"]
      continue
    ents.append(e)
  # construct data required for displacy.render() method
  render_data = [
    {
      "text": text,
      "ents": ents,
      "title": title,
    }
  ]

  options = {"ents": ['B-Patient', 'I-Patient', 
                      'B-Intervention', 'I-Intervention', 
                      'B-Comparison', 'I-Comparison',
                      'B-Outcome', 'I-Outcome'],
           "colors": {'B-Patient': "rgb(235,12,0)", 'I-Patient': "rgb(235,12,0)",
                      'B-Intervention':"rgb(83,211,13)", 'I-Intervention':"rgb(83,211,13)",
                      'B-Comparison':"rgb(62,124,249)", 'I-Comparison':"rgb(62,124,249)",
                      'B-Outcome':"rgb(244,198,16)", 'I-Outcome':"rgb(244,198,16)"}}
  spacy.displacy.render(render_data, style="ent", manual=True, jupyter=True,options=options)

In [7]:

text = '''In newborn dairy calves, it has been demonstrated that supranutritional maternal and colostral Se
supplementation using Se yeast or sodium selenite, respectively, improves passive transfer of
IgG. In beef cattle, agronomic bioforti cation with Se is a more practical alternative for Se
supplementation, whereby the Se concentration of hay is increased through the use of Se-
containing fertilizer amendments. It has been previously demonstrated that agronomic Se
bioforti cation is an e ective strategy to improve immunity and performance in Se-replete weaned
beef calves. The objective of this experiment was to determine the e ects of feeding beef cows
Se-enriched alfalfa (Medicago sativa) hay during the last 8 to 12 wk of gestation on passive
transfer of antibodies to calves. At 10 wk ± 16 d before calving, 45 cows were assigned to 1 of 3
treatment groups with 3 pens (5 cows/pen) per treatment: Control cows were fed non-Se-forti ed
alfalfa hay plus a mineral supplement containing 120 mg/kg Se from sodium selenite, Med-Se
cows were fed alfalfa hay fertilized with 45.0 g Se/ha as sodium selenate, and High-Se cows were
fed alfalfa hay fertilized with 89.9 g Se/ha as sodium selenate; both the Med-Se and the High-Se
groups received mineral supplement without added Se. Colostrum and whole blood (WB) were
collected from cows at calving, and WB was collected from calves within 2 h of calving and at 12,
24, 36, and 48 h of age. Concentrations of IgG1 and J-5 Escherichia coli antibody in cow
colostrum and calf serum were quanti ed using ELISA procedures. Selenium concentrations
linearly increased in WB (P < 0.001) and colostrum (P < 0.001) of cows and in WB of newborn
calves (P < 0.001) with increasing Se concentration in alfalfa hay. Colostrum concentrations of
IgG1 (P = 0.03) were increased in cows fed Se-bioforti ed alfalfa hay, but J-5 E. coli antibody (P =
0.43) concentrations were not. Calf serum IgG1 (P = 0.43) and J-5 E. coli antibody (P = 0.44)
concentrations during the rst 48 h of age were not a ected by prior Se treatment of cows. These
data suggest that feeding Se-bioforti ed alfalfa hay promotes the accumulation of Se and
antibodies in colostrum but does not a ect short-term serum antibody concentrations in calves.'''

ner_results = nlp(text)
print(ner_results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-Patient', 'score': 0.8201429, 'index': 38, 'word': 'beef', 'start': 200, 'end': 204}, {'entity': 'I-Patient', 'score': 0.73676217, 'index': 39, 'word': 'cattle', 'start': 205, 'end': 211}, {'entity': 'B-Intervention', 'score': 0.98218566, 'index': 41, 'word': 'agro', 'start': 213, 'end': 217}, {'entity': 'I-Intervention', 'score': 0.9943019, 'index': 42, 'word': '##no', 'start': 217, 'end': 219}, {'entity': 'I-Intervention', 'score': 0.9946597, 'index': 43, 'word': '##mic', 'start': 219, 'end': 222}, {'entity': 'I-Intervention', 'score': 0.99459696, 'index': 44, 'word': 'biof', 'start': 223, 'end': 227}, {'entity': 'I-Intervention', 'score': 0.99435824, 'index': 45, 'word': '##ort', 'start': 227, 'end': 230}, {'entity': 'I-Intervention', 'score': 0.99386036, 'index': 46, 'word': '##i', 'start': 230, 'end': 231}, {'entity': 'I-Intervention', 'score': 0.99241513, 'index': 47, 'word': 'cation', 'start': 232, 'end': 238}, {'entity': 'I-Intervention', 'score': 0.9801477, 'ind

In [8]:
get_entities_html(text, ner_results)

In [25]:
for i in ner_results:
    print(f"{i['word']}\t{i['entity']}\t{i['score']}")

recombinant	B-Intervention	0.989713191986084
salmonella	I-Intervention	0.9951883554458618
enterica	I-Intervention	0.9963545799255371
serovar	I-Intervention	0.9964113831520081
typhi	I-Intervention	0.9964359998703003
recombinant	B-Intervention	0.9872046709060669
salmonella	I-Intervention	0.9824662208557129
typhi	I-Intervention	0.9949526786804199
strain	I-Intervention	0.9951072931289673
pil	I-Intervention	0.9850533604621887
##s	I-Intervention	0.9965904951095581
##−	I-Intervention	0.9966318011283875
pil	I-Intervention	0.9966247081756592
##t	I-Intervention	0.997021496295929
##−	I-Intervention	0.9971338510513306
##ga	I-Intervention	0.9971906542778015
##g	I-Intervention	0.9973616003990173
+	I-Intervention	0.9971888661384583
(	I-Intervention	0.9968628883361816
pva	I-Intervention	0.9965268969535828
##x1	I-Intervention	0.9969124794006348
-	I-Intervention	0.9972350001335144
gp120	I-Intervention	0.9967451095581055
)	I-Intervention	0.9964239001274109
human	B-Intervention	0.759589433670044
virus	I-I

#### Experiments

In [1]:
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification
import evaluate
import pandas as pd
import os

2023-11-06 02:06:27.463411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-06 02:07:53.819592: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-11-06 02:07:53.820273: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [18]:
raw_datasets = load_dataset("conll2003")

Found cached dataset conll2003 (/home/maaz-lfd/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [20]:
raw_datasets['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [22]:
raw_datasets['train'][0]['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [28]:
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [7]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [8]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [4]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [5]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


In [6]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
tokenizer.is_fast

True

In [7]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [8]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [23]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [24]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [11]:
raw_datasets['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [25]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [26]:
raw_datasets['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [27]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Loading cached processed dataset at /home/maaz-lfd/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-6d589d9132a2412e.arrow
Loading cached processed dataset at /home/maaz-lfd/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-db96438342c93caa.arrow
Loading cached processed dataset at /home/maaz-lfd/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98/cache-35bc08eba38b5d41.arrow


In [32]:
tokenized_datasets['train'][0]['labels']

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [19]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [20]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"]
    )

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


In [21]:
metric = evaluate.load("seqeval")

In [22]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [23]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

In [24]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [25]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [27]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [28]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 431c1420-cc10-469d-bca0-245ee7f87115)')' thrown while requesting HEAD https://huggingface.co/bert-base-cased/resolve/main/config.json
Downloading model.safetensors: 100%|██████████| 436M/436M [04:12<00:00, 1.73MB/s] 
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
tokenized_datasets["train"]

NameError: name 'tokenized_datasets' is not defined

In [29]:
model.config.num_labels

9

In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
!huggingface-cli login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: Traceback (most recent call 

In [9]:
# def export_to_file(export_file_path, data):
#     with open(export_file_path, "w") as f:
#         for record in data:
#             ner_tags = record["ner_tags"]
#             tokens = record["tokens"]
#             if len(tokens) > 0:
#                 f.write(
#                     str(len(tokens))
#                     + "\t"
#                     + "\t".join(tokens)
#                     + "\t"
#                     + "\t".join(map(str, ner_tags))
#                     + "\n"
#                 )
# os.mkdir("data")
# export_to_file("./data/conll_train.txt", conll_data["train"])
# export_to_file("./data/conll_val.txt", conll_data["validation"])