In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset
import ratransformers

# Load the CoNLL-2003 dataset by name; the cells below read examples from its 'train' split
dataset = load_dataset('conll2003')

# POS tag names copied from https://huggingface.co/datasets/conll2003.
# They are listed in id order, so a tag's position in the listing is its integer id.
pos_tag_to_id = {
    tag: tag_id
    for tag_id, tag in enumerate([
        '"', "''", '#', '$', '(', ')', ',', '.', ':', '``',
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
        'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
        'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
    ])
}

# Reverse lookup: integer id -> POS tag name
id_to_pos_tag = {tag_id: tag for tag, tag_id in pos_tag_to_id.items()}

Reusing dataset conll2003 (/home/ola/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from transformers import AutoModelForTokenClassification

# Build the RATransformer wrapper around the "dslim/bert-base-NER" checkpoint.
# Every POS tag name is registered as one relation kind, and the underlying
# model is loaded as a token-classification model.
ratransformer = ratransformers.RATransformer(
    "dslim/bert-base-NER",
    relation_kinds=[*pos_tag_to_id],
    model_cls=AutoModelForTokenClassification,
)

# Keep direct handles on the tokenizer and model used by the cells below
tokenizer, model = ratransformer.tokenizer, ratransformer.model

In [4]:
# Show the first training example: its tokens plus the integer-encoded POS, chunk and NER tags
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
from collections import defaultdict

# Map every word's character span to its POS tag.
# Offsets are measured in the sentence obtained by joining the tokens with single
# spaces; each span is related to itself, with the POS tag name as the relation.
first_example = dataset['train'][0]
word_relations = defaultdict(dict)
start = 0
for word, tag_id in zip(first_example['tokens'], first_example['pos_tags']):
    end = start + len(word)
    word_relations[(start, end)][(start, end)] = id_to_pos_tag[tag_id]
    start = end + 1  # step over the single space that separates consecutive words
word_relations

defaultdict(dict,
            {(0, 2): {(0, 2): 'NNP'},
             (3, 10): {(3, 10): 'VBZ'},
             (11, 17): {(11, 17): 'JJ'},
             (18, 22): {(18, 22): 'NN'},
             (23, 25): {(23, 25): 'TO'},
             (26, 33): {(26, 33): 'VB'},
             (34, 41): {(34, 41): 'JJ'},
             (42, 46): {(42, 46): 'NN'},
             (47, 48): {(47, 48): '.'}})

In [6]:
# Tokenize the space-joined sentence, handing the POS relations to the tokenizer
text = " ".join(dataset['train'][0]['tokens'])
encoding = tokenizer(text, return_tensors="pt", input_relations=word_relations)

# Forward pass, then keep the highest-scoring label id for every token
outputs = model(**encoding)
labels = outputs.logits.argmax(-1)
tokens_to_labels = [model.config.id2label[label_id.item()] for label_id in labels[0]]

# Print each token's text (recovered from its character offsets) with its predicted NER tag
for (start, end), label in zip(encoding['offset_mapping'][0].tolist(), tokens_to_labels):
    token = text[start:end]
    if token:  # skip special tokens, whose span selects no text
        print(token, label)

EU B-ORG
rejects O
German B-MISC
call O
to O
boycott O
British B-MISC
la O
mb O
. O


**Your model is now ready to be trained with relational information in the input!**

See the standard procedure for training Hugging Face 🤗 models [here](https://huggingface.co/docs/transformers/training).