In [1]:
from huggingface_utils.labels import IOB as iob
from huggingface_utils.labels import LabelAligner

**Create an IOB-formatted mapper**

In [2]:
# NER tag set: each B-<tag> has an I-<tag> counterpart, so the mappers send B-<tag> to I-<tag>
ner_label_names = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
    'B-MISC', 'I-MISC',
]

# POS tag set: there are no B-/I- prefixed tags, so the mappers come back as empty dictionaries
pos_label_names = [
    '"', "''", '#', '$', '(', ')', ',', '.', ':', '``',
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
    'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
]

# Chunking tag set: same B-<tag> -> I-<tag> treatment as the NER tags
chunk_label_names = [
    'O',
    'B-ADJP', 'I-ADJP',
    'B-ADVP', 'I-ADVP',
    'B-CONJP', 'I-CONJP',
    'B-INTJ', 'I-INTJ',
    'B-LST', 'I-LST',
    'B-NP', 'I-NP',
    'B-PP', 'I-PP',
    'B-PRT', 'I-PRT',
    'B-SBAR', 'I-SBAR',
    'B-UCP', 'I-UCP',
    'B-VP', 'I-VP',
]

# Name-level mappers (str -> str), e.g. 'B-PER' -> 'I-PER'
ner_mapper = iob.create_iob_str_mapper(ner_label_names)
chunk_mapper = iob.create_iob_str_mapper(chunk_label_names)
pos_mapper = iob.create_iob_str_mapper(pos_label_names)

# Id-level mappers (int -> int), keyed by each tag's position in its label list
ner_label_mapper = iob.create_iob_label_mapper(ner_label_names)
chunk_label_mapper = iob.create_iob_label_mapper(chunk_label_names)
pos_label_mapper = iob.create_iob_label_mapper(pos_label_names)

# One print per task: heading, str mapper, int mapper, each on its own line
print("Ner Mapper", ner_mapper, ner_label_mapper, sep="\n")
print("\nChunk Mapper", chunk_mapper, chunk_label_mapper, sep="\n")
print("\nPos Mapper", pos_mapper, pos_label_mapper, sep="\n")

Ner Mapper
{'B-PER': 'I-PER', 'B-ORG': 'I-ORG', 'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC'}
{1: 2, 3: 4, 5: 6, 7: 8}

Chunk Mapper
{'B-ADJP': 'I-ADJP', 'B-ADVP': 'I-ADVP', 'B-CONJP': 'I-CONJP', 'B-INTJ': 'I-INTJ', 'B-LST': 'I-LST', 'B-NP': 'I-NP', 'B-PP': 'I-PP', 'B-PRT': 'I-PRT', 'B-SBAR': 'I-SBAR', 'B-UCP': 'I-UCP', 'B-VP': 'I-VP'}
{1: 2, 3: 4, 5: 6, 7: 8, 9: 10, 11: 12, 13: 14, 15: 16, 17: 18, 19: 20, 21: 22}

Pos Mapper
{}
{}


**Label alignment, simple case** - Map the labels of special tokens to -100

In [3]:
# Test Simple Case
# `labels` has one entry per word; `word_ids` has one entry per token and gives the
# index of the word that token came from (None marks a special token).
# `want_label` is the expected per-token result: special tokens become -100 and
# every token of a word receives that word's label.
labels =     [       1,  0,      1,    1]
word_ids =   [None,  0,  1,  1,  2,    3,  None]
want_label = [-100,  1,  0,  0,  1,    1,  -100]

aligner = LabelAligner()
print("Simple Case Test:")
print("Expected: ", want_label)
aligned_labels = aligner(labels, word_ids)
print("Got     : ", aligned_labels)

# Make the check executable: without this the cell "passes" even when the two
# printed lines differ, so a regression would go unnoticed on Run All.
assert list(aligned_labels) == want_label, (
    f"simple case mismatch: expected {want_label}, got {aligned_labels}"
)

Simple Case Test:
Expected:  [-100, 1, 0, 0, 1, 1, -100]
Got     :  [-100, 1, 0, 0, 1, 1, -100]


**Label alignment, IOB case** - Map the labels of special tokens to -100 and change the labels of continuation subwords from `B-<tag>` to `I-<tag>`

In [4]:
# Test IOB case
# One label per word (18 words); word 10 carries label 7 ('B-MISC').
# `word_ids` gives, per token, the index of its source word (None marks a special token).
# Word 10 is split into two tokens: the first is expected to keep 7 ('B-MISC') and
# the second to become 8 ('I-MISC'); special tokens are expected to become -100.
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
labels = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0]
word_ids = [None,0,1,1,1,2,3,4,5,5,5,5,5,5,6,7,7,7,8,9,9,10,10,11,12,13,14,15,16,17,None]
want_label = [-100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,8,0,0,0,0,0,0,0,-100]

aligner = LabelAligner(label_names = label_names, use_iob = True)
print("\nIOB Case Test:")
print("Expected: ", want_label)
aligned_labels = aligner(labels, word_ids)
print("Got     : ", aligned_labels)

# Make the check executable: without this the cell "passes" even when the two
# printed lines differ, so a regression would go unnoticed on Run All.
assert list(aligned_labels) == want_label, (
    f"IOB case mismatch: expected {want_label}, got {aligned_labels}"
)


IOB Case Test:
Expected:  [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, -100]
Got     :  [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, -100]
