In [20]:
import os
import tensorflow_datasets as tfds
import datasets
from transformers import AutoTokenizer

from utils_display import pc

# conll2003 dataset

In [26]:
path_to_conll2003_dataset = os.path.join("local_datasets", "conll2003")
dataset_train = datasets.load_from_disk(os.path.join(path_to_conll2003_dataset, "train.hf"))

In [3]:
pos_tags2indices = {
    '"': 0, "''": 1, '#': 2, '$': 3, '(': 4, ')': 5, ',': 6, '.': 7, ':': 8, '``': 9, 'CC': 10, 'CD': 11, 'DT': 12,
    'EX': 13, 'FW': 14, 'IN': 15, 'JJ': 16, 'JJR': 17, 'JJS': 18, 'LS': 19, 'MD': 20, 'NN': 21, 'NNP': 22, 'NNPS': 23,
    'NNS': 24, 'NN|SYM': 25, 'PDT': 26, 'POS': 27, 'PRP': 28, 'PRP$': 29, 'RB': 30, 'RBR': 31, 'RBS': 32, 'RP': 33,
    'SYM': 34, 'TO': 35, 'UH': 36, 'VB': 37, 'VBD': 38, 'VBG': 39, 'VBN': 40, 'VBP': 41, 'VBZ': 42, 'WDT': 43,
    'WP': 44, 'WP$': 45, 'WRB': 46
}

chunk_tags2indices = {
    'O': 0, 'B-ADJP': 1, 'I-ADJP': 2, 'B-ADVP': 3, 'I-ADVP': 4, 'B-CONJP': 5, 'I-CONJP': 6, 'B-INTJ': 7, 'I-INTJ': 8,
    'B-LST': 9, 'I-LST': 10, 'B-NP': 11, 'I-NP': 12, 'B-PP': 13, 'I-PP': 14, 'B-PRT': 15, 'I-PRT': 16, 'B-SBAR': 17,
    'I-SBAR': 18, 'B-UCP': 19, 'I-UCP': 20, 'B-VP': 21, 'I-VP': 22
}

ner_tags2indices = {
    'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8
}

In [4]:
def create_dico_indices2tags(dico_tags2indices: dict) -> dict:
    dico_indices2tags = dict()
    for key in dico_tags2indices:
        dico_indices2tags[dico_tags2indices[key]] = key
    return dico_indices2tags

In [5]:
pos_indices2tags = create_dico_indices2tags(dico_tags2indices=pos_tags2indices)
chunk_indices2tags = create_dico_indices2tags(dico_tags2indices=chunk_tags2indices)
ner_indices2tags = create_dico_indices2tags(dico_tags2indices=ner_tags2indices)

In [10]:
def print_sample(sample) -> None:

    print("-"*74)
    print("{:<4} | {:<20} | {:<3} {:<10} | {:<3} {:<10} | {:<3} {:<10}".format(
        "INDEX", "TOKEN", "", "POS", "", "CHUNK", "", "NER"))
    print("-"*74)
    for index in range(len(sample["tokens"])):
        
        pos_index = sample["pos"][index]
        pos_tag = pos_indices2tags[pos_index]
    
        chunk_index = sample["chunks"][index]
        chunk_tag = chunk_indices2tags[chunk_index]
    
        ner_index = sample["ner"][index]
        ner_tag = ner_indices2tags[ner_index]    
        
        print("{:<5} | {:<20} | {:<3} {:<10} | {:<3} {:<10} | {:<3} {:<10}".format(
            index,
            sample["tokens"][index],
            pos_index,
            pos_tag,
            chunk_index,
            chunk_tag,
            ner_index,
            ner_tag
        ))    
    print("-"*74)

In [37]:
sample_index = 245
pc("Sample index", sample_index, break_line=True)

sample = dataset_train[sample_index]
for key in sample.keys():
    pc(key, sample[key])
print_sample(sample=sample)

[34mSample index[0m: 245

[34mindex[0m: 245
[34mpos[0m: [22, 38, 12, 21, 15, 12, 16, 21, 35, 37, 22, 38, 40, 12, 16, 21, 15, 12, 16, 21, 10, 30, 22, 27, 21, 7]
[34mchunks[0m: [11, 21, 11, 12, 13, 11, 12, 12, 21, 22, 11, 21, 22, 11, 12, 12, 13, 11, 12, 12, 0, 0, 11, 11, 12, 0]
[34mner[0m: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0]
[34mtokens[0m: ['Ischinger', 'said', 'the', 'threat', 'of', 'a', 'major', 'assault', 'to', 'take', 'Grozny', 'had', 'been', 'the', 'unauthorised', 'initiative', 'of', 'the', 'commanding', 'general', 'and', 'not', 'Moscow', "'s", 'intention', '.']
--------------------------------------------------------------------------
INDEX | TOKEN                |     POS        |     CHUNK      |     NER       
--------------------------------------------------------------------------
0     | Ischinger            | 22  NNP        | 11  B-NP       | 1   B-PER     
1     | said                 | 38  VBD        | 21  B-VP       

In [38]:
model_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [44]:
def tokenize_labels(sample):
    tokenized_inputs = tokenizer(sample["tokens"], truncation=True, padding=True, is_split_into_words=True)
    tokenizer_inputs[pos""] = sample[""]
    return tokenized_inputs

In [45]:
tokenized_dataset = dataset_train.map(
    tokenize_labels,
    batched=True,
    remove_columns=dataset_train.column_names
)

Map:   0%|          | 0/14042 [00:00<?, ? examples/s]

In [46]:
print(tokenized_dataset[sample_index])

{'input_ids': [101, 2003, 8450, 2121, 2056, 1996, 5081, 1997, 1037, 2350, 6101, 2000, 2202, 24665, 18153, 4890, 2018, 2042, 1996, 14477, 14317, 21239, 2098, 6349, 1997, 1996, 7991, 2236, 1998, 2025, 4924, 1005, 1055, 6808, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [32]:




    
inputs = tokenizer(dataset_train["tokens"][sample_index], is_split_into_words=True)
pos = dataset_train["pos"][0]
word_ids = inputs.word_ids()

print(word_ids)
print(inputs)

[None, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 11, 12, 13, 14, 14, 14, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23, 24, 25, None]
{'input_ids': [101, 2003, 8450, 2121, 2056, 1996, 5081, 1997, 1037, 2350, 6101, 2000, 2202, 24665, 18153, 4890, 2018, 2042, 1996, 14477, 14317, 21239, 2098, 6349, 1997, 1996, 7991, 2236, 1998, 2025, 4924, 1005, 1055, 6808, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
