In [None]:
import platform
import numpy as np
import pandas as pd
import random
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# enable tqdm's pandas integration, using the notebook flavour of tqdm imported above
tqdm.pandas()

# select device: prefer CUDA, then Apple's MPS backend, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    # ask torch whether MPS is really usable instead of guessing from the
    # platform string: an arm64 machine can run a torch build without MPS
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# seed shared by all random number generators (set to None to leave them unseeded)
seed = 1234

# seed torch, numpy and python's own generator so runs are repeatable
if seed is not None:
    print(f'random seed: {seed}')
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

# name of the pretrained transformer checkpoint to use
transformer_name = 'distilbert-base-cased' # alternative: 'xlm-roberta-base'
# tokenizer for that checkpoint; read_dataframe() reads this module-level name
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

In [None]:

# map labels to the first token in each word
def align_labels(word_ids, labels, label_to_index, ignore_index=-100):
    """Project word-level labels onto a sequence of sub-word tokens.

    Only the first token of each word receives that word's label id. Tokens
    that belong to no word (word id ``None``) and tokens that continue the
    previous word receive ``ignore_index`` instead.

    Args:
        word_ids: one entry per token; the index of the word the token came
            from, or ``None`` when the token does not belong to a word.
        labels: word-level labels, indexed by word id.
        label_to_index: mapping from label to its integer id.
        ignore_index: value stored at positions that carry no label. The
            default of -100 is the default ``ignore_index`` of
            ``CrossEntropyLoss``.

    Returns:
        A list of integers with the same length as ``word_ids``.

    Raises:
        IndexError: if a word id has no entry in ``labels``.
        KeyError: if a label is missing from ``label_to_index``.
    """
    label_ids = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None or word_id == previous_word_id:
            # not a word, or a continuation of the word seen just before
            label_ids.append(ignore_index)
        else:
            # first token of a new word: look up the id of the word's label
            label_ids.append(label_to_index[labels[word_id]])
        # remember this word id so that its remaining tokens are skipped
        previous_word_id = word_id

    return label_ids
            
# build a set of labels in the dataset            
def read_label_set(fn):
    """Collect the distinct labels that occur in a dataset file.

    Each non-blank line is split on whitespace and its last field is taken
    as the label; blank lines are skipped.

    Args:
        fn: path of the file to read (decoded as UTF-8).

    Returns:
        The set of label strings found in the file.
    """
    labels = set()
    # explicit encoding so the result does not depend on the platform default
    with open(fn, encoding='utf-8') as f:
        for line in f:
            # split() with no argument already ignores surrounding whitespace
            tokens = line.split()
            if tokens:
                labels.add(tokens[-1])
    return labels

# converts a two-column file in the basic MTL format ("word \t label") into a dataframe
def read_dataframe(fn):
    """Read a "word <whitespace> label" file into a dataframe, one row per sentence.

    Sentences are separated by blank lines. On each non-blank line the first
    field is the word and the last field is the label, which is the same
    field read_label_set() uses to build the label inventory. Every sentence
    is tokenized with the module-level ``tokenizer`` and its labels are
    aligned to the first token of each word with align_labels().

    Args:
        fn: path of the file to read (decoded as UTF-8).

    Returns:
        A pandas DataFrame with the columns
        ``words`` (list of words), ``labels`` (list of word-level labels),
        ``token_ids`` (the tokenizer's ``input_ids``), ``word_ids`` (word
        index per token, as reported by the tokenizer) and ``token_labels``
        (output of align_labels()).
    """
    # sort the labels so that the label <-> index mapping is the same on
    # every run; enumerating the set directly depends on string hashing
    labels = sorted(read_label_set(fn))
    index_to_label = dict(enumerate(labels))
    label_to_index = {label: index for index, label in index_to_label.items()}
    print("index_to_label: ", index_to_label)

    # now build the actual dataframe for this dataset
    data = {'words': [], 'labels': [], 'token_ids': [], 'word_ids': [], 'token_labels': []}

    def add_sentence(sent_words, sent_labels):
        # tokenize one sentence and append its row to every column of `data`
        token_input = tokenizer(sent_words, is_split_into_words = True)
        word_ids = token_input.word_ids(batch_index = 0)
        data['words'].append(sent_words)
        data['labels'].append(sent_labels)
        data['token_ids'].append(token_input['input_ids'])
        data['word_ids'].append(word_ids)
        # map labels to the first token in each word
        data['token_labels'].append(align_labels(word_ids, sent_labels, label_to_index))

    with open(fn, encoding='utf-8') as f:
        sent_words = []
        sent_labels = []
        for line in tqdm(f):
            tokens = line.split()
            if tokens:
                sent_words.append(tokens[0])
                # last field, consistent with read_label_set()
                sent_labels.append(tokens[-1])
            elif sent_words:
                # a blank line closes the current sentence; repeated or
                # leading blank lines must not produce empty sentences
                add_sentence(sent_words, sent_labels)
                sent_words = []
                sent_labels = []
        if sent_words:
            # the file did not end with a blank line: keep the last sentence
            add_sentence(sent_words, sent_labels)
    return pd.DataFrame(data)


In [None]:
# build the dataframe from the dataset file (path is relative to the working directory)
dataframe = read_dataframe("data/conll-ner/train_small.txt")
# last expression of the cell, so the notebook displays the dataframe
dataframe