In [1]:
import numpy as np
import pandas as pd
import random
import torch
from tqdm.notebook import tqdm

# enable tqdm progress bars for pandas operations
# (registers the `progress_apply` family of methods on pandas objects)
tqdm.pandas()

# flip to False to force the CPU even when a CUDA device is present
use_gpu = True

# resolve the compute device: CUDA only when it is both requested and available
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

# single seed shared by every RNG below; set to None to leave them unseeded
seed = 1234

# seed the PyTorch, NumPy and Python RNGs so that runs are repeatable
if seed is not None:
    print(f'random seed: {seed}')
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

device: cpu
random seed: 1234


In [2]:
def read_dataframe(fn):
    """Read a blank-line-delimited token/label file into a DataFrame.

    Every non-blank line contributes one token: its first whitespace-separated
    field is the word and its second field is the label; any further fields
    are ignored. A blank (or whitespace-only) line ends the current sentence.

    The last sentence is kept even when the file does not end with a blank
    line, and a leading blank line or a run of consecutive blank lines does
    not produce empty rows.

    Args:
        fn: path of the file to read (passed straight to ``open``).

    Returns:
        pd.DataFrame with the columns ``words`` and ``labels``. Each row holds
        the list of words of one sentence and the parallel list of labels.
        An empty file yields an empty frame that still has both columns.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    data = {'words': [], 'labels': []}
    sent_words = []
    sent_labels = []
    with open(fn) as f:
        for line in f:
            # split() with no arguments already discards surrounding whitespace
            tokens = line.split()
            if tokens:
                # read both fields before appending so a malformed line cannot
                # leave the two lists with different lengths
                word, label = tokens[0], tokens[1]
                sent_words.append(word)
                sent_labels.append(label)
            elif sent_words:
                # blank line: close the sentence, but only if it has tokens
                data['words'].append(sent_words)
                data['labels'].append(sent_labels)
                sent_words = []
                sent_labels = []
    # flush the final sentence when the file lacks a trailing blank line
    if sent_words:
        data['words'].append(sent_words)
        data['labels'].append(sent_labels)
    return pd.DataFrame(data)


In [3]:
# parse the token/label file into a DataFrame with one row per sentence
dataframe = read_dataframe("data/conll-ner/train_small.txt")

In [4]:
# show the parsed sentences: each row pairs a list of words with its list of labels
dataframe

Unnamed: 0,words,labels
0,[-DOCSTART-],[O]
1,"[EU, rejects, German, call, to, boycott, Briti...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
2,"[Peter, Blackburn]","[B-PER, I-PER]"
3,"[BRUSSELS, 1996-08-22]","[B-LOC, O]"
4,"[The, European, Commission, said, on, Thursday...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
5,"[Germany, 's, representative, to, the, Europea...","[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."
6,"["", We, do, n't, support, any, such, recommend...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[He, said, further, scientific, study, was, re...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
8,"[He, said, a, proposal, last, month, by, EU, F...","[O, O, O, O, O, O, O, B-ORG, O, O, B-PER, I-PE..."
9,"[Fischler, proposed, EU-wide, measures, after,...","[B-PER, O, B-MISC, O, O, O, O, B-LOC, O, B-LOC..."


In [5]:
from transformers import AutoTokenizer

# name of the pretrained checkpoint whose tokenizer is loaded below
transformer_name = 'xlm-roberta-base'
# instantiate the tokenizer that matches that checkpoint
tokenizer = AutoTokenizer.from_pretrained(transformer_name)