In [249]:
from transformers import RobertaModelWithHeads
from transformers import RobertaTokenizer
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

model = RobertaModelWithHeads.from_pretrained("roberta-base")
model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
model.trainable = False
model.train_adapter("AdapterHub/roberta-base-pf-drop", True)

# Set Adpater and NER head as trainable
model.add_classification_head('ner_head', num_labels=13)
model.set_classification_head_trainable('ner_head',True)


tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

ValueError: No adapter with name 'AdapterHub/roberta-base-pf-drop' found. Please make sure that all specified adapters are correctly loaded.

In [234]:
tokens = tokenizer("Hello, I'm Mr. Johnson, and I live at 123 Park Avenue.", return_tensors="pt")
tokenized = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0], skip_special_tokens=True)
tokenized = [token[1:] if token.startswith('Ġ') else token for token in tokenized]
tokenized

['Hello',
 ',',
 'I',
 "'m",
 'Mr',
 '.',
 'Johnson',
 ',',
 'and',
 'I',
 'live',
 'at',
 '123',
 'Park',
 'Avenue',
 '.']

In [235]:
tokenized = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0], skip_special_tokens=True)
tokenized = [token[1:] if token.startswith('Ġ') else token for token in tokenized]
tokenized

BOI = "Mr:B-TITLE,Johnson:B-NAME,123:B-ADDRESS,Park:I-ADDRESS,Avenue:I-ADDRESS"
BOI = BOI.split(',')
BOI = dict([x.split(':') for x in BOI])

labels = []
for ind,x in enumerate(tokenized):
    if x in BOI:
        # print(BOI[x])
        labels.append(BOI[x])
    else:
        labels.append("O")

print(tokenized)
print(labels)

['Hello', ',', 'I', "'m", 'Mr', '.', 'Johnson', ',', 'and', 'I', 'live', 'at', '123', 'Park', 'Avenue', '.']
['O', 'O', 'O', 'O', 'B-TITLE', 'O', 'B-NAME', 'O', 'O', 'O', 'O', 'O', 'B-ADDRESS', 'I-ADDRESS', 'I-ADDRESS', 'O']


In [236]:
label_to_ID = {
    "O": 0,
    "B-TITLE": 1,
    "I-TITLE": 2,
    "B-NAME": 3,
    "I-NAME": 4,
    "B-ADDRESS": 5,
    "I-ADDRESS": 6,
    "B-CITY": 7,
    "I-CITY": 8,
    "B-COUNTRY": 9,
    "I-COUNTRY": 10,
    "B-ARITHMETIC": 11,
    "I-ARITHMETIC": 12,
}

In [237]:
def labelizeData(dataLine):
    prompt,BOI = dataLine.split('||')

    tokenized = tokenizer(prompt, return_tensors="pt", truncation=True, padding='max_length', max_length=512, is_split_into_words=True)
    tokenized_words = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0], skip_special_tokens=True)
    tokenized_words = [token[1:] if token.startswith('Ġ') else token for token in tokenized_words]

    BOI = BOI.replace('\n',"")
    BOI = BOI.split(',')
    BOI = dict([x.split(':') for x in BOI])
    # labels = [BOI[x] if x in BOI else "O" for ind,x in enumerate(tokenized_words)]
    labels = [label_to_ID[BOI[x]] if x in BOI else 0 for x in tokenized_words]
    return tokenized, labels

res = labelizeData('''My name is Mrs. Emily Watson, I reside at 22 Baker Street, London." || "Mrs.:B-TITLE,Emily:B-NAME,Watson:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS
''')
res[0],res[1]

({'input_ids': tensor([[    0,  2387,   766,    16,  3801,     4,  7770,  5399,     6,    38,
          23773,    23,   820,  5643,   852,     6,   928,    72,  1437,     2,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,   

AttributeError: 'RobertaModelWithHeads' object has no attribute 'set_adapter_trainable'

In [204]:
with open('ner_dataset.txt', 'r') as data:
    dataset = data.readlines()

In [209]:
dataset_tokenized = []
labels_ID = []
for line in dataset:
    # print(line)
    dataset_tokenized.append(labelizeData(line)[0])
    labels_ID.append(labelizeData(line)[1])


In [221]:
dataset_tokenized
# labels_ID

[{'input_ids': tensor([[    0,  2387,   766,    16,  3801,     4,  7770,  5399,     6,    38,
          23773,    23,   820,  5643,   852,     6,   928,    72,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[    0,   243,    18,  1745,  7393,  7431,    31, 28259,  2930,  5503,
              6,  2201,    72,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[    0, 46486,   162,    25,  4645, 10670,  8153,     6,     8,    38,
          18742,    31,  3622,     6,  2809,    72,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[   0,  100,  524,  427,    4,  957, 1259,    4, 1308, 1100,   16,  379,
          6482, 3936,    6,  188,  469,   72, 1437,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': ten

In [225]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, dataset_tokenized, labels_ID):
        self.tokenized_sentences = dataset_tokenized
        self.labels = labels_ID

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_sentences[idx]["input_ids"],
            "attention_mask": self.tokenized_sentences[idx]["attention_mask"],
            "labels": self.labels[idx]
        }


In [227]:
ner_dataset = NERDataset(dataset_tokenized, labels_ID)
dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True)

In [228]:
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
model.train()

epochs = 5
for epoch in range(epochs):
    for batch in dataloader:
        inputs = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(inputs, attention_mask=attention_mask, labels=labels, task_name='ner_head')
        loss = outputs[0]
        loss.backward()

        optimizer.step()

RuntimeError: stack expects each tensor to be equal size, but got [1, 18] at entry 0 and [1, 14] at entry 1