In [6]:
from transformers import RobertaModelWithHeads
from transformers import RobertaTokenizer
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

model = RobertaModelWithHeads.from_pretrained("roberta-base")
model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
model.train_adapter("drop", True)

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Set Adapter and NER head as trainable
model.add_classification_head('ner_head', num_labels=13)
for param in model.heads['ner_head'].parameters():
    param.requires_grad = True

model.to(device)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [7]:
with open('ner_dataset.txt', 'r') as data:
    dataset = data.readlines()

    label_to_ID = {
    "O": 0,
    "B-TITLE": 1,
    "I-TITLE": 2,
    "B-NAME": 3,
    "I-NAME": 4,
    "B-ADDRESS": 5,
    "I-ADDRESS": 6,
    "B-CITY": 7,
    "I-CITY": 8,
    "B-COUNTRY": 9,
    "I-COUNTRY": 10,
    "B-ARITHMETIC": 11,
    "I-ARITHMETIC": 12,
}

In [196]:
from transformers import RobertaTokenizerFast
tokenizerF = RobertaTokenizerFast.from_pretrained('roberta-base')

def labelizeData(dataLine):
    prompt,BOI = dataLine.split('||')

    words = prompt.replace('"', "")
    words = words[:len(words)-2]
    words = words.replace(',', "")
    words = words.split(' ')
    # tokenized = tokenizer(prompt, return_tensors="pt", return_offsets_mapping=True)
    # tokenized_words = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0])
    # print(tokenized_words)


    BOI = BOI.replace('\n',"")
    BOI = BOI.replace('"',"")
    # BOI = BOI.replace('.',"")
    BOI = BOI.strip()
    BOI = BOI.split(',')
    BOI = dict([x.split(':') for x in BOI])
    labels = [label_to_ID[BOI[x]] if x in BOI else 0 for x in words]
    
    idx_label = []
    # for x in zip(words, labels):
        # print(x)
        # idx_label.append(labels)

    # print(idx_label)
    tokenized_list = []
    for i in range(len(words)):
        # tokenized = tokenizer(words[i], return_tensors="pt", return_offsets_mapping=True,padding=True, truncation=True, max_length=120)
        tokenized = tokenizer(words[i], return_tensors="pt",padding="max_length", truncation=True, max_length=120)

        tokenized_words = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0], skip_special_tokens=True)
        
        temp_idx = []
        for j in range(len(tokenized_words)):
            if temp_idx and temp_idx[0] % 2 != 0:
                temp_idx.append(labels[i]+1)
            else:
                temp_idx.append(labels[i])
        
        idx_label += (temp_idx)
        tokenized_list += tokenized['input_ids'].tolist()[0]
        # print(tokenized)
        # print(tokenized_words)
        # print(idx_label)

    # labels += [0] * (97 - len(labels))
    idx_label += [0] * (120 - len(idx_label))
    return tokenized_list, idx_label

res = labelizeData('''My name is Mr. Mekael Wasti, I reside at 22 Baker Street, London." || "Mr.:B-TITLE,Mekael:B-NAME,Wasti:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS''')
# res = labelizeData('''I would like to know the weather in Beijing, China." || "Beijing:B-CITY,China:B-COUNTRY''')
res[0],res[1]

([0,
  2387,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  13650,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [184]:
# prompt = labelizeData('''My name is Mr. Mekael Wasti, I reside at 22 Baker Street, London." || "Mr.:B-TITLE,Mekael:B-NAME,Wasti:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS''')
# sentence = '''Greetings, I'm Sir Thompson and I come from New York City, USA." || "Sir:B-TITLE,Thompson:B-NAME,New:B-ADDRESS,York:I-ADDRESS,City:I-ADDRESS,USA:I-ADDRESS'''
# sentence = '''My name is Mr. Mekael Wasti, I reside at 22 Baker Street, London." || "Mr.:B-TITLE,Mekael:B-NAME,Wasti:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS'''
sentence = '''I would like to know the weather in Beijing, China." || "Beijing:B-CITY,China:B-COUNTRY'''
prompt = labelizeData(sentence)
sentence.split('||')
tokenized = tokenizer(sentence[0], return_tensors="pt",padding="max_length", truncation=True, max_length=120)
# tokenized = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0], skip_special_tokens=True)

# tokenized
# len(tokenized["input_ids"][0].tolist())

prompt_ = tokenizer.convert_ids_to_tokens(prompt[0], skip_special_tokens=True)
for x in zip(prompt_, prompt[1]):
    print(x)

len(tokenized["input_ids"][0].tolist()) == len(prompt[1])


('I', 0)
('would', 0)
('like', 0)
('to', 0)
('know', 0)
('the', 0)
('weather', 0)
('in', 0)
('Be', 7)
('ijing', 8)
('China', 9)


True

In [26]:
tokenized_sentence = []
for x in sentence.split(" "):
    # y = tokenizer(x, return_tensors="pt", return_offsets_mapping=True)
    print(x)
    # z = tokenizer.convert_ids_to_tokens(y["input_ids"][0],skip_special_tokens=True)
    # print(z)

It's
Mekael
Wasti
here,
from
33
Maple
Drive,
Vienna.


Tokenize Dataset

In [219]:
dataset_tokenized = []
labels_ID = []
for line in dataset:

    # print(line)
    prompt = labelizeData(line)
    line = line.split('||')
    tokenized = tokenizer(line[0], return_tensors="pt",padding="max_length", truncation=True, max_length=120)
    dataset_tokenized.append(tokenized["input_ids"][0].tolist())
    labels_ID.append(prompt[1])
    # print(tokenized["input_ids"][0].tolist())
    # print(len(tokenized["input_ids"][0].tolist()))


In [220]:
sentence = '''I would like to know the weather in Beijing, China." || "Beijing:B-CITY,China:B-COUNTRY'''
prompt = labelizeData(sentence)
sentence = sentence.split('||')
tokenized = tokenizer(sentence[0], return_tensors="pt",padding="max_length", truncation=True, max_length=120)

prompt_ = tokenizer.convert_ids_to_tokens(prompt[0], skip_special_tokens=True)
for x in zip(prompt_, prompt[1]):
    print(x)
len(tokenized["input_ids"][0].tolist()) == len(prompt[1])


('I', 0)
('would', 0)
('like', 0)
('to', 0)
('know', 0)
('the', 0)
('weather', 0)
('in', 0)
('Be', 7)
('ijing', 8)
('China', 9)


True

In [227]:
# len(dataset_tokenized)
len(labels_ID)

1579

In [None]:
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

class NERDataset(Dataset):
    def __init__(self, dataset_tokenized, labels_ID, max_length=512):
        self.tokenized_sentences = dataset_tokenized
        self.labels = labels_ID
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        # Padding the sequences and labels manually
        input_ids = self.tokenized_sentences[idx]["input_ids"]
        attention_mask = self.tokenized_sentences[idx]["attention_mask"]
        labels = self.labels[idx]
        
        # Padding manually to max_length
        # input_ids = input_ids + [0] * (self.max_length - len(input_ids))
        # attention_mask = attention_mask + [0] * (self.max_length - len(attention_mask))
        labels = labels + [0] * (self.max_length - len(labels))
        
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }


In [9]:
ner_dataset = NERDataset(dataset_tokenized, labels_ID)
# dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)

NameError: name 'NERDataset' is not defined

In [10]:
from transformers import RobertaForTokenClassification, RobertaTokenizer, AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AdapterConfig
from transformers import RobertaModelWithHeads

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=13).to(device)
# model = RobertaModelWithHeads.from_pretrained("roberta-base",num_labels=13).to(device)

# model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
# model.train_adapter("drop")
# model.add_classification_head("ner_head", num_labels=13, adapter_name="drop")


# Define label mappings
# ... (Your label_to_ID and ID_to_label mappings remain unchanged)

# Prepare dataset and dataloader
# ... (Your NERDataset and DataLoader code remains unchanged)

# Initialize optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = CrossEntropyLoss().to(device)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # set the model to training mode
    
    for batch in tqdm(dataloader):  # assuming dataloader is your DataLoader
        # Move batch tensors to the same device as the model
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].squeeze(1).to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Compute loss
        loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss/len(dataloader)}")


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

NameError: name 'dataloader' is not defined

In [None]:
model.eval()  # set the model to evaluation mode

# Sample input text
text = "John Doe lives in New York."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

# Get the model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Convert predicted token IDs to labels
predicted_labels = [ID_to_label[id.item()] for id in predictions[0]]

print(predicted_labels)
