In [None]:
from transformers import RobertaModelWithHeads
from transformers import RobertaTokenizer
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

model = RobertaModelWithHeads.from_pretrained("roberta-base")
model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
model.train_adapter("drop", True)

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Set Adapter and NER head as trainable
model.add_classification_head('ner_head', num_labels=13)
for param in model.heads['ner_head'].parameters():
    param.requires_grad = True

model.to(device)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [375]:
input_text = ["What we feeling outside"]
# Tokenize input
# encoding = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
encoding = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=97)
inputs = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

# Forward pass
outputs = model(input_ids=inputs, attention_mask=attention_mask, output_attentions=False)
logits = outputs.logits
print(logits)


tensor([[-0.2721, -0.3810,  0.0575,  0.0949, -0.0121, -0.2263,  0.2611,  0.0819,
          0.0600, -0.1109,  0.3534,  0.0623, -0.0934]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [None]:
label_to_ID = {
    "O": 0,
    "B-TITLE": 1,
    "I-TITLE": 2,
    "B-NAME": 3,
    "I-NAME": 4,
    "B-ADDRESS": 5,
    "I-ADDRESS": 6,
    "B-CITY": 7,
    "I-CITY": 8,
    "B-COUNTRY": 9,
    "I-COUNTRY": 10,
    "B-ARITHMETIC": 11,
    "I-ARITHMETIC": 12,
}

In [464]:
from transformers import RobertaTokenizerFast
tokenizerF = RobertaTokenizerFast.from_pretrained('roberta-base')

def labelizeData(dataLine):
    prompt,BOI = dataLine.split('||')

    # tokenized = tokenizer(prompt, return_tensors="pt", truncation=True, padding='max_length', max_length=256, is_split_into_words=True)
    # tokenized = tokenizerF(prompt, return_tensors="pt", truncation=True, padding='max_length', max_length=97)
    tokenized = tokenizer(prompt, return_tensors="pt", return_offsets_mapping=True)
    tokenized_words = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0])
    # tokenized_words = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0], skip_special_tokens=True)
    # tokenized_words = [token[1:] if token.startswith('Ġ') else token for token in tokenized_words]
    print(tokenized_words)
    BOI = BOI.replace('\n',"")
    BOI = BOI.replace('"',"")
    BOI = BOI.replace('.',"")
    BOI = BOI.strip()
    BOI = BOI.split(',')
    BOI = dict([x.split(':') for x in BOI])
    # labels = [BOI[x] if x in BOI else "O" for ind,x in enumerate(tokenized_words)]
    labels = [label_to_ID[BOI[x]] if x in BOI else 0 for x in tokenized_words]
    labels += [0] * (97 - len(labels))
    return tokenized, labels

res = labelizeData('''My happily name is Mrs. Emily Watson, I reside at 22 Baker Street, London." || "Mrs.:B-TITLE,Emily:B-NAME,Watson:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS''')
res[0],res[1]

['<s>', 'My', 'Ġhappily', 'Ġname', 'Ġis', 'ĠMrs', '.', 'ĠEmily', 'ĠWatson', ',', 'ĠI', 'Ġreside', 'Ġat', 'Ġ22', 'ĠBaker', 'ĠStreet', ',', 'ĠLondon', '."', 'Ġ', '</s>']


({'input_ids': tensor([[    0,  2387, 16534,   766,    16,  3801,     4,  7770,  5399,     6,
             38, 23773,    23,   820,  5643,   852,     6,   928,    72,  1437,
              2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'offset_mapping': tensor([[[ 0,  0],
          [ 0,  2],
          [ 3, 10],
          [11, 15],
          [16, 18],
          [19, 22],
          [22, 23],
          [24, 29],
          [30, 36],
          [36, 37],
          [38, 39],
          [40, 46],
          [47, 49],
          [50, 52],
          [53, 58],
          [59, 65],
          [65, 66],
          [67, 73],
          [73, 75],
          [76, 76],
          [ 0,  0]]])},
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [465]:
from transformers import RobertaTokenizerFast

# Initialize the fast tokenizer
tokenizerF = RobertaTokenizerFast.from_pretrained('roberta-base')

# Your sentence
sentence = "My Mekael happily name is Mrs. Emily Watson, I reside at 22 Baker Street, London."

# Tokenize the text
tokenized_input = tokenizerF(sentence)  # Do not use is_split_into_words if `sentence` is a single string
tokens = tokenizerF.convert_ids_to_tokens(tokenized_input["input_ids"], skip_special_tokens=False)
tokens

['<s>',
 'My',
 'ĠMek',
 'ael',
 'Ġhappily',
 'Ġname',
 'Ġis',
 'ĠMrs',
 '.',
 'ĠEmily',
 'ĠWatson',
 ',',
 'ĠI',
 'Ġreside',
 'Ġat',
 'Ġ22',
 'ĠBaker',
 'ĠStreet',
 ',',
 'ĠLondon',
 '.',
 '</s>']

In [511]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

sentence = "Madame Charlotte Wilson here, from 33 Maple Drive, Vienna."

# Example sentence
sentence = "Madame Charlotte Wilson here, from 33 Maple Drive, Vienna."

# Tokenize the text and get word IDs (word_ids will indicate which word each token corresponds to)
tokenized_input = tokenizer(sentence)

# Print out the tokens and corresponding word IDs for examination
for token, word_id in zip(tokenized_input.tokens(), tokenized_input.word_ids()):
    print(f"{token}: {word_id}")

# From the output, you'll be able to see which words are split into subtokens
# and which punctuation marks are treated as separate tokens.

# Let's assume that upon examination, you find that "Madame" is split and "," is treated as separate.
# Adjust the word-level labels accordingly. If a word is split into subtokens, each subtoken should
# have the same label as the original word. Punctuation typically receives an "O" label.

# Revised word-level labels with additional "O" for the punctuation
word_labels = ["B-PER", "B-PER", "I-PER", "I-PER", "O", "O", "O", "O", "B-ADD", "I-ADD", "I-ADD", "I-ADD", "O"]


# Check if the number of tokens that correspond to words matches the number of labels
word_ids = tokenized_input.word_ids()
actual_word_tokens = [word_id for word_id in word_ids if word_id is not None]

if len(actual_word_tokens) != len(word_labels):
    print(f"Number of word tokens: {len(actual_word_tokens)}")
    print(f"Number of labels: {len(word_labels)}")
    raise ValueError("The number of word-level labels does not match the number of word tokens.")

# Label map to convert labels to numerical IDs
label_map = {"O": 0, "B-PER": 1, "I-PER": 2, "B-ADD": 3, "I-ADD": 4}

# Convert word-level labels to numerical labels
numerical_labels = [label_map[label] for label in word_labels]

# Initialize a list to keep our aligned labels
aligned_labels = []

# Iterate over the tokens and their corresponding word IDs
for word_id in word_ids:
    if word_id is None:
        aligned_labels.append(-100)
    else:
        aligned_labels.append(numerical_labels[word_id])

# Combine the tokens and their labels
token_label_pairs = [(token, label) for token, label in zip(tokens, aligned_labels)]

# Print the tokens and their labels
for token, label in token_label_pairs:
    print(f"{token}: {label}")


<s>: None
Mad: 0
ame: 0
ĠCharlotte: 1
ĠWilson: 2
Ġhere: 3
,: 4
Ġfrom: 5
Ġ33: 6
ĠMaple: 7
ĠDrive: 8
,: 9
ĠVienna: 10
.: 11
</s>: None
<s>: -100
Mad: 1
ame: 1
ĠCharlotte: 1
ĠWilson: 2
Ġhere: 2
,: 0
Ġfrom: 0
Ġ33: 0
ĠMaple: 0
ĠDrive: 3
,: 4
ĠVienna: 4
.: 4
</s>: -100


In [512]:
# Tokenize the text and get the word IDs
tokenized_input = tokenizer(sentence, is_split_into_words=False, return_tensors="pt", return_offsets_mapping=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])

# Combine the tokens and their labels
token_label_pairs = [(token, label) for token, label in zip(tokens, aligned_labels)]

# Print them out
for pair in token_label_pairs:
    print(pair)

('<s>', -100)
('Mad', 1)
('ame', 1)
('ĠCharlotte', 1)
('ĠWilson', 2)
('Ġhere', 2)
(',', 0)
('Ġfrom', 0)
('Ġ33', 0)
('ĠMaple', 0)
('ĠDrive', 3)
(',', 4)
('ĠVienna', 4)
('.', 4)
('</s>', -100)


In [398]:
with open('ner_dataset.txt', 'r') as data:
    dataset = data.readlines()

In [461]:
dataset_tokenized = []
labels_ID = []
for line in dataset:

    x = labelizeData(line)[0]
    dataset_tokenized.append(x)
    # print(x.word_ids())
    word_labels = labelizeData(line)[1]
    aligned_labels = align_labels_with_tokens(word_labels, x.word_ids())
    # print(x)
    print(aligned_labels)
    labels_ID.append(aligned_labels)



['<s>', 'My', 'Ġname', 'Ġis', 'ĠMrs', '.', 'ĠEmily', 'ĠWatson', ',', 'ĠI', 'Ġreside', 'Ġat', 'Ġ22', 'ĠBaker', 'ĠStreet', ',', 'ĠLondon', '."', 'Ġ', '</s>']
['<s>', 'My', 'Ġname', 'Ġis', 'ĠMrs', '.', 'ĠEmily', 'ĠWatson', ',', 'ĠI', 'Ġreside', 'Ġat', 'Ġ22', 'ĠBaker', 'ĠStreet', ',', 'ĠLondon', '."', 'Ġ', '</s>']


TypeError: align_labels_with_tokens() missing 1 required positional argument: 'label_map'

In [None]:
xL = labels_ID

In [400]:
len(dataset_tokenized)
# len(labels_ID)

1579

In [463]:
# dataset_tokenized
# xL = [x + [0] * (97 - len(x)) for x in xL]
# xL
# labels_ID = xL

for ind,x in enumerate(dataset_tokenized):
    y = tokenizer.convert_ids_to_tokens(x["input_ids"][0])
    for i in zip(y, labels_ID[ind]):
        print(f'({i[0]}:{i[1]})', end=",")
    print()

IndexError: list index out of range

In [None]:
dataset_tokenized_1 = dataset_tokenized[980:981]

for ind,x in enumerate(dataset_tokenized_1):
    toke = tokenizer.convert_ids_to_tokens(x['input_ids'][0], skip_special_tokens=True)
    print(toke)
    print(labels_ID[ind])
    for t in range(len(toke)):
        print(f'{labels_ID[ind][t]}:,{toke[t]}', end=" ")
    print()

In [None]:
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

class NERDataset(Dataset):
    def __init__(self, dataset_tokenized, labels_ID, max_length=512):
        self.tokenized_sentences = dataset_tokenized
        self.labels = labels_ID
        self.max_length = max_length

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        # Padding the sequences and labels manually
        input_ids = self.tokenized_sentences[idx]["input_ids"]
        attention_mask = self.tokenized_sentences[idx]["attention_mask"]
        labels = self.labels[idx]
        
        # Padding manually to max_length
        # input_ids = input_ids + [0] * (self.max_length - len(input_ids))
        # attention_mask = attention_mask + [0] * (self.max_length - len(attention_mask))
        labels = labels + [0] * (self.max_length - len(labels))
        
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }


In [None]:
x = torch.tensor([0] * 512)
x = x[None,:]
x.shape

In [None]:
tokenizer.convert_ids_to_tokens

In [None]:
ner_dataset = NERDataset(dataset_tokenized, labels_ID)
# dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)

In [None]:
steps = 0
loss_fn = torch.nn.CrossEntropyLoss()
model.to(device)

for batch in dataloader:
        if steps > 1:
                break
        inputs = batch["input_ids"].squeeze(1)
        attention_mask = batch["attention_mask"].squeeze(1)
        labels = batch["labels"]

        batch_size, _ = labels.shape
        labels = labels.view(batch_size, -1)[:, 0]


        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)


        # optimizer.zero_grad()

        outputs = model(inputs, attention_mask=attention_mask, task_name='ner_head', device=device)
        
        # print(tokenizer.convert_ids_to_tokens(outputs[0][0]))
        logits = outputs.logits
        print(logits.shape)
        # predicted_label_ids = torch.argmax(logits, dim=1)
        # print(predicted_label_ids)



        # probabilities = torch.nn.functional.softmax(logits, dim=-1)
        # Print probabilities of the first 5 tokens for inspection
        # print(probabilities[:5])


        # Calculate the loss
        # print(logits.shape)
        # print(labels.shape)
        # loss = loss_fn(logits, labels)

        # loss.backward()

        # optimizer.step()
        steps += 1

In [None]:
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
model.train()

epochs = 5
steps = 0

for epoch in range(epochs):
    for batch in dataloader:
        inputs = batch["input_ids"].squeeze(1)
        attention_mask = batch["attention_mask"].squeeze(1)
        labels = batch["labels"]
        labels=labels.view(32, 512)[:,0]

        inputs.to(device)
        attention_mask.to(device)
        labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs, attention_mask=attention_mask, task_name='ner_head')
        print(outputs[0].shape)
        logits = outputs[0].view(-1, 13)


        # Calculate the loss
        loss_fn = torch.nn.CrossEntropyLoss()
        print(logits.shape)
        print(labels.shape)
        loss = loss_fn(logits, labels)

        loss.backward()

        optimizer.step()
        steps += 1

        if steps % 300 == 0:
            print(f'Step {steps}, Loss: {loss.item()}')

In [None]:
from tqdm import tqdm  # for a progress bar
from torch.nn import CrossEntropyLoss
from transformers import AdamW

device = "cuda" if torch.cuda.is_available() else "cpu"

model = RobertaModelWithHeads.from_pretrained("roberta-base")
model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
model.train_adapter("drop", True)
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

ID_to_label = {
    0: "O",
    1: "B-TITLE",
    2: "I-TITLE",
    3: "B-NAME",
    4: "I-NAME",
    5: "B-ADDRESS",
    6: "I-ADDRESS",
    7: "B-CITY",
    8: "I-CITY",
    9: "B-COUNTRY",
    10: "I-COUNTRY",
    11: "B-ARITHMETIC",
    12: "I-ARITHMETIC"
}

label_to_ID = {
    "O": 0,
    "B-TITLE": 1,
    "I-TITLE": 2,
    "B-NAME": 3,
    "I-NAME": 4,
    "B-ADDRESS": 5,
    "I-ADDRESS": 6,
    "B-CITY": 7,
    "I-CITY": 8,
    "B-COUNTRY": 9,
    "I-COUNTRY": 10,
    "B-ARITHMETIC": 11,
    "I-ARITHMETIC": 12,
}

ner_dataset = NERDataset(dataset_tokenized, labels_ID)
# dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)

# Set Adapter and NER head as trainable
model.add_classification_head("ner_head", num_labels=13)

for param in model.heads['ner_head'].parameters():
    param.requires_grad = True

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

num_epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = CrossEntropyLoss().to(device)
model = model.to(device)

for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # set the model to training mode
    
    for batch in tqdm(dataloader):  # assuming dataloader is your DataLoader
        # Move batch tensors to the same device as the model
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        print(input_ids.shape)  # Expected: [32, 512]
        print(attention_mask.shape)  # Expected: [32, 512]
        print(labels.shape)  # Expected: [32, 512]
        print(logits.shape)  # Expected: [32, 512]


        # Reshape labels and logits for loss function
        loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))
        print(logits.shape)  # should print [32, 512, 13]

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss/len(dataloader)}")


In [None]:
from transformers import RobertaForTokenClassification, RobertaTokenizer, AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AdapterConfig
from transformers import RobertaModelWithHeads

# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=13).to(device)
# model = RobertaModelWithHeads.from_pretrained("roberta-base",num_labels=13).to(device)

# model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
# model.train_adapter("drop")
# model.add_classification_head("ner_head", num_labels=13, adapter_name="drop")


# Define label mappings
# ... (Your label_to_ID and ID_to_label mappings remain unchanged)

# Prepare dataset and dataloader
# ... (Your NERDataset and DataLoader code remains unchanged)

# Initialize optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = CrossEntropyLoss().to(device)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # set the model to training mode
    
    for batch in tqdm(dataloader):  # assuming dataloader is your DataLoader
        # Move batch tensors to the same device as the model
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].squeeze(1).to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Compute loss
        loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss/len(dataloader)}")


In [None]:
model.eval()  # set the model to evaluation mode

# Sample input text
text = "John Doe lives in New York."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

# Get the model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Convert predicted token IDs to labels
predicted_labels = [ID_to_label[id.item()] for id in predictions[0]]

print(predicted_labels)
