In [1]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]
!pip install adapter-transformers
!pip install accelerate>=0.20.1

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
Co

In [118]:
from transformers import RobertaModelWithHeads
from transformers import RobertaTokenizer, AutoTokenizer
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base RoBERTa model with flexible heads, plus the "drop" adapter from the hub.
model = RobertaModelWithHeads.from_pretrained("roberta-base")
model.load_adapter("AdapterHub/roberta-base-pf-drop", source="hf")
model.train_adapter("drop", True)

# Freeze all parameters.
# NOTE(review): this loop runs after train_adapter(), so it also switches off
# requires_grad on whatever train_adapter() had left trainable.
for param in model.parameters():
    param.requires_grad = False

# Add a 13-class head (13 = number of entries in label_to_ID) and re-enable
# gradients for the head only; the adapter weights stay frozen by the loop above.
model.add_classification_head('ner_head', num_labels=13)
for param in model.heads['ner_head'].parameters():
    param.requires_grad = True

model.to(device)
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# NOTE(review): the tokenizer comes from a different checkpoint than the model
# loaded above ("roberta-base") — confirm that the two vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("51la5/roberta-large-NER")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [119]:
# Read the raw annotated lines. Later cells split each line on "||" into the
# sentence part and the "word:TAG,..." annotation part.
# An explicit encoding keeps the read independent of the platform default.
with open('ner_dataset.txt', 'r', encoding='utf-8') as data:
    dataset = data.readlines()

# Tag name -> integer class id ("O" plus a B-/I- pair for six entity types).
# Every B- tag has an odd id and its I- tag is the next id; labelizeData
# depends on that layout when it labels continuation sub-word pieces.
label_to_ID = {
    "O": 0,
    "B-TITLE": 1,
    "I-TITLE": 2,
    "B-NAME": 3,
    "I-NAME": 4,
    "B-ADDRESS": 5,
    "I-ADDRESS": 6,
    "B-CITY": 7,
    "I-CITY": 8,
    "B-COUNTRY": 9,
    "I-COUNTRY": 10,
    "B-ARITHMETIC": 11,
    "I-ARITHMETIC": 12,
}

# Class id -> tag name. A tag's id is its position in this sequence, which
# mirrors the ids assigned in label_to_ID ("O" first, then B-/I- pairs).
ID_to_label = dict(enumerate((
    "O",
    "B-TITLE", "I-TITLE",
    "B-NAME", "I-NAME",
    "B-ADDRESS", "I-ADDRESS",
    "B-CITY", "I-CITY",
    "B-COUNTRY", "I-COUNTRY",
    "B-ARITHMETIC", "I-ARITHMETIC",
)))

In [131]:
from transformers import RobertaTokenizerFast
# tokenizerF = RobertaTokenizerFast.from_pretrained('roberta-base')
# NOTE(review): `tokenizerF` is not referenced by any other cell shown in this
# notebook; labelizeData below uses the global `tokenizer` instead. Confirm
# whether this load is still needed.
tokenizerF = AutoTokenizer.from_pretrained('roberta-base')

def labelizeData(dataLine, max_length=100):
    """Turn one annotated dataset line into sub-word token ids and label ids.

    A line has the form ``<sentence>." || "<word>:<TAG>,<word>:<TAG>,...``.
    The sentence is split on single spaces after removing double quotes, its
    last two characters and all commas. Every word is tokenized on its own
    with the module-level ``tokenizer``. The first piece of a word receives
    the word's label; further pieces receive the matching I- label when the
    word's label is a B- tag (odd id in ``label_to_ID``), otherwise the same
    label. Words without an annotation are labelled 0 ("O").

    Args:
        dataLine: Raw line from the dataset file.
        max_length: Length the label list is truncated/padded to. Defaults to
            100, the value the other cells pass to the tokenizer.

    Returns:
        ``(token_ids, label_ids)``. ``token_ids`` holds the sub-word ids of
        all words, without special or padding ids (decoding them with
        ``skip_special_tokens=True`` yields the same tokens as before).
        ``label_ids`` holds one label per sub-word piece and is always exactly
        ``max_length`` long, padded with 0.

    Raises:
        ValueError: If the line does not contain exactly one ``||`` separator
            or an annotation entry is not of the form ``word:TAG``.
        KeyError: If a tag is not a key of ``label_to_ID``.

    NOTE(review): the labels have no slot for the special token that a
    sentence-level encoding places first (the ``input_ids`` printed further
    down start with id 0), and commas removed here may still be tokens in
    that encoding. Confirm the label positions line up with the ``input_ids``
    used for training.
    """
    parts = dataLine.split('||')
    if len(parts) != 2:
        raise ValueError(
            f"Expected exactly one '||' separator, found {len(parts) - 1}: {dataLine!r}"
        )
    prompt, annotation = parts

    # Strip quotes, then the two trailing characters (the '. ' left in front of
    # the separator in the dataset format), then commas.
    sentence = prompt.replace('"', "")
    sentence = sentence[:len(sentence) - 2]
    words = sentence.replace(',', "").split(' ')

    # Parse "word:TAG,word:TAG". Blank entries are skipped so that a line
    # without any entity yields all-"O" labels instead of raising.
    annotation = annotation.replace('\n', "").replace('"', "").strip()
    word_to_tag = {}
    for entry in annotation.split(','):
        entry = entry.strip()
        if not entry:
            continue
        # Split on the last colon so a word that itself contains ':' survives.
        key, separator, tag = entry.rpartition(':')
        if not separator:
            raise ValueError(f"Annotation {entry!r} is not of the form word:TAG")
        word_to_tag[key.strip()] = tag.strip()

    word_labels = [label_to_ID[word_to_tag[w]] if w in word_to_tag else 0 for w in words]

    # Ids that decoding with skip_special_tokens=True would drop; they get no
    # label, exactly as before.
    special_ids = set(tokenizer.all_special_ids)

    token_ids = []
    label_ids = []
    for word, word_label in zip(words, word_labels):
        # No padding / special tokens per word: only the real pieces matter.
        encoded = tokenizer(word, add_special_tokens=False)["input_ids"]
        pieces = [token_id for token_id in encoded if token_id not in special_ids]
        if not pieces:
            continue
        # B- tags are odd and the matching I- tag is the next id.
        continuation = word_label + 1 if word_label % 2 != 0 else word_label
        label_ids.extend([word_label] + [continuation] * (len(pieces) - 1))
        token_ids.extend(pieces)

    # Always return exactly max_length labels: truncate overflow, pad with "O".
    label_ids = label_ids[:max_length]
    label_ids += [0] * (max_length - len(label_ids))
    return token_ids, label_ids

# Smoke-run the parser on one hand-written line. The result is kept in `res`
# but not displayed (the inspection line at the bottom is commented out).
res = labelizeData('''My name is Mr. Mekael Wasti, I reside at 22 Baker Street, London." || "Mr.:B-TITLE,Mekael:B-NAME,Wasti:I-NAME,22:B-ADDRESS,Baker:I-ADDRESS,Street:I-ADDRESS,London:I-ADDRESS''')
# res = labelizeData('''I would like to know the weather in Beijing, China." || "Beijing:B-CITY,China:B-COUNTRY''')
# res[0],res[1]

Tokenize Dataset

In [132]:
# Build two parallel lists: the padded sentence encodings and the per-token
# label ids produced by labelizeData.
dataset_tokenized, labels_ID = [], []

for line in dataset:
    # labelizeData returns (token_ids, label_ids); only the labels are kept.
    _token_ids, label_ids = labelizeData(line)

    # The sentence is everything in front of the "||" separator.
    sentence_text = line.split('||')[0]
    encoding = tokenizer(
        sentence_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=100,
    )

    dataset_tokenized.append(encoding)
    labels_ID.append(label_ids)


In [133]:
# Show the input ids of the first encoded sentence. It was encoded with
# padding="max_length" and max_length=100, so the row is 100 ids long; the
# trailing run of id 1 in the output is presumably the padding id.
print(dataset_tokenized[0]['input_ids'])

tensor([[     0,   2646,   9351,     83, 108084,      5, 140457, 149993,      4,
             87, 157176,     99,   1039, 133840,  15130,      4,   9020,   1242,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1]])


In [134]:
# Spot-check labelizeData on a single hand-written example.
sentence = '''I would like to know the weather in Beijing, China." || "Beijing:B-CITY,China:B-COUNTRY'''
prompt = labelizeData(sentence)
# `sentence` is rebound from the raw string to its ['sentence', 'annotation'] parts.
sentence = sentence.split('||')
tokenized = tokenizer(sentence[0], return_tensors="pt",padding="max_length", truncation=True, max_length=100)

# Decode the per-word token ids without special tokens and print each token next
# to its label; zip stops at the shorter sequence, so padded labels are not shown.
prompt_ = tokenizer.convert_ids_to_tokens(prompt[0], skip_special_tokens=True)
for x in zip(prompt_, prompt[1]):
    print(x)
# NOTE(review): both sides are padded to 100 entries, so this only confirms the
# two lengths agree; it does not verify that label i belongs to token i.
len(tokenized["input_ids"][0].tolist()) == len(prompt[1])


('▁I', 0)
('▁would', 0)
('▁like', 0)
('▁to', 0)
('▁know', 0)
('▁the', 0)
('▁weather', 0)
('▁in', 0)
('▁Beijing', 7)
('▁China', 9)


True

In [135]:
# len(dataset_tokenized)
# Only the final expression of a cell is echoed, so the length computed on the
# next line is discarded; the cell shows the label ids of the second example.
len(labels_ID)
labels_ID[1]

[0,
 0,
 0,
 1,
 3,
 4,
 4,
 0,
 5,
 6,
 6,
 6,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [136]:
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

class NERDataset(Dataset):
    """Pairs pre-tokenized sentences with their per-token label ids.

    Args:
        dataset_tokenized: Sequence of mappings that each provide
            ``"input_ids"`` and ``"attention_mask"`` (tensors or lists).
        labels_ID: Sequence of label-id lists, parallel to ``dataset_tokenized``.
        max_length: Length every label list is truncated/padded to (pad value 0).
    """

    def __init__(self, dataset_tokenized, labels_ID, max_length=100):
        self.tokenized_sentences = dataset_tokenized
        self.labels = labels_ID
        self.max_length = max_length

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.tokenized_sentences)

    @staticmethod
    def _as_long_tensor(values):
        """Return an int64 tensor copy of ``values``.

        Existing tensors are copied with ``detach().clone()``; passing them to
        ``torch.tensor()`` emits a copy-construct UserWarning on every item.
        """
        if isinstance(values, torch.Tensor):
            return values.detach().clone().to(torch.long)
        return torch.tensor(values, dtype=torch.long)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        ``input_ids`` and ``attention_mask`` keep the shape they were stored
        with; ``labels`` is a 1-D tensor of exactly ``max_length`` entries.
        """
        encoding = self.tokenized_sentences[idx]

        # Truncate first so an overlong label list cannot end up longer than
        # max_length, then pad the remainder with 0.
        labels = list(self.labels[idx])[: self.max_length]
        labels = labels + [0] * (self.max_length - len(labels))

        return {
            "input_ids": self._as_long_tensor(encoding["input_ids"]),
            "attention_mask": self._as_long_tensor(encoding["attention_mask"]),
            "labels": torch.tensor(labels, dtype=torch.long),
        }


In [137]:
# Wrap the encodings and label lists produced by the tokenization cell.
ner_dataset = NERDataset(dataset_tokenized, labels_ID)
# dataloader = DataLoader(ner_dataset, batch_size=32, shuffle=True)

# Batches of 8 examples, collated through the tokenizer-aware padding collator.
# The printed batch below shows `input_ids` nested three levels deep; the
# training loop removes that extra dimension with squeeze(1).
# NOTE(review): shuffle=True and no random seed is set in the cells shown, so
# the batch order presumably differs from run to run.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(ner_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)

In [138]:
# Peek at a single collated batch only; printing every batch dumps the fully
# padded tensors of the whole dataset into the cell output.
for batch in dataloader:
  print(batch)
  break

  "input_ids": torch.tensor(input_ids, dtype=torch.long),
  "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[[     0,  30607, 101637,   9464,    390,   3217,     58,      2,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1]],

        [[     0,  30607,  28350,    163,   1672,    

In [140]:
from transformers import RobertaForTokenClassification, RobertaTokenizer, AdamW, AutoModelForTokenClassification
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AdapterConfig
from transformers import RobertaModelWithHeads

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("51la5/roberta-large-NER")
model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-large-finetuned-conll03-english',
    num_labels=13,  # one output per entry of label_to_ID
    ignore_mismatched_sizes=True  # the checkpoint's head has a different size, so it is re-initialised
).to(device)

# Initialize optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
# CrossEntropyLoss skips every target equal to its ignore_index (-100 by default).
loss_function = CrossEntropyLoss().to(device)

# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # set the model to training mode

    for batch in tqdm(dataloader):
        # Move batch tensors to the same device as the model. squeeze(1) drops
        # the singleton dimension the stored encodings carry.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)
        labels = batch['labels'].squeeze(1).to(device)

        # Padding positions carry label 0 ("O"). Left in the loss they far
        # outnumber the real tokens and push the model towards predicting "O"
        # everywhere, so replace them with the ignored target value.
        labels = labels.masked_fill(attention_mask == 0, loss_function.ignore_index)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Token-level loss: flatten (batch, seq, classes) against (batch, seq)
        loss = loss_function(logits.view(-1, logits.shape[-1]), labels.view(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} | Loss: {total_loss/len(dataloader)}")


Downloading (…)lve/main/config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large-finetuned-conll03-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([8, 1024]) in the checkpoint and torch.Size([13, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([8]) in the checkpoint and torch.Size([13]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  "input_ids": torch.tensor(input_ids, dtype=torch.long),
  "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
 11%|█         | 21/198 [04:39<39:14, 13.30s/it]


KeyboardInterrupt: 

In [129]:
from transformers import RobertaForTokenClassification, RobertaTokenizer, AdamW

# Replace `model` with a RoBERTa-base token classifier carrying a 13-class head.
# NOTE(review): the module printed below has a 50265-entry word embedding,
# while token ids printed by earlier cells are far larger (e.g. 157176).
# Confirm which tokenizer is meant to be paired with this model.
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=13)
model = model.to(device)
# Echo the module structure as the cell output.
model

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

RobertaForTokenClassification(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (key): Linear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict()
              )
              (value): L

In [42]:
# Restore previously saved weights into the current `model`, mapping every
# tensor to the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
# NOTE(review): the save cell at the end of the notebook writes to a Google
# Drive folder, while this reads 'model_weights.pth' from the working
# directory. Confirm the file is expected there.
model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [141]:
model.eval()  # evaluation mode for inference

# Sentence to tag; the commented lines are alternative probes.
text = "I'm Mr. Mekael Wasti and I live in Whitby Ontario"
# text = "John Doe lives in New York."
# text = "I live in New York and my name is Mr. John Doe."
# text = "Can current weather in Madrid, Spain?"
# text = "What we feeling in Brazil?"
# text = "Give me the sum of 40 and 60."
# text = "I'm Mr. Mekael Wasti and I live at 32 Cornation Street"
# text = "It's Mrs. Margaret Price. Address is 100 Birch Lane, Budapest."

# Encode exactly like the training data: padded/truncated to 100 positions.
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=100,
).to(device)
tokenized_words = tokenizer.convert_ids_to_tokens(
    inputs["input_ids"][0], skip_special_tokens=True
)

# Forward pass without gradient tracking, then the best class per position.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
predictions = logits.argmax(dim=-1)

# Map class ids to tag names for all 100 positions of the single example.
predicted_labels = [ID_to_label[label_id] for label_id in predictions[0].tolist()]

print(predicted_labels)

# NOTE(review): `tokenized_words` omits special tokens while `predicted_labels`
# covers every position, so the i-th real token is paired with the prediction
# at position i. labelizeData builds the training labels with the same layout.
# Confirm this pairing is intended.
for pair in zip(tokenized_words, predicted_labels):
  print(pair)


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
('▁I', 'O')
("'", 'O')
('m', 'O')
('▁Mr', 'O')
('.', 'O')
('▁Me', 'O')
('ka', 'O')
('el', 'O')
('▁Was', 'O')
('ti', 'O')
('▁and', 'O')
('▁I', 'O')
('▁live', 'O')
('▁in', 'O')
('▁Whit', 'O')
('by', 'O')
('▁Ontario', 'O')


In [None]:
import os

from google.colab import drive

# Mount Google Drive so the checkpoint outlives the Colab runtime.
drive.mount('/content/gdrive')
# path = "/content/drive/My Drive/My_Models/gpt1"
path = '/content/gdrive/My Drive/thesis-roberta'
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)

# Or for PyTorch, save just the state_dict:
torch.save(model.state_dict(), f'{path}/model_weights.pth')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
