## Preprocessing data

In [1]:
# loading data 
def read_iob2_file(path):
    """
    read in iob2 file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [2]:
train_data = read_iob2_file("en_ewt-ud-train.iob2")
dev_data = read_iob2_file("en_ewt-ud-dev.iob2")
print(train_data[1])

(['Iguazu', 'Falls'], ['B-LOC', 'I-LOC'])


In [3]:
# formatting the data 
def list2sequence(data): 
    formated_data = [[" ".join(sublist),labels] for sublist, labels in data]
    return formated_data
formatted_train_data = list2sequence(train_data)
print(formatted_train_data[0])

['Where in the world is Iguazu ?', ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']]


Tokenizing the sentences - output of tokenizer: 
* input_ids are the indices corresponding to each token in the sentence.
* attention_mask indicates whether a token should be attended to or not.
* token_type_ids identifies which sequence a token belongs to when there is more than one sequence

In [4]:
from transformers import AutoTokenizer

tokenizer  = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=False)
# Tokenize input text and map tokens to token IDs    
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

train_data_formated = []
for tuple_train in train_data: 
    train_data_formated.append((tokenize_and_preserve_labels(tuple_train[0], tuple_train[1])))
print(train_data_formated[0])

  from .autonotebook import tqdm as notebook_tqdm


(['Where', 'in', 'the', 'world', 'is', 'I', '##gua', '##zu', '?'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O'])


In [5]:
from transformers import BertForTokenClassification
encoded_train = tokenizer(formatted_train_data[0:100], padding=True, truncation=True, return_tensors="pt")
BERT_model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=7)
output = BERT_model(**encoded_train)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
output.logits.shape

torch.Size([100, 97, 7])

In [10]:
from torch import nn
sm = nn.Softmax(dim=2)
prob = sm(output.logits)
prob.shape

torch.Size([100, 97, 7])

In [None]:
from transformers import BertModel
encoded_train_b = tokenizer(formatted_train_data[0:100], padding=True, truncation=True, return_tensors="pt")
BERT_model_b = BertModel.from_pretrained('bert-base-multilingual-cased', num_labels=7)
output_b = BERT_model_b(**encoded_train_b)

In [None]:
output_b.last_hidden_state

torch.Size([100, 97, 768])

In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    sentences = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    tokenized_texts, _ = tokenize_text(sentences, tokenizer)

    encoded_labels = encode_labels(labels, label_map)

    padded_tokenized_texts = pad_sequences(tokenized_texts, max_length)
    padded_encoded_labels = pad_sequences(encoded_labels, max_length)

    return padded_tokenized_texts, padded_encoded_labels

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

for batch in train_dataloader:
  ...

In [None]:
from torch import nn
import torch
torch.manual_seed(0)
BATCH_SIZE = 16
LEARNING_RATE = 0.01
EPOCHS = 5
n_labels = 7 

class NER_Tagger(torch.nn.Module):
    def __init__(self, n_labels):
        super().__init__()
        # TODO
        # bert includes a linear layer 
        #self.bert = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_labels)
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased', num_labels=n_labels)
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, n_labels)
        self.softmax = nn.Softmax(dim = 2)
        
    def forward(self, inputData):
        # TODO
        # bert model output 
        output_bert = self.bert(inputData)
        #logits = output_bert.logits
        logits = self.linear(output_bert)
        # get probabilities 
        probs = self.softmax(logits)
        return probs


    def predict(self, inputData): 
        prediction_output = self.forward(inputData) 
        prediction = torch.argmax(prediction_output, dim = 2)
        return prediction 



model = NER_Tagger(n_labels)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')


for epoch in range(EPOCHS):
    # set model in training mode 
    model.train()
    # reset the gradient
    #model.zero_grad()
    # loop over batches
    counter = 0

    for batch in range(train_batches.shape[0]): #TODO
        predicted_values = model.forward(train_batches[batch])
        flattened_output = predicted_values.view(BATCH_SIZE * max_len, -1)
        flattened_labels = train_label_batches[batch].view(-1).long()
        # calculate loss (and print)
        # print(flattened_output.shape)
        # print(sample_label_batches[batch].shape)
        # print(flattened_labels)
        # print(flattened_labels.long())
        loss = loss_function(flattened_output, flattened_labels)
        #print(loss.item())
        # print some of the losses: 
        if counter % 100==0:
            print(f"loss: {loss:>7f}")
        # update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        counter +=1
        # TODO
        
# set to evaluation mode
model.eval()

NameError: name 'vocab' is not defined

In [None]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Example tokenized data
tokenized_data = [(['John', 'Doe', 'lives', 'in', 'Paris'], ['B-PER', 'I-PER', 'O', 'O', 'B-LOC'])]

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Convert tokenized data to features
input_ids = []
attention_masks = []
labels = []
for tokens, bio_tags in tokenized_data:
    # Convert tokens to token IDs
    encoded_dict = tokenizer.encode_plus(tokens, add_special_tokens=True, max_length=512, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

    # Convert BIO tags to label IDs
    label_map = {'B-PER': 0, 'I-PER': 1, 'O': 2, 'B-LOC':3, 'I-LOC':4, 'B-ORG':5, 'I-ORG': 6}  # Define your label mapping
    label_ids = [label_map[tag] for tag in bio_tags]
    labels.append(label_ids)

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Define the model
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)  # 3 labels: B, I, O

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Create DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=32)

# Training loop
epochs = 3
for _ in range(epochs):
    model.train()
    for batch in dataloader:
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Evaluation (optional)
model.eval()
# Evaluation code goes here

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Expected input batch_size (512) to match target batch_size (5).

In [None]:
from typing import List, Dict
import codecs
import torch
import sys
import myutils
from transformers import AutoModel, AutoTokenizer

# set seed for consistency
torch.manual_seed(8446)
# Set some constants
BERT = 'bert-base-multilingual-cased'
BATCH_SIZE = 8
LEARNING_RATE = 0.00001
EPOCHS = 3
# We have an UNK label for robustness purposes, it makes it easier to run on
# data with other labels, or without labels.
UNK = "[UNK]"
MAX_TRAIN_SENTS=64
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


class ClassModel(torch.nn.Module):
    def __init__(self, nlabels: int, bert: str):
        """
        Model for classification with transformers.

        The architecture of this model is simple, we just have a transformer
        based language model, and add one linear layer to converts it output
        to our prediction.
    
        Parameters
        ----------
        nlabels : int
            Vocabulary size of output space (i.e. number of labels)
        bert : str
            Name of the transformers language model to use, can be found on:
            https://huggingface.co/models
        """
        super().__init__()

        # The transformer model to use
        self.bert = AutoModel.from_pretrained(bert)

        # Find the size of the output of the masked language model
        if hasattr(self.bert.config, 'hidden_size'):
            self.bert_out_size = self.bert.config.hidden_size
        elif hasattr(self.bert.config, 'dim'):
            self.bert_out_size = self.bert.config.dim
        else: # if not found, guess
            self.bert_out_size = 768

        # Create prediction layer
        self.hidden_to_label = torch.nn.Linear(self.bert_out_size, nlabels)

    def forward(self, input: torch.tensor):
        """
        Forward pass
    
        Parameters
        ----------
        input : torch.tensor
            Tensor with wordpiece indices. shape=(batch_size, max_sent_len).

        Returns
        -------
        output_scores : torch.tensor
            ?. shape=(?,?)
        """
        # Run transformer model on input
        bert_out = self.bert(input)

        # Keep only the last layer: shape=(batch_size, max_len, DIM_EMBEDDING)
        bert_out = bert_out.last_hidden_state
        # Keep only the output for the first ([CLS]) token: shape=(batch_size, DIM_EMBEDDING)
        bert_out = bert_out[:,:1,:].squeeze()

        # Matrix multiply to get scores for each label: shape=(?,?)
        output_scores = self.hidden_to_label(bert_out)

        return output_scores

    def run_eval(self, text_batched: List[torch.tensor], labels_batched: List[torch.tensor]):
        """
        Run evaluation: predict and score
    
        Parameters
        ----------
        text_batched : List[torch.tensor]
            list with batches of text, containing wordpiece indices.
        labels_batched : List[torch.tensor]
            list with batches of labels (converted to ints).
        model : torch.nn.module
            The model to use for prediction.
    
        Returns
        -------
        score : float
            accuracy of model on labels_batches given feats_batches
        """
        self.eval()
        match = 0
        total = 0
        for sents, labels in zip(text_batched, labels_batched):
            output_scores = self.forward(sents)
            pred_labels = torch.argmax(output_scores, 1)
            for gold_label, pred_label in zip(labels, pred_labels):
                total += 1
                if gold_label.item() == pred_label.item():
                    match+= 1
        return(match/total)  