In [2]:
import torch

### Loading the data + preprocessing

In [3]:
# loading data 
def read_iob2_file(path):
    """
    read in iob2 file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [4]:
train_data = read_iob2_file("en_ewt-ud-train.iob2")
dev_data = read_iob2_file("en_ewt-ud-dev.iob2")
print(train_data[1])

(['Iguazu', 'Falls'], ['B-LOC', 'I-LOC'])


Train data 

In [5]:
# formatting the data 
def list2sequence(data): 
    formated_data = [" ".join(sublist) for sublist, labels in data]
    #formated_data = [[" ".join(sublist), labels] for sublist, labels in data]
    formated_labels = [labels for sublist, labels in data]
    return formated_data, formated_labels
formatted_train_data, formated_train_labels= list2sequence(train_data)
print(formatted_train_data[0], formated_train_labels[0])

Where in the world is Iguazu ? ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']


Dev data 

In [6]:
formatted_dev_data, formated_dev_labels= list2sequence(dev_data)
print(formatted_dev_data[0], formated_dev_labels[0])

where can I get morcillas in tampa bay , I will like the argentinian type , but I will to try anothers please ? ['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


#### Converting lables to indices 

In [7]:
# get the dictionary of incides for the labels 
class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        # self.word2idx = {}
        # self.idx2word = []
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word(idx)

label_indices = Vocab()
for labels in formated_train_labels:
    for label in labels:
        label_indices.getIdx(label, add=True)
print(label_indices.word2idx)

{'<PAD>': 0, 'O': 1, 'B-LOC': 2, 'I-LOC': 3, 'B-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'I-PER': 7}


In [8]:
print(list(label_indices.word2idx.keys())[list(label_indices.word2idx.values()).index(7)])

I-PER


In [9]:
# # converting labels to indices
# label_map = label_indices.word2idx
# formated_train_labels_idx = [[label_map[label] for label in label_list] for label_list in formated_train_labels]
# formated_train_labels_idx[0]

In [10]:
# # making the labels into a tensor 
# from torch.nn.utils.rnn import pad_sequence

# # Example list of lists with varying lengths
# # Pad the sequences
# formated_train_labels_tensor = pad_sequence([torch.tensor(seq) for seq in formated_train_labels_idx], batch_first=True, padding_value=0)

# print(formated_train_labels_tensor.shape)

#### Tokenizing the train data 

In [11]:
#BERT = 'bert-base-multilingual-cased'
BERT = 'distilbert-base-multilingual-cased'

In [12]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(BERT)
tok_train_data = tokenizer(formatted_train_data, padding=True, truncation=True, return_tensors="pt", add_special_tokens=False)
encoded_train_data = tok_train_data.input_ids
print(encoded_train_data) 
# getting the tokens from the ids 
id = encoded_train_data[0]
tokens = tokenizer.convert_ids_to_tokens(id)
print(tokens)

tensor([[23525, 10106, 10105,  ...,     0,     0,     0],
        [  146, 20337, 13078,  ...,     0,     0,     0],
        [21660, 10454, 14289,  ...,     0,     0,     0],
        ...,
        [10117, 10399, 10525,  ...,     0,     0,     0],
        [21200, 11131,   117,  ...,     0,     0,     0],
        [  146,   112,   181,  ...,     0,     0,     0]])
['Where', 'in', 'the', 'world', 'is', 'I', '##gua', '##zu', '?', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]

In [13]:
attention_mask = tok_train_data.attention_mask 

In [14]:
tokenizer  = AutoTokenizer.from_pretrained(BERT, use_fast=False)
# Tokenize input text and map tokens to token IDs    
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

tokenized_sentences = []
labels_train_formated = []
for tuple in train_data: 
    tokenized_sentence, label = tokenize_and_preserve_labels(tuple[0], tuple[1])
    tokenized_sentences.append(tokenized_sentence)
    labels_train_formated.append(label)

print(tokenized_sentences[0]) 
print(labels_train_formated[0])
max_len = 0
for i in tokenized_sentences: 
    if len(i) > max_len: 
        max_len = len(i)
print(max_len)
    

['Where', 'in', 'the', 'world', 'is', 'I', '##gua', '##zu', '?']
['O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O']
188


In [15]:
# changing labes into indices 
label_map = label_indices.word2idx
formated_train_labels_idx_tok = [[label_map[label] for label in label_list] for label_list in labels_train_formated]
formated_train_labels_idx_tok[0]

[1, 1, 1, 1, 1, 2, 2, 2, 1]

In [16]:
# making the labels into a tensor 
from torch.nn.utils.rnn import pad_sequence

# Example list of lists with varying lengths
# Pad the sequences
formated_train_labels_tensor = pad_sequence([torch.tensor(seq) for seq in formated_train_labels_idx_tok], batch_first=True, padding_value=0)

print(formated_train_labels_tensor.shape)
print(formated_train_labels_tensor[0])

torch.Size([12543, 188])
tensor([1, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


#### Tokenizing dev data 

In [17]:
tok_dev_data = tokenizer(formatted_train_data, padding=True, truncation=True, return_tensors="pt", add_special_tokens=False)
encoded_dev_data = tok_dev_data.input_ids
attention_mask_dev = tok_dev_data.attention_mask 

In [18]:
tokenized_sentences_dev = []
labels_dev_formated = []
for tuple in dev_data: 
    tokenized_sentence_dev, label_dev = tokenize_and_preserve_labels(tuple[0], tuple[1])
    tokenized_sentences_dev.append(tokenized_sentence_dev)
    labels_dev_formated.append(label_dev)
print(tokenized_sentences_dev[0])
print(labels_dev_formated[0])

['where', 'can', 'I', 'get', 'mor', '##cilla', '##s', 'in', 'tam', '##pa', 'bay', ',', 'I', 'will', 'like', 'the', 'argent', '##inia', '##n', 'type', ',', 'but', 'I', 'will', 'to', 'try', 'another', '##s', 'pl', '##eas', '##e', '?']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [19]:
# changing labes into indices 
label_map = label_indices.word2idx
formated_dev_labels_idx_tok = [[label_map[label] for label in label_list] for label_list in labels_dev_formated]

In [20]:
# Pad the sequences
formated_dev_labels_tensor = pad_sequence([torch.tensor(seq) for seq in formated_dev_labels_idx_tok], batch_first=True, padding_value=0)

print(formated_dev_labels_tensor.shape)
print(formated_dev_labels_tensor[0])

torch.Size([2001, 96])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


#### Batching the train data 

In [21]:
BATCH_SIZE = 128 
# checking shapes - dont know why they are not the same shapes 
print(formated_train_labels_tensor.shape)
print(encoded_train_data.shape)
print(attention_mask.shape)

torch.Size([12543, 188])
torch.Size([12543, 188])
torch.Size([12543, 188])


In [22]:
def batcher(data, batch_size): 
    feats = data.shape[1]
    num_batches = int(len(data)/batch_size)
    # slices the number of instances that can fit in the batches, then reshapes them into tensors each of dimensions (num_batches, batch_size, 100) i.e. each layer 
    # represents a batch with dimensions (batch size), 100  
    data_batches = data[:batch_size*num_batches].view(num_batches,batch_size, feats)
    # 6 batches with 32 instances with 100 features
    return data_batches

train_batches = batcher(encoded_train_data, BATCH_SIZE)
train_label_batches = batcher(formated_train_labels_tensor, BATCH_SIZE)
train_attention_masks = batcher(attention_mask, BATCH_SIZE)
print(train_batches.shape)
print(train_label_batches.shape)
print(train_attention_masks.shape)
# shape of each batch 
print(train_batches[1].shape)

torch.Size([97, 128, 188])
torch.Size([97, 128, 188])
torch.Size([97, 128, 188])
torch.Size([128, 188])


#### Batching the dev data 

In [23]:
dev_batches = batcher(encoded_dev_data, BATCH_SIZE)
dev_label_batches = batcher(formated_dev_labels_tensor, BATCH_SIZE)
dev_attention_masks_batch = batcher(attention_mask_dev, BATCH_SIZE)

#### Checking class frequencies 

In [24]:
labels_train_formated
freq = {}
for label_list in labels_train_formated: 
    for label in label_list:
        freq[label] = freq.get(label, 0) + 1
print(freq)
total_n = sum(freq.values())
n_samples = len(freq.keys())
weights = {}
for l in freq.keys(): 
    weights[l] = total_n / freq[l]
    #weights[l]=total_n / (freq[l]*n_samples)

#  {PAD: 0, 'O': 1, 'B-LOC': 2, 'I-LOC': 3, 'B-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'I-PER': 7}
class_weights = torch.tensor([0.0001, 0.15207336038684907,9.859821182401827, 33.17399946991784, 8.915874203084375, 14.379401459015451, 23.6205887903378,14.57276749330539  ])

{'O': 235160, 'B-LOC': 3627, 'I-LOC': 1078, 'B-PER': 4011, 'B-ORG': 2487, 'I-ORG': 1514, 'I-PER': 2454}


#### Training the model (trying to at least)

CODE DOES NOT WORK YET !!!! need to figure out how to flatten the labels tensor and make the shapes match 190 vs 188 (dont know why)

In [25]:
from torch import nn
import torch
from transformers import BertModel, AutoModel
torch.manual_seed(0)
BATCH_SIZE = 128
#BERT = 'bert-base-multilingual-cased'
BERT = 'distilbert-base-multilingual-cased'
LEARNING_RATE = 0.01
EPOCHS = 4
n_labels = len(label_indices.word2idx)
#n_labels = 7
max_len = train_batches.shape[2]
PAD = tokenizer.pad_token_id


class NER_Tagger(torch.nn.Module):
    def __init__(self, n_labels):
        super().__init__()
        # TODO
        # bert includes a linear layer 
        #self.bert = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_labels)
        self.bert = AutoModel.from_pretrained(BERT, num_labels=n_labels)
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, n_labels)
        #self.softmax = nn.Softmax(dim = 2)
        
    def forward(self, inputData, attention):
        # TODO
        # bert model output 
        output_bert = self.bert(input_ids = inputData, attention_mask = attention)
        #logits = output_bert.logits
        logits = self.linear(output_bert.last_hidden_state)
        # get probabilities 
        #probs = self.softmax(logits)
        return logits


    def predict(self, inputData, attention): 
        prediction_output = self.forward(inputData, attention) 
        prediction = torch.argmax(prediction_output, dim = 2)
        # ner_label = list(label_map.keys())[list(label_map.values()).index(prediction)]
        return prediction

In [26]:
model = NER_Tagger(n_labels)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
accuracy = []

for epoch in range(EPOCHS):
    # set model in training mode 
    model.train()
    # reset the gradient
    model.zero_grad()
    # loop over batches
    counter = 0
    loss = 0 
    total = 0
    match = 0

    for batch in range(5): #TODO
    #for batch in range(train_batches.shape[0]): #TODO

        # tokenizer here for each batch  
        predicted_values = model.forward(train_batches[batch], train_attention_masks[batch])

        print(predicted_values.shape)
        flattened_output = predicted_values.view(BATCH_SIZE * max_len, -1)
        #flattened_labels = train_label_batches[batch].view(-1).long()
        flattened_labels = train_label_batches[batch].view(BATCH_SIZE * max_len)
        # compute loss
        loss = loss_function(flattened_output, flattened_labels)
        #print(loss.item())
        predicted_labels = torch.argmax(predicted_values, dim = 2)
        #print(predicted_labels.shape)
        #print(predicted_labels)
        predicted_labels = predicted_labels.view(BATCH_SIZE, max_len)
        
        # update
        model.zero_grad()
        loss.backward()
        optimizer.step()
        # optimizer.zero_grad()
        counter +=1
        loss += loss.item()
        # printing the tokens and the 
        # for sentence in range(BATCH_SIZE): 
        #     ids = train_batches[batch][sentence]
        #     tokens = tokenizer.convert_ids_to_tokens(ids)
        #     preds = predicted_labels[sentence]
        #     print(tokens, preds)
        print(predicted_labels)
        print(train_label_batches[batch])
        
        for gold_sent, pred_sent in zip(train_label_batches[batch], predicted_labels):
            for gold_label, pred_label in zip(gold_sent, pred_sent):
                if gold_label != 0:
                    total += 1
                    if gold_label == pred_label:
                        match+= 1
    acc = match/total
    accuracy.append(acc)

accuracy_val = sum(accuracy) / len(accuracy)
print("total accuracy", accuracy_val)
    #print(loss)

torch.Size([128, 188, 8])
tensor([[4, 4, 4,  ..., 4, 4, 4],
        [7, 6, 4,  ..., 4, 4, 4],
        [3, 4, 4,  ..., 4, 4, 4],
        ...,
        [0, 4, 4,  ..., 4, 4, 4],
        [4, 4, 5,  ..., 4, 4, 4],
        [5, 5, 6,  ..., 4, 4, 4]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [2, 2, 2,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
torch.Size([128, 188, 8])
tensor([[0, 1, 0,  ..., 0, 0, 0],
        [4, 0, 0,  ..., 0, 0, 0],
        [0, 4, 0,  ..., 0, 0, 0],
        ...,
        [4, 0, 0,  ..., 0, 0, 0],
        [6, 1, 1,  ..., 0, 1, 1],
        [6, 6, 6,  ..., 7, 0, 6]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
torch.Size([128, 188, 8])
tensor([[0, 0, 0,  ..., 4, 0, 0],
        [4, 4, 0

#### Evaluation

In [None]:
print(dev_batches.shape)
attention_mask_dev.shape

torch.Size([391, 32, 188])


torch.Size([12543, 188])

In [None]:
model.eval()

total_evalv = 0
match_evalv = 0
accuracy_evalv = []
for batch in range(2):
    batch_dev_prediction = model.forward(dev_batches[batch], dev_attention_masks_batch[batch])

    predicted_labels_dev = torch.argmax(batch_dev_prediction, dim = 2)

    for gold_sent, pred_sent in zip(dev_label_batches[batch], predicted_labels_dev):
        for gold_label, pred_label in zip(gold_sent, pred_sent):
            if gold_label != 0:
                total_evalv += 1
                if gold_label == pred_label:
                    match_evalv += 1
                    
    acc_evalv = match/total
    accuracy_evalv.append(acc_evalv)

    # printing the tokens and the 
    for sentence in range(BATCH_SIZE): 
        ids = dev_batches[batch][sentence]
        tokens = tokenizer.convert_ids_to_tokens(ids)
        preds = predicted_labels_dev[sentence]
        print(tokens, preds)

accuracy_val_evalv = sum(accuracy_evalv) / len(accuracy_evalv)
print("accuracy", accuracy_val_evalv)

['Where', 'in', 'the', 'world', 'is', 'I', '##gua', '##zu', '?', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

#### Trying his library

In [None]:
from transformers import AutoTokenizer
import myutils
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
train_tokked = myutils.tok(formatted_train_data, tokenizer)