In [2]:
import torch

### Loading the data 

In [3]:
# loading data 
def read_iob2_file(path):
    """
    read in iob2 file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()

        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [4]:
train_data = read_iob2_file("en_ewt-ud-train.iob2")
dev_data = read_iob2_file("en_ewt-ud-dev.iob2")
print(train_data[1])

(['Iguazu', 'Falls'], ['B-LOC', 'I-LOC'])


### Preprocessing the data 

#### Turning into list of strings and labels 

In [5]:
# formatting the data 
def list2sequence(data): 
    formated_data = [" ".join(sublist) for sublist, labels in data]
    #formated_data = [[" ".join(sublist), labels] for sublist, labels in data]
    formated_labels = [labels for sublist, labels in data]
    return formated_data, formated_labels
formatted_train_data, formated_train_labels= list2sequence(train_data)
print(formatted_train_data[0], formated_train_labels[0])

Where in the world is Iguazu ? ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']


#### Converting lables to indices 

In [6]:
# get the dictionary of incides for the labels 
class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        # self.word2idx = {}
        # self.idx2word = []
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word(idx)

label_indices = Vocab()
for labels in formated_train_labels:
    for label in labels:
        label_indices.getIdx(label, add=True)
print(label_indices.word2idx)

{'<PAD>': 0, 'O': 1, 'B-LOC': 2, 'I-LOC': 3, 'B-PER': 4, 'B-ORG': 5, 'I-ORG': 6, 'I-PER': 7}


In [6]:
# # converting labels to indices
# label_map = label_indices.word2idx
# formated_train_labels_idx = [[label_map[label] for label in label_list] for label_list in formated_train_labels]
# formated_train_labels_idx[0]

In [7]:
# # making the labels into a tensor 
# from torch.nn.utils.rnn import pad_sequence

# # Example list of lists with varying lengths
# # Pad the sequences
# formated_train_labels_tensor = pad_sequence([torch.tensor(seq) for seq in formated_train_labels_idx], batch_first=True, padding_value=0)

# print(formated_train_labels_tensor.shape)

#### Tokenizing the data 

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tok_train_data = tokenizer(formatted_train_data, padding=True, truncation=True, return_tensors="pt", add_special_tokens=False)
encoded_train_data = tok_train_data.input_ids
print(encoded_train_data.shape)

torch.Size([12543, 188])


In [8]:
tokenizer  = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=False)
# Tokenize input text and map tokens to token IDs    
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

tokenized_sentences = []
labels_train_formated = []
for tuple in train_data: 
    tokenized_sentence, label = tokenize_and_preserve_labels(tuple[0], tuple[1])
    tokenized_sentences.append(tokenized_sentence)
    labels_train_formated.append(label)

print(tokenized_sentences[0]) 
print(labels_train_formated[0])
max_len = 0
for i in tokenized_sentences: 
    if len(i) > max_len: 
        max_len = len(i)
print(max_len)
    

['Where', 'in', 'the', 'world', 'is', 'I', '##gua', '##zu', '?']
['O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'B-LOC', 'O']
188


In [9]:
# changing labes into indices 
label_map = label_indices.word2idx
formated_train_labels_idx_tok = [[label_map[label] for label in label_list] for label_list in labels_train_formated]
formated_train_labels_idx_tok[0]

[1, 1, 1, 1, 1, 2, 2, 2, 1]

In [10]:
# making the labels into a tensor 
from torch.nn.utils.rnn import pad_sequence

# Example list of lists with varying lengths
# Pad the sequences
formated_train_labels_tensor = pad_sequence([torch.tensor(seq) for seq in formated_train_labels_idx_tok], batch_first=True, padding_value=0)

print(formated_train_labels_tensor.shape)

torch.Size([12543, 188])
tensor([1, 1, 1, 1, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


#### Batching the data 

In [11]:
# checking shapes - dont know why they are not the same shapes 
print(formated_train_labels_tensor.shape)
print(encoded_train_data.shape)

torch.Size([12543, 188])
torch.Size([12543, 188])


In [16]:
def batcher(data, batch_size): 
    feats = data.shape[1]
    num_batches = int(len(data)/batch_size)
    # slices the number of instances that can fit in the batches, then reshapes them into tensors each of dimensions (num_batches, batch_size, 100) i.e. each layer 
    # represents a batch with dimensions (batch size), 100  
    data_batches = data[:batch_size*num_batches].view(num_batches,batch_size, feats)
    # 6 batches with 32 instances with 100 features
    return data_batches

BATCH_SIZE = 128

train_batches = batcher(encoded_train_data, BATCH_SIZE)
train_label_batches = batcher(formated_train_labels_tensor, BATCH_SIZE)
print(train_batches.shape)
print(train_label_batches.shape)
# shape of each batch 
print(train_batches[1].shape)
train_batches.shape[2]

torch.Size([97, 128, 188])
torch.Size([97, 128, 188])
torch.Size([128, 188])


188

#### Training the model (trying to at least)

CODE DOES NOT WORK YET !!!! need to figure out how to flatten the labels tensor and make the shapes match 190 vs 188 (dont know why)

In [1]:
from torch import nn
import torch
from transformers import BertModel
torch.manual_seed(0)
LEARNING_RATE = 0.1
EPOCHS = 4
n_labels = len(label_indices.word2idx)
max_len = train_batches.shape[2]


class NER_Tagger(torch.nn.Module):
    def __init__(self, n_labels):
        super().__init__()
        # TODO
        # bert includes a linear layer 
        #self.bert = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_labels)
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased', num_labels=n_labels)
        # self.dropout = nn.Dropout(0.1)
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, n_labels)
        self.softmax = nn.Softmax(dim = 2)
        
    def forward(self, inputData):
        # TODO
        # bert model output 
        output_bert = self.bert(inputData)
        #logits = output_bert.logits
        logits = self.linear(output_bert.last_hidden_state)
        # get probabilities 
        probs = self.softmax(logits)
        return probs


    def predict(self, inputData): 
        prediction_output = self.forward(inputData) 
        prediction = torch.argmax(prediction_output, dim = 2)
        return prediction 



model = NER_Tagger(n_labels)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
loss_function = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

In [18]:
for epoch in range(EPOCHS):
    # set model in training mode 
    print("hej1")
    model.train()
    # reset the gradient
    model.zero_grad()
    # loop over batches
    counter = 0
    loss = 0 
    print("hej2")
    for batch in range(train_batches.shape[0]): #TODO
        # tokenizer here for each batch  
        print("hej3")
        predicted_values = model.forward(train_batches[batch])
        print("hej4")
        flattened_output = predicted_values.view(BATCH_SIZE * max_len, -1)
        flattened_labels = train_label_batches[batch].view(BATCH_SIZE * max_len)
        # compute loss
        loss = loss_function(flattened_output, flattened_labels)
        print("hej5")
        predicted_labels = torch.argmax(predicted_values, dim = 2)
        predicted_labels = predicted_labels.view(BATCH_SIZE, max_len)
        
        # update
        optimizer.zero_grad()
        print("hej6")
        loss.backward()
        print("hej7")
        optimizer.step()
        model.zero_grad()
        counter +=1
        loss += loss.item()
        # TODO
        break
    break
        
# set to evaluation mode
model.eval()

hej
hej1




hej2


#### Trying his library

In [None]:
from transformers import AutoTokenizer
import myutils
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
train_tokked = myutils.tok(formatted_train_data, tokenizer)

[[101, 23525, 10106, 10105, 11356, 10124, 146, 20337, 13078, 136, 102],
 [101, 146, 20337, 13078, 23118, 102],
 [101,
  21660,
  10454,
  14289,
  10114,
  10347,
  10464,
  10108,
  10105,
  10992,
  32650,
  78125,
  24236,
  12286,
  38619,
  10106,
  10105,
  11356,
  117,
  10105,
  146,
  20337,
  13078,
  23118,
  10135,
  10105,
  20949,
  10108,
  12853,
  10111,
  16765,
  117,
  10301,
  169,
  88134,
  14982,
  12888,
  57642,
  10106,
  10105,
  11168,
  119,
  102],
 [101,
  10117,
  12672,
  10108,
  10105,
  35017,
  10124,
  11053,
  52472,
  10146,
  100,
  69699,
  62355,
  10213,
  15973,
  82111,
  100,
  113,
  10117,
  25410,
  100,
  187,
  51635,
  64825,
  114,
  117,
  10940,
  10105,
  42126,
  15901,
  10108,
  12286,
  104838,
  169,
  29152,
  12606,
  10123,
  10108,
  32650,
  23432,
  10708,
  10105,
  12566,
  119,
  102],
 [101,
  12613,
  10105,
  42230,
  57667,
  37158,
  10376,
  12935,
  10135,
  10105,
  32650,
  23432,
  117,
  29132,
  169,
 

In [None]:
BATCH_SIZE = 16
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PAD = tokenizer.pad_token_id
train_data_batched, train_labels_batched = myutils.to_batch(train_tokked, formated_train_labels_idx, BATCH_SIZE, PAD, DEVICE)


TypeError: can't assign a list to a torch.LongTensor

In [None]:
from transformers import AutoTokenizer

tokenizer  = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=False)
# Tokenize input text and map tokens to token IDs    
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

train_data_formated = []
for t in train_data: 
    train_data_formated.append((tokenize_and_preserve_labels(t[0], t[1])))
print(train_data_formated[0])