In [36]:
import torch
import numpy as np
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForTokenClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

In [37]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [38]:
LABELS = [ "O", "b-mount", "i-mount"]
num_labels_ = len(LABELS)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=num_labels_)


In [39]:
tokenizer.tokenize("Yayko-Perehinske")

['ya', '##yk', '##o', '-', 'pere', '##hin', '##ske']

In [40]:
sentence = "Everest is the highest mountain in the world."
sentence_tokenized = tokenizer.tokenize(sentence)
text_labels = ["b-mount", "O", "O", "O", "O", "O", "O", "O", "O"]
len(sentence_tokenized) == len(text_labels)

True

In [41]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)
    if len(tokenized_sentence) == len(labels):
        return tokenized_sentence, labels
    
a = tokenize_and_preserve_labels(sentence.split(), text_labels, tokenizer)
print(a)

(['everest', 'is', 'the', 'highest', 'mountain', 'in', 'the', 'world', '.'], ['b-mount', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])


In [42]:
def create_labels_dict(file):
    names_labels = {} 
    with open(file, "r", encoding="utf-8") as fd:
        res = fd.readlines()
        res = [i.strip() for i in res]
        res = [i.removesuffix(" \n") for i in res]
        res = [i.removesuffix("\n") for i in res]
    for name in res:
        if len(name.split(" ")) > 1:
            subnames = name.split(" ")
            first_name = {subnames[0].lower(): "b-mount"}
            names_labels.update(first_name)
            second_name = {subnames[i].lower(): "i-mount" for i in range(1, len(subnames[1:]) + 1)}
            names_labels.update(second_name)                        
        elif len(name.split("-")) > 1:
            subnames = name.split("-")
            names_labels.update({subnames[0].lower(): "b-mount"})
            names_labels.update({subnames[i].lower(): "i-mount" for i in range(1, len(subnames[1:]) + 1)})
        else:
            names_labels.update({name.lower(): "b-mount"})
    return names_labels
            
labels_dict = create_labels_dict("ua_mountains.txt")
labels_dict

{'hoverla': 'b-mount',
 'brebeneskul': 'b-mount',
 'pip': 'b-mount',
 'ivan': 'i-mount',
 'petros': 'b-mount',
 'hutyn': 'b-mount',
 'tomnatyk': 'i-mount',
 'rebra': 'b-mount',
 'menchul': 'b-mount',
 'turkul': 'b-mount',
 'breskul': 'b-mount',
 'smotrych': 'b-mount',
 'blyznytsya': 'b-mount',
 'dzembronia': 'b-mount',
 'shpytsi': 'b-mount',
 'petrosul': 'b-mount',
 'dantsir': 'b-mount',
 'pozhyzhevska': 'b-mount',
 'neniska': 'b-mount',
 'velyka': 'i-mount',
 'syvulya': 'b-mount',
 'ihrovets': 'b-mount',
 'zherban': 'b-mount',
 'bratkivska': 'b-mount',
 'homul': 'b-mount',
 'shuryn': 'b-mount',
 'velyky': 'b-mount',
 'kotel': 'i-mount',
 'chyvchyn': 'b-mount',
 'dohyaska': 'b-mount',
 'hropa': 'b-mount',
 'dragobrat': 'b-mount',
 'dovbushanka': 'b-mount',
 'grofa': 'b-mount',
 'popadya': 'b-mount',
 'parenky': 'b-mount',
 'koman': 'b-mount',
 'moloda': 'b-mount',
 'strymba': 'b-mount',
 'chorna': 'b-mount',
 'kleva': 'i-mount',
 'tataruka': 'b-mount',
 'durna': 'b-mount',
 'unharyaska

In [43]:
def split_punct(text, punct):
    result = []
    for i in list(text):
        r = i.strip().split(punct)
        if len(r) > 1:
            for j in range(len(r)):
                if j < len(r)-1:
                    result.append(r[j] + " " + punct)
                else:
                    if r[j]: # last part without "punct", if it have "punct" in the end than last part will be ""
                        result.append(r[j]) 
        else:
            result.append(i)
            
    return result



In [44]:
file = "ua_text.txt"
with open(file, "r", encoding="utf-8") as fd:
    res = []
    while True:
        line = fd.readline()
        if not line:
            break
        res.append(line.replace("\n", "")) # yield to generator
        
res = split_punct(res, "!")
res = split_punct(res, "?")
res = split_punct(res, ".")
print(len(res))
res[-1]

59


' Together, Tempa, Pidpula, Yayko-Perehinske, Baba-Lyudova, Kernychny, Gorgan-Ilemsky, Skupova, Yarovytsya, Berlyaska, Tarnavytsya, and Roztitska form a captivating ensemble, inviting explorers to immerse themselves in the diverse wonders of the Ukrainian Carpathians .'

In [45]:
data = []
for sentence in res:
    sentence_tokenized = tokenizer.tokenize(sentence.strip())
    splited_sentense = []
    word = []
    for i in range(len(sentence_tokenized) - 1):
        if i != 0:
            if sentence_tokenized[i].startswith("##"):
                if not word:
                    word.append(splited_sentense.pop())
                    word.append(sentence_tokenized[i].removeprefix("##"))
                else:
                    word.append(sentence_tokenized[i].removeprefix("##"))
            else:
                if word:
                    splited_sentense.append("".join(word))
                    word = []
                    splited_sentense.append(sentence_tokenized[i])
                else:
                    splited_sentense.append(sentence_tokenized[i])
        else:
            splited_sentense.append(sentence_tokenized[0])
    
    masked_layer = []
    for word in splited_sentense:
        if word in labels_dict:
            masked_layer.append(labels_dict[word])
        else:
            masked_layer.append("O")
    data.append(tokenize_and_preserve_labels(splited_sentense, masked_layer, tokenizer))
    
print(len(data))

59


In [46]:
label2id = {k: v for v, k in enumerate(LABELS)}
id2label = {v: k for v, k in enumerate(LABELS)}
label2id

{'O': 0, 'b-mount': 1, 'i-mount': 2}

In [47]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [48]:
X = [data[i][0] for i in range(len(data))]
Y = [data[i][1] for i in range(len(data))]


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [50]:
class dataset(Dataset):
    def __init__(self, X, Y, tokenizer, max_len):
        self.len = len(X)
        self.X = X
        self.Y = Y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        tokenized_sentence = self.X[index]
        word_labels = self.Y[index]

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        word_labels = ["O"] + word_labels + ["O"] # add special tokens


        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          word_labels = word_labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          word_labels = word_labels + ["O" for _ in range(maxlen - len(word_labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in word_labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [51]:
training_set = dataset(X_train, y_train, tokenizer, MAX_LEN)
testing_set = dataset(X_test, y_test, tokenizer, MAX_LEN)

In [52]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [53]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [56]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(1.1710, grad_fn=<NllLossBackward0>)

In [57]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 3])

In [58]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [59]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [60]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 1.1465680599212646
Training loss epoch: 0.6939194947481155
Training accuracy epoch: 0.6871337873990995
Training epoch: 2
Training loss per 100 training steps: 0.21100948750972748
Training loss epoch: 0.16590232650438944
Training accuracy epoch: 0.8720928151035992
Training epoch: 3
Training loss per 100 training steps: 0.09916052222251892
Training loss epoch: 0.10192977078258991
Training accuracy epoch: 0.8746521318636346
Training epoch: 4
Training loss per 100 training steps: 0.08047659695148468
Training loss epoch: 0.06350787822157145
Training accuracy epoch: 0.9374637863454932
Training epoch: 5
Training loss per 100 training steps: 0.0531141422688961
Training loss epoch: 0.046093177516013384
Training accuracy epoch: 0.9569449483395362


In [61]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [62]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.02800721302628517
Validation Loss: 0.02808769481877486
Validation Accuracy: 0.9861345720720721


In [63]:
sentence = " The highest mountain in Ukraine is Hoverla."

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

the highest mountain in ukraine is hoverla .
['O', 'O', 'O', 'O', 'O', 'O', 'b-mount', 'b-mount', 'b-mount', 'O']


In [64]:
torch.save(model, "saved_model.save")


In [66]:
model = torch.load("saved_model.save")
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [67]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.017578210681676865
Validation Loss: 0.028087694508334
Validation Accuracy: 0.987349836184929
