In [2]:
import random
import re

import pandas as pd
import torch

from collections import defaultdict

from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from tqdm import trange, tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ENTITIES = ['PERSON',
 'PROFESSION',
 'ORGANIZATION',
 'EVENT',
 'DATE',
 'COUNTRY',
 'CITY',
 'NUMBER',
 'AGE',
 'ORDINAL',
 'NATIONALITY',
 'FACILITY',
 'STATE_OR_PROVINCE',
 'LAW',
 'AWARD',
 'LOCATION',
 'IDEOLOGY',
 'WORK_OF_ART',
 'PRODUCT',
 'CRIME',
 'DISEASE',
 'TIME',
 'MONEY',
 'DISTRICT',
 'PENALTY',
 'RELIGION',
 'PERCENT',
 'LANGUAGE',
 'FAMILY']

In [4]:
train_json = pd.read_json("../data/public_dat/train.jsonl", lines=True)
train_json

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4
...,...,...,...
514,"[[42, 46, COUNTRY], [82, 87, COUNTRY], [104, 1...",Глава Малайзии: мы не хотим противостоять Кита...,514
515,"[[1, 4, PRODUCT], [31, 33, FACILITY], [35, 44,...",«Союз» впервые пристыковался к МКС за 6 часов\...,515
516,"[[0, 4, PERSON], [8, 12, PERSON], [45, 52, AGE...",Трамп и Путин сделали совместное заявление к 7...,516
517,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ...",Российский магнат устроил самую дорогую свадьб...,517


In [5]:
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
print(tokenizer.encode("что то Тут написал [PAD]", add_special_tokens=True, return_tensors='pt'))
print(tokenizer.decode([101, 1997, 3815, 29955, 12715,0, 102]))
print(tokenizer.decode([1997, 3815, 29955, 12715]))

tensor([[  101,  1997,  3815, 29955, 12715,     0,   102]])
[CLS] что то Тут написал [PAD] [SEP]
что то Тут написал


In [6]:
def _split_long_seq(
    input_ids,
    labels,
    max_length: int,
    cls_id: int = 101,
    sep_id: int = 102,
    pad_id: int = 0,
) -> tuple[torch.Tensor, list[str]]:
    labels = labels[1:-1]
    input_ids = input_ids[:, 1:-1]  # remove special tokens from beginning and end
    new_input_ids = []
    new_labels = []
    for i in range(0, input_ids.shape[1], max_length - 2):
        ids = input_ids[:, i : min(i + max_length - 2, input_ids.shape[1])]
        c_ids = torch.zeros((ids.shape[0], max_length), dtype=torch.long).type(torch.LongTensor)
        c_ids[:, 0] = cls_id
        c_ids[:, 1:1 + ids.shape[1]]
        c_ids[:, 1 + ids.shape[1]] = sep_id
        c_ids[:, 2 + ids.shape[1]: ] = pad_id
 
        c_labels = (
            ["[CLS]"]
            + labels[i : min(i + max_length - 2, input_ids.shape[0])]
            + ["[SEP]"]
        )
        new_input_ids.append(c_ids)
        new_labels.append(c_labels)

    return new_input_ids, new_labels

In [30]:

from torch import nn
import torch.utils
import torch.utils.data


def _get_unique_entities_seq(entities: list[str], labels: list[str]) -> dict:
    was = set()
    for l in labels:
        if l in entities:
            was.add(l)
    
    new_labels = {}
    for ent in was:
        c_seq = []
        for i in range(len(labels)):
            if labels[i] == ent:
                c_seq.append(1)
            else:
                c_seq.append(0)
        new_labels[ent] = c_seq
    return new_labels

class NestedNERDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, entities: list[str]):
        self.df = df
        self.entities = entities
        self.tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
        self.bert_model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased")
        self.tokenized_ds = []
        self.tokenized_per_entity = []
        self._tokenize_per_class()
        self._get_word_embds()

    def _tokenize_per_class(self):
        # for ent in self.entities:
        #     self.tokenized_per_entity.append([])

        for _, data in tqdm(self.df.iterrows(), total=self.df.shape[0]):
            sentence = data["sentences"]
            ners = data["ners"]
            tokenized_sentence = self.tokenizer.tokenize(sentence)
            # print("\n")
            # print(tokenized_sentence)
            tokenized_length = len(tokenized_sentence)

            tokens = []
            labels = []

            for _ in range(tokenized_length):
                labels.append("O")  # "O" represents tokens outside entities

            for start, end, label in ners:
                # Convert character-level annotations to token-level
                start_token = self.tokenizer.encode(
                    sentence[:start], add_special_tokens=False
                )
                end_token = self.tokenizer.encode(
                    sentence[: end + 1], add_special_tokens=False
                )
                # print(sentence[start:end+1], ":", len(start_token), len(end_token), "-->", len(labels))
                # Update labels for tokens within the entity span
                for i in range(len(start_token), len(end_token)):
                    labels[i] = label

            labels = ["[CLS]"] + labels + ["[SEP]"]
            input_ids = self.tokenizer.encode(
                tokenized_sentence,
                add_special_tokens=True,
                return_tensors="pt",
                padding="max_length",
                max_length=512,
            )
            if input_ids.shape[1] > 512:
                # print("WAS", input_ids.shape)
                input_ids, labels = _split_long_seq(input_ids, labels, 512)
                for i in range(len(labels)):
                    # print(input_ids[i].shape)
                    self.tokenized_ds.append(
                        {
                            "input_ids": input_ids[i],
                            "attention_mask": torch.ones_like(input_ids[i]),
                            "labels": labels[i]
                        }
                    )
            else:
                attention_mask = torch.ones_like(input_ids)
                self.tokenized_ds.append(
                    {
                        "input_ids": input_ids,
                        "attention_mask": attention_mask,
                        "labels": labels,
                    }
                )

    def _get_word_embds(self, batch_size: int = 4):
        # all_tokens = pass
        all_tokens = [x["input_ids"] for x in self.tokenized_ds]
        attention_masks = [x["attention_mask"] for x in self.tokenized_ds]
        all_labels = [x["labels"] for x in self.tokenized_ds]
        
        ds = {x:([], []) for x in self.entities}
        
        with torch.no_grad():
            for i in trange(0, len(all_tokens), batch_size):
                # print([x.shape for x in all_tokens[i : min(i + batch_size, len(all_tokens))]])
                end_idx = min(i + batch_size, len(all_tokens))
                input_ids = torch.cat(
                    all_tokens[i :end_idx]
                )
                att_mask = torch.cat(
                    attention_masks[i : end_idx]
                )
                labels = all_labels[i : end_idx]
                outputs = self.bert_model(input_ids, attention_mask=att_mask)
                word_embeddings = outputs.last_hidden_state
                
                word_embeddings = word_embeddings[:, 1:-1]
    
                for j in range(len(labels)):
                    seqs = _get_unique_entities_seq(self.entities, labels[j][1:-1])
                    for ent_name in seqs:
                        # sequence labels
                        for k in range(len(seqs[ent_name])):
                            # ds[ent_name][1].append(seqs[ent_name][k])
                            if seqs[ent_name][k] == 0:
                                ds[ent_name][0].append(word_embeddings[j][k])
                            else:
                                ds[ent_name][1].append(word_embeddings[j][k])

        self.per_entity = {}
        for ent in ds:
            negative_words, positive_samples = ds[ent]
            if len(negative_words) == 0 or len(positive_samples) == 0:
                print(ent, "does have problems")
                continue
            
            negative_words = torch.stack(negative_words)
            positive_samples = torch.stack(positive_samples)
            negative_words = negative_words[torch.randperm(negative_words.shape[0])]
            
            negative_words = negative_words[:positive_samples.shape[0]]
            
            c_labels = torch.cat([torch.ones(positive_samples.shape[0]), torch.zeros(positive_samples.shape[0])])
            c_word_embds = torch.cat([positive_samples, negative_words])
            self.per_entity[ent] = torch.utils.data.TensorDataset(c_word_embds, c_labels)

In [31]:
ds = NestedNERDataset(train_json, ENTITIES)
# ds.tokenized_ds[0]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 519/519 [01:14<00:00,  7.01it/s]
100%|██████████| 135/135 [05:00<00:00,  2.23s/it]


In [33]:
class SimpleModel(nn.Module):
    def __init__(self, emb_dim: int, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.cls = nn.Sequential(
            nn.Linear(emb_dim, 100),
            nn.ReLU(),
            nn.Linear(100, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.cls.forward(x)

In [40]:
from sklearn.metrics import classification_report

def train_one_epoch(model: nn.Module, optimizer: torch.optim.Optimizer, loss, train_loader):
    model.train()
    avg_loss = 0
    cnt = 0
    bar = tqdm(train_loader)
    for x, label in bar:
        optimizer.zero_grad()
        out = model(x)
        l = loss(out.flatten(), label.flatten())
        l.backward()
        avg_loss = (avg_loss * cnt + l.cpu().item()) / (cnt + 1)
        cnt += 1
        bar.set_postfix({"avg. loss": avg_loss})
        optimizer.step()
        
def test_epoch(model: nn.Module, loss, loader):
    model.eval()
    avg_loss = 0
    cnt = 0
    bar = tqdm(loader)
    gt = []
    pred = []
    for x, label in bar:
        out = model(x)
        l = loss(out.flatten(), label.flatten())
        avg_loss = (avg_loss * cnt + l.cpu().item()) / (cnt + 1)
        cnt += 1
        bar.set_postfix({"avg. loss": avg_loss})
        
        c_pred = out > 0.5
        pred = pred + c_pred.cpu().detach().tolist()
        gt = gt + label.cpu().detach().tolist()
    
    print(classification_report(gt, pred))

def train(model: nn.Module, train_loader, test_loader, n_iter: int = 10):
    optimizer = torch.optim.Adam(model.parameters())
    loss = nn.BCELoss()
    for i in range(n_iter):
        print(f"Epoch {i}:")
        train_one_epoch(model, optimizer, loss, train_loader)
    print("On test:")
    test_epoch(model, loss, test_loader)
    

In [43]:
models = {}

for ent in ds.per_entity:
    print(f"Train {ent}")
    c_m = SimpleModel(768)
    train_ds, test_ds = torch.utils.data.random_split(ds.per_entity[ent], [0.8, 0.2])
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=20)
    test_loader = torch.utils.data.DataLoader(train_ds, batch_size=20)

    train(c_m, train_loader, test_loader, 3)
    models[ent] = c_m

Train PERSON
Epoch 0:


100%|██████████| 971/971 [00:02<00:00, 438.44it/s, avg. loss=0.0694]


Epoch 1:


100%|██████████| 971/971 [00:02<00:00, 480.69it/s, avg. loss=0.0344]


Epoch 2:


100%|██████████| 971/971 [00:02<00:00, 483.40it/s, avg. loss=0.0227]


On test:


100%|██████████| 971/971 [00:01<00:00, 716.32it/s, avg. loss=0.0138]


              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      9705
         1.0       0.99      1.00      1.00      9702

    accuracy                           1.00     19407
   macro avg       1.00      1.00      1.00     19407
weighted avg       1.00      1.00      1.00     19407

Train PROFESSION
Epoch 0:


100%|██████████| 548/548 [00:01<00:00, 477.70it/s, avg. loss=0.245]


Epoch 1:


100%|██████████| 548/548 [00:01<00:00, 370.01it/s, avg. loss=0.166]


Epoch 2:


100%|██████████| 548/548 [00:01<00:00, 393.38it/s, avg. loss=0.125]


On test:


100%|██████████| 548/548 [00:00<00:00, 846.72it/s, avg. loss=0.091] 


              precision    recall  f1-score   support

         0.0       0.96      0.97      0.97      5502
         1.0       0.97      0.96      0.97      5455

    accuracy                           0.97     10957
   macro avg       0.97      0.97      0.97     10957
weighted avg       0.97      0.97      0.97     10957

Train ORGANIZATION
Epoch 0:


100%|██████████| 500/500 [00:01<00:00, 438.21it/s, avg. loss=0.24] 


Epoch 1:


100%|██████████| 500/500 [00:01<00:00, 480.00it/s, avg. loss=0.163]


Epoch 2:


100%|██████████| 500/500 [00:01<00:00, 468.87it/s, avg. loss=0.128]


On test:


100%|██████████| 500/500 [00:00<00:00, 835.34it/s, avg. loss=0.106]


              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96      4965
         1.0       0.93      0.99      0.96      5031

    accuracy                           0.96      9996
   macro avg       0.96      0.96      0.96      9996
weighted avg       0.96      0.96      0.96      9996

Train EVENT
Epoch 0:


100%|██████████| 459/459 [00:01<00:00, 452.73it/s, avg. loss=0.366]


Epoch 1:


100%|██████████| 459/459 [00:00<00:00, 464.09it/s, avg. loss=0.272]


Epoch 2:


100%|██████████| 459/459 [00:00<00:00, 491.21it/s, avg. loss=0.215]


On test:


100%|██████████| 459/459 [00:00<00:00, 755.55it/s, avg. loss=0.161]


              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94      4598
         1.0       0.93      0.95      0.94      4577

    accuracy                           0.94      9175
   macro avg       0.94      0.94      0.94      9175
weighted avg       0.94      0.94      0.94      9175

Train DATE
Epoch 0:


100%|██████████| 618/618 [00:01<00:00, 464.15it/s, avg. loss=0.098] 


Epoch 1:


100%|██████████| 618/618 [00:01<00:00, 471.02it/s, avg. loss=0.0544]


Epoch 2:


100%|██████████| 618/618 [00:01<00:00, 468.11it/s, avg. loss=0.0375]


On test:


100%|██████████| 618/618 [00:00<00:00, 755.40it/s, avg. loss=0.0283]


              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      6151
         1.0       0.99      0.99      0.99      6192

    accuracy                           0.99     12343
   macro avg       0.99      0.99      0.99     12343
weighted avg       0.99      0.99      0.99     12343

Train COUNTRY
Epoch 0:


100%|██████████| 155/155 [00:00<00:00, 460.15it/s, avg. loss=0.15] 


Epoch 1:


100%|██████████| 155/155 [00:00<00:00, 440.45it/s, avg. loss=0.053] 


Epoch 2:


100%|██████████| 155/155 [00:00<00:00, 402.77it/s, avg. loss=0.0334]


On test:


100%|██████████| 155/155 [00:00<00:00, 873.99it/s, avg. loss=0.0263]


              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      1550
         1.0       0.98      1.00      0.99      1546

    accuracy                           0.99      3096
   macro avg       0.99      0.99      0.99      3096
weighted avg       0.99      0.99      0.99      3096

Train CITY
Epoch 0:


100%|██████████| 136/136 [00:00<00:00, 451.80it/s, avg. loss=0.192]


Epoch 1:


100%|██████████| 136/136 [00:00<00:00, 477.13it/s, avg. loss=0.0787]


Epoch 2:


100%|██████████| 136/136 [00:00<00:00, 463.01it/s, avg. loss=0.051]


On test:


100%|██████████| 136/136 [00:00<00:00, 631.21it/s, avg. loss=0.0256]


              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      1350
         1.0       0.99      1.00      0.99      1353

    accuracy                           0.99      2703
   macro avg       0.99      0.99      0.99      2703
weighted avg       0.99      0.99      0.99      2703

Train NUMBER
Epoch 0:


100%|██████████| 106/106 [00:00<00:00, 453.38it/s, avg. loss=0.211]


Epoch 1:


100%|██████████| 106/106 [00:00<00:00, 454.68it/s, avg. loss=0.0708]


Epoch 2:


100%|██████████| 106/106 [00:00<00:00, 463.50it/s, avg. loss=0.0478]


On test:


100%|██████████| 106/106 [00:00<00:00, 838.16it/s, avg. loss=0.034]


              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      1069
         1.0       0.98      1.00      0.99      1048

    accuracy                           0.99      2117
   macro avg       0.99      0.99      0.99      2117
weighted avg       0.99      0.99      0.99      2117

Train AGE
Epoch 0:


100%|██████████| 139/139 [00:00<00:00, 385.85it/s, avg. loss=0.112]


Epoch 1:


100%|██████████| 139/139 [00:00<00:00, 465.65it/s, avg. loss=0.0284]


Epoch 2:


100%|██████████| 139/139 [00:00<00:00, 456.49it/s, avg. loss=0.0139]


On test:


100%|██████████| 139/139 [00:00<00:00, 695.22it/s, avg. loss=0.00686]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1401
         1.0       1.00      1.00      1.00      1375

    accuracy                           1.00      2776
   macro avg       1.00      1.00      1.00      2776
weighted avg       1.00      1.00      1.00      2776

Train ORDINAL
Epoch 0:


100%|██████████| 50/50 [00:00<00:00, 453.61it/s, avg. loss=0.294]


Epoch 1:


100%|██████████| 50/50 [00:00<00:00, 457.76it/s, avg. loss=0.0571]


Epoch 2:


100%|██████████| 50/50 [00:00<00:00, 332.24it/s, avg. loss=0.0262]


On test:


100%|██████████| 50/50 [00:00<00:00, 828.39it/s, avg. loss=0.0161]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       508
         1.0       1.00      1.00      1.00       486

    accuracy                           1.00       994
   macro avg       1.00      1.00      1.00       994
weighted avg       1.00      1.00      1.00       994

Train NATIONALITY
Epoch 0:


100%|██████████| 44/44 [00:00<00:00, 461.45it/s, avg. loss=0.293]


Epoch 1:


100%|██████████| 44/44 [00:00<00:00, 486.16it/s, avg. loss=0.0633]


Epoch 2:


100%|██████████| 44/44 [00:00<00:00, 476.22it/s, avg. loss=0.0319]


On test:


100%|██████████| 44/44 [00:00<00:00, 672.23it/s, avg. loss=0.0136]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       442
         1.0       1.00      1.00      1.00       424

    accuracy                           1.00       866
   macro avg       1.00      1.00      1.00       866
weighted avg       1.00      1.00      1.00       866

Train FACILITY
Epoch 0:


100%|██████████| 83/83 [00:00<00:00, 478.97it/s, avg. loss=0.359]


Epoch 1:


100%|██████████| 83/83 [00:00<00:00, 485.36it/s, avg. loss=0.19] 


Epoch 2:


100%|██████████| 83/83 [00:00<00:00, 377.85it/s, avg. loss=0.143]


On test:


100%|██████████| 83/83 [00:00<00:00, 773.50it/s, avg. loss=0.101] 


              precision    recall  f1-score   support

         0.0       0.99      0.94      0.97       836
         1.0       0.94      1.00      0.97       817

    accuracy                           0.97      1653
   macro avg       0.97      0.97      0.97      1653
weighted avg       0.97      0.97      0.97      1653

Train STATE_OR_PROVINCE
Epoch 0:


100%|██████████| 37/37 [00:00<00:00, 452.19it/s, avg. loss=0.327]


Epoch 1:


100%|██████████| 37/37 [00:00<00:00, 393.08it/s, avg. loss=0.0918]


Epoch 2:


100%|██████████| 37/37 [00:00<00:00, 468.00it/s, avg. loss=0.0425]


On test:


100%|██████████| 37/37 [00:00<00:00, 791.06it/s, avg. loss=0.0272]


              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       364
         1.0       0.99      1.00      1.00       360

    accuracy                           1.00       724
   macro avg       1.00      1.00      1.00       724
weighted avg       1.00      1.00      1.00       724

Train LAW
Epoch 0:


100%|██████████| 96/96 [00:00<00:00, 460.55it/s, avg. loss=0.377]


Epoch 1:


100%|██████████| 96/96 [00:00<00:00, 472.65it/s, avg. loss=0.211]


Epoch 2:


100%|██████████| 96/96 [00:00<00:00, 393.91it/s, avg. loss=0.154]


On test:


100%|██████████| 96/96 [00:00<00:00, 808.72it/s, avg. loss=0.0982]


              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       956
         1.0       0.97      0.98      0.98       955

    accuracy                           0.98      1911
   macro avg       0.98      0.98      0.98      1911
weighted avg       0.98      0.98      0.98      1911

Train AWARD
Epoch 0:


100%|██████████| 68/68 [00:00<00:00, 484.80it/s, avg. loss=0.354]


Epoch 1:


100%|██████████| 68/68 [00:00<00:00, 469.30it/s, avg. loss=0.172]


Epoch 2:


100%|██████████| 68/68 [00:00<00:00, 485.43it/s, avg. loss=0.125]


On test:


100%|██████████| 68/68 [00:00<00:00, 817.60it/s, avg. loss=0.0814]


              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97       673
         1.0       0.96      0.99      0.98       671

    accuracy                           0.97      1344
   macro avg       0.98      0.97      0.97      1344
weighted avg       0.98      0.97      0.97      1344

Train LOCATION
Epoch 0:


100%|██████████| 40/40 [00:00<00:00, 457.11it/s, avg. loss=0.422]


Epoch 1:


100%|██████████| 40/40 [00:00<00:00, 462.00it/s, avg. loss=0.185]


Epoch 2:


100%|██████████| 40/40 [00:00<00:00, 316.57it/s, avg. loss=0.113]


On test:


100%|██████████| 40/40 [00:00<00:00, 770.78it/s, avg. loss=0.0735]


              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98       395
         1.0       0.97      1.00      0.99       396

    accuracy                           0.98       791
   macro avg       0.99      0.98      0.98       791
weighted avg       0.99      0.98      0.98       791

Train IDEOLOGY
Epoch 0:


100%|██████████| 30/30 [00:00<00:00, 432.38it/s, avg. loss=0.417]


Epoch 1:


100%|██████████| 30/30 [00:00<00:00, 466.60it/s, avg. loss=0.145]


Epoch 2:


100%|██████████| 30/30 [00:00<00:00, 412.36it/s, avg. loss=0.0806]


On test:


100%|██████████| 30/30 [00:00<00:00, 731.94it/s, avg. loss=0.054]


              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       289
         1.0       0.99      0.99      0.99       307

    accuracy                           0.99       596
   macro avg       0.99      0.99      0.99       596
weighted avg       0.99      0.99      0.99       596

Train WORK_OF_ART
Epoch 0:


100%|██████████| 78/78 [00:00<00:00, 363.96it/s, avg. loss=0.292]


Epoch 1:


100%|██████████| 78/78 [00:00<00:00, 438.81it/s, avg. loss=0.108]


Epoch 2:


100%|██████████| 78/78 [00:00<00:00, 443.32it/s, avg. loss=0.0641]


On test:


100%|██████████| 78/78 [00:00<00:00, 526.65it/s, avg. loss=0.0333]


              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       782
         1.0       0.99      1.00      0.99       766

    accuracy                           0.99      1548
   macro avg       0.99      0.99      0.99      1548
weighted avg       0.99      0.99      0.99      1548

Train PRODUCT
Epoch 0:


100%|██████████| 33/33 [00:00<00:00, 438.17it/s, avg. loss=0.398]


Epoch 1:


100%|██████████| 33/33 [00:00<00:00, 440.52it/s, avg. loss=0.136]


Epoch 2:


100%|██████████| 33/33 [00:00<00:00, 456.17it/s, avg. loss=0.0775]


On test:


100%|██████████| 33/33 [00:00<00:00, 673.71it/s, avg. loss=0.065]


              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98       323
         1.0       0.99      0.98      0.98       322

    accuracy                           0.98       645
   macro avg       0.98      0.98      0.98       645
weighted avg       0.98      0.98      0.98       645

Train CRIME
Epoch 0:


100%|██████████| 57/57 [00:00<00:00, 408.79it/s, avg. loss=0.37] 


Epoch 1:


100%|██████████| 57/57 [00:00<00:00, 466.01it/s, avg. loss=0.185]


Epoch 2:


100%|██████████| 57/57 [00:00<00:00, 467.34it/s, avg. loss=0.129]


On test:


100%|██████████| 57/57 [00:00<00:00, 770.98it/s, avg. loss=0.0873]


              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97       571
         1.0       0.96      0.99      0.97       569

    accuracy                           0.97      1140
   macro avg       0.97      0.97      0.97      1140
weighted avg       0.97      0.97      0.97      1140

Train DISEASE
Epoch 0:


100%|██████████| 39/39 [00:00<00:00, 259.74it/s, avg. loss=0.289]


Epoch 1:


100%|██████████| 39/39 [00:00<00:00, 447.76it/s, avg. loss=0.0856]


Epoch 2:


100%|██████████| 39/39 [00:00<00:00, 439.87it/s, avg. loss=0.0415]


On test:


100%|██████████| 39/39 [00:00<00:00, 736.83it/s, avg. loss=0.0215]


              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       393
         1.0       0.99      1.00      1.00       372

    accuracy                           1.00       765
   macro avg       1.00      1.00      1.00       765
weighted avg       1.00      1.00      1.00       765

Train TIME
Epoch 0:


100%|██████████| 48/48 [00:00<00:00, 444.65it/s, avg. loss=0.265]


Epoch 1:


100%|██████████| 48/48 [00:00<00:00, 410.65it/s, avg. loss=0.0943]


Epoch 2:


100%|██████████| 48/48 [00:00<00:00, 333.23it/s, avg. loss=0.0551]


On test:


100%|██████████| 48/48 [00:00<00:00, 777.53it/s, avg. loss=0.0281]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       473
         1.0       1.00      1.00      1.00       471

    accuracy                           1.00       944
   macro avg       1.00      1.00      1.00       944
weighted avg       1.00      1.00      1.00       944

Train MONEY
Epoch 0:


100%|██████████| 47/47 [00:00<00:00, 444.37it/s, avg. loss=0.243]


Epoch 1:


100%|██████████| 47/47 [00:00<00:00, 293.84it/s, avg. loss=0.0692]


Epoch 2:


100%|██████████| 47/47 [00:00<00:00, 431.78it/s, avg. loss=0.036] 


On test:


100%|██████████| 47/47 [00:00<00:00, 773.74it/s, avg. loss=0.0188]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       463
         1.0       1.00      1.00      1.00       465

    accuracy                           1.00       928
   macro avg       1.00      1.00      1.00       928
weighted avg       1.00      1.00      1.00       928

Train DISTRICT
Epoch 0:


100%|██████████| 16/16 [00:00<00:00, 390.85it/s, avg. loss=0.497]


Epoch 1:


100%|██████████| 16/16 [00:00<00:00, 431.15it/s, avg. loss=0.172]


Epoch 2:


100%|██████████| 16/16 [00:00<00:00, 395.79it/s, avg. loss=0.0862]


On test:


100%|██████████| 16/16 [00:00<00:00, 581.71it/s, avg. loss=0.0541]


              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       148
         1.0       0.99      1.00      0.99       155

    accuracy                           0.99       303
   macro avg       0.99      0.99      0.99       303
weighted avg       0.99      0.99      0.99       303

Train PENALTY
Epoch 0:


100%|██████████| 18/18 [00:00<00:00, 416.23it/s, avg. loss=0.497]


Epoch 1:


100%|██████████| 18/18 [00:00<00:00, 415.25it/s, avg. loss=0.216]


Epoch 2:


100%|██████████| 18/18 [00:00<00:00, 405.65it/s, avg. loss=0.126]


On test:


100%|██████████| 18/18 [00:00<00:00, 742.08it/s, avg. loss=0.0853]


              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98       174
         1.0       0.97      1.00      0.98       178

    accuracy                           0.98       352
   macro avg       0.98      0.98      0.98       352
weighted avg       0.98      0.98      0.98       352

Train RELIGION
Epoch 0:


100%|██████████| 8/8 [00:00<00:00, 372.54it/s, avg. loss=0.556]


Epoch 1:


100%|██████████| 8/8 [00:00<00:00, 106.27it/s, avg. loss=0.246]


Epoch 2:


100%|██████████| 8/8 [00:00<00:00, 424.44it/s, avg. loss=0.108]


On test:


100%|██████████| 8/8 [00:00<00:00, 773.21it/s, avg. loss=0.0663]


              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        77
         1.0       1.00      0.99      0.99        71

    accuracy                           0.99       148
   macro avg       0.99      0.99      0.99       148
weighted avg       0.99      0.99      0.99       148

Train PERCENT
Epoch 0:


100%|██████████| 14/14 [00:00<00:00, 360.26it/s, avg. loss=0.421]


Epoch 1:


100%|██████████| 14/14 [00:00<00:00, 382.84it/s, avg. loss=0.084]


Epoch 2:


100%|██████████| 14/14 [00:00<00:00, 414.55it/s, avg. loss=0.0253]


On test:


100%|██████████| 14/14 [00:00<00:00, 730.47it/s, avg. loss=0.0136]


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       124
         1.0       1.00      1.00      1.00       137

    accuracy                           1.00       261
   macro avg       1.00      1.00      1.00       261
weighted avg       1.00      1.00      1.00       261

Train LANGUAGE
Epoch 0:


100%|██████████| 5/5 [00:00<00:00, 441.51it/s, avg. loss=0.648]


Epoch 1:


100%|██████████| 5/5 [00:00<00:00, 249.71it/s, avg. loss=0.385]


Epoch 2:


100%|██████████| 5/5 [00:00<00:00, 353.73it/s, avg. loss=0.227]


On test:


100%|██████████| 5/5 [00:00<00:00, 610.88it/s, avg. loss=0.158]


              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        48
         1.0       1.00      0.98      0.99        47

    accuracy                           0.99        95
   macro avg       0.99      0.99      0.99        95
weighted avg       0.99      0.99      0.99        95

Train FAMILY
Epoch 0:


100%|██████████| 5/5 [00:00<00:00, 327.57it/s, avg. loss=0.654]


Epoch 1:


100%|██████████| 5/5 [00:00<00:00, 318.68it/s, avg. loss=0.421]


Epoch 2:


100%|██████████| 5/5 [00:00<00:00, 233.51it/s, avg. loss=0.28]


On test:


100%|██████████| 5/5 [00:00<00:00, 523.91it/s, avg. loss=0.213]


              precision    recall  f1-score   support

         0.0       0.98      0.95      0.97        44
         1.0       0.95      0.98      0.97        43

    accuracy                           0.97        87
   macro avg       0.97      0.97      0.97        87
weighted avg       0.97      0.97      0.97        87



In [87]:
train_json.iloc[42].sentences

'Теракт в Иерусалиме: убиты двое полицейских\nПогибшие в результате теракта полицейские\nВ пятницу, 14 июля 2017 года, в Восточном Иерусалиме рядом с Храмовой горой произошёл теракт.\n\nУтром трое неизвестных неожиданно начали стрелять возле Львиных ворот Старого города, после чего попытались убежать и скрыться в одной из мечетей на Храмовой горе.\n\nРаненых полицейских и пограничника госпитализировали в больницу Адаса Хар-ха-Цофим (בית החולים הדסה הר הצופים). Однако двое из них в полдень умерли.\n\nПосле нападения вход на Храмовую гору был закрыт, а все посетители\xa0— эвакуированы.\n\nПосле 11:30 появились сообщения о том, что вооруженное нападение на Храмовой горе совершили трое жителей израильского арабского города Умм эль-Фахм.\n\nВсе трое, по документам, носили одно имя\xa0— Мухаммад Джабарин. Им было 30, 20 и 19 лет. Полные имена террористов: Мухаммад Ахмад Мухаммад Джабарин, Мухаммад Хамад Абд аль-Латиф Джабарин и Мухаммад Ахмад Мафаль Джабарин.\n\nДвое из преступников за день 

In [108]:
class Pipeline:
    def __init__(self, bert_model, bert_tokenizer, models) -> None:
        self.bert_model: BertModel = bert_model
        self.bert_tokenizer: BertTokenizer = bert_tokenizer
        self.models = models

    def forward(self, x: str):
        tokenized = self.bert_tokenizer.batch_encode_plus(
            [x], add_special_tokens=True, return_tensors="pt", truncation=True, max_length=512
        )
        # if tokenized["input_ids"]
        if tokenized["input_ids"].shape[1] > 512:
            print("POMOGITE")
        output = self.bert_model(
            tokenized["input_ids"], attention_mask=tokenized["attention_mask"]
        )
        word_embeddings = output.last_hidden_state[0]

        tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
        tokens_lengths = list(map(lambda x: len(x.replace("#", "")), tokens))
        tokens_lengths[0] = 0
        tokens_lengths[-1] = 0

        nested_ners = []

        for ent in self.models:
            labels = self.models[ent](word_embeddings)
            labels = labels[1:-1]  # remove [CLS] and [SEP]
            c_seq = labels > 0.5
            i = 0
            # print(ent)
            # print(labels)
            # print(c_seq)
            while i < len(c_seq):
                if c_seq[i] == 0:
                    i += 1
                    continue

                j = i + 1
                while j < len(c_seq):
                    if c_seq[j] != c_seq[i]:
                        break

                    j += 1

                if i + 1 == j:
                    i += 1
                    continue

                nested_ners.append((i, j - 1, ent))
                i = j

        return nested_ners


pipe = Pipeline(ds.bert_model, ds.tokenizer, models)
# to_test = train_json.iloc[42]
# to_test = submission_json.iloc[2].senences
# result = pipe.forward(to_test)

# for l_idx, r_idx, ent in result:
#     print(f"{ent}: ({l_idx}, {r_idx}) --> {to_test[l_idx: r_idx + 1]}")

In [141]:
submission_json = pd.read_json("../data/test/test.jsonl", lines=True)
submission_json.head()

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588


In [142]:
submission_json.shape

(65, 2)

In [143]:
submission_results = []
for i, (sent, id) in tqdm(submission_json.iterrows(), total=submission_json.shape[0]):
    results = pipe.forward(sent)
    submission_results.append({"id": id, "ners": results})
    

100%|██████████| 65/65 [00:24<00:00,  2.65it/s]


In [144]:
import json

with open("test.jsonl", "w") as f:
    for i in range(len(submission_results)):
        json.dump(submission_results[i], f)
        f.write("\n")
    

In [138]:
subm = pd.read_json("test.jsonl", lines=True)
subm

Unnamed: 0,id,ners
0,519,"[[1, 4, PERSON], [12, 14, PERSON], [39, 42, PE..."
1,520,"[[18, 21, PERSON], [36, 37, PERSON], [45, 48, ..."
2,521,"[[1, 2, PERSON], [9, 12, PERSON], [16, 17, PER..."
3,522,"[[26, 28, PERSON], [178, 180, PERSON], [266, 2..."
4,523,"[[4, 5, PERSON], [13, 14, PERSON], [21, 22, PE..."
...,...,...
60,579,"[[10, 12, PERSON], [30, 31, PERSON], [33, 34, ..."
61,580,"[[2, 3, PERSON], [12, 13, PERSON], [19, 21, PE..."
62,581,"[[9, 11, PERSON], [13, 14, PERSON], [16, 17, P..."
63,582,"[[4, 9, PERSON], [37, 39, PERSON], [47, 49, PE..."


In [139]:
test_json = pd.read_json("/home/konstfed/Documents/Study/NLP_course/NLP_assignments/assignment3/data/test/test.jsonl", lines=True)
test_json

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588
...,...,...
60,ОБСЕ назвала референдум о статусе Крыма незако...,644
61,Египетского студента могут выслать из страны з...,645
62,Геннадий Онищенко отправлен в отставку\nГеннад...,646
63,Племянник Алишера Усманова разбился в ДТП\nВид...,647
