In [1]:
import pandas as pd
import numpy as np
import os, sys

import transformers
import torch
import csv

from tqdm import tqdm 

from transformers import AutoTokenizer, AutoModel

In [2]:
torch.cuda.is_available()

True

In [3]:
from sklearn.metrics import f1_score, precision_score, recall_score

def symbol_wize(y_true, y_pred):
    y_true1, y_pred1 = y_true.split(), y_pred.split()
    y_true, y_pred = set(), set()
    
    eps=1e-7
    for i in y_true1:
        y_true.update(set(range(int(i.split(':')[0]), int(i.split(':')[1]))))
    for i in y_pred1:
        y_pred.update(set(range(int(i.split(':')[0]), int(i.split(':')[1]))))
    
    true_pos = y_true.intersection(y_pred)
    false_neg = y_true.difference(y_pred)
    false_pos = y_pred.difference(y_true)
    
    precision = (len(true_pos)+eps)/(len(true_pos) + len(false_pos)+eps)
    recall = (len(true_pos)+eps)/(len(true_pos) + len(false_neg)+eps)
    
    f1_score = 2*(precision*recall)/(precision + recall + 1e-7)
    return f1_score

def get_rank(true_class, pred_class, true_span, pred_span):
    if true_class == pred_class == 1:
        return symbol_wize(true_span, pred_span)
    elif true_class == pred_class == 0:
        return 1
    else:
        return 0

def gapping_metrics(true_data, pred_data, only_class=False):
    f1_class = f1_score(true_data['class'].values, pred_data['class'].values)
    
    f1_symbolwise_score = 0
    if not only_class:
        f1_symbolwise_scores = []
        for tag in ['cV', 'V', 'cR1', 'cR2', 'R1', 'R2']:
            f1_symbolwise_scores += [get_rank(true_data.iloc[i]['class'], pred_data.iloc[i]['class'], true_data.iloc[i][tag], pred_data.iloc[i][tag]) for i in range(len(true_data))]
        f1_symbolwise_score = np.mean(f1_symbolwise_scores)
    
    return {'f1_score': f1_class, 'f1_symbolwise_score': f1_symbolwise_score}


# Data

In [4]:
train_df = pd.read_csv(os.path.join('data', 'train', 'train.csv'), sep="\t", quoting=csv.QUOTE_NONE)
train_dict = train_df.to_dict("records")

In [5]:
test_df = pd.read_csv(os.path.join('data', 'test', 'test_gold_standard.csv'), sep="\t", quoting=csv.QUOTE_NONE)
test_dict = test_df.to_dict("records")

In [8]:
test_df

Unnamed: 0,text,class,cV,cR1,cR2,V,R1,R2
0,Изобретение относится к судостроению и касаетс...,0,,,,,,
1,"Эти состояния называют фазами воды, а превраще...",1,14:22,0:13,23:34,81:81,38:78,81:100
2,И должен ни единой долькой Не отступаться от ...,0,,,,,,
3,Он потребовал обеспечить полное осуществление ...,0,,,,,,
4,"По мнению местного пастора Элла Эбанкса, запре...",0,,,,,,
...,...,...,...,...,...,...,...,...
2040,"Кто-то ходит в кино с девушкой, а кто-то со ст...",1,7:12,0:6,20:30,41:41,34:40,41:53
2041,Восстановление показателей банка будет идти бо...,0,,,,,,
2042,"Вы кое-что смыслите, в лабораторной технике; в...",0,,,,,,
2043,"Если готовка не ваш конек, то вас выручить наш...",0,,,,,,


In [88]:
NAME_MODEL = 'sberbank-ai/ruRoberta-large'

In [89]:
tokenizer = AutoTokenizer.from_pretrained(NAME_MODEL,
                                                    truncation=True,
                                                    padding=True)

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [90]:
TAGS = ["[NONE]", "cV", "cR1", "cR2", "R1", "R2"]
TAG2ID = {v: k for k, v in enumerate(TAGS)}

GAPS = ["[NONE]", "V"]
GAP2ID = {v: k for k, v in enumerate(GAPS)}

In [100]:
def make_text_data(train_dict):
    text_data = []

    for sample in tqdm(train_dict):
        text = sample['text']
        text = text.replace("—", "-")

        tokenizer_out = tokenizer(text, padding="max_length", truncation=True, max_length=128)
        word_ids = tokenizer_out.word_ids()
        tokens_ids = tokenizer_out.input_ids
        tokens = tokenizer_out.tokens()
        #print(tokens)

        tokens_borders = []
        for i in range(0, len(tokens_ids)):
            #print(tokens[i], tokens_ids[i])
            if tokens_ids[i] == 0 or tokens[i] == '[SEP]' or tokens[i] == '[CLS]' or tokens[i] == '<s>' or tokens[i] == '</s>':
                tokens_borders.append([-1, -1])
            else:
                token_border = tokenizer_out.token_to_chars(i)
                tokens_borders.append([token_border.start, token_border.end])

        tags_borders = []
        for tag in TAGS[1:]:
            if not pd.isna(sample[tag]):
                for border in sample[tag].split(" "):
                    left, right = list(map(int, border.split(":")))
                    tags_borders.append((tag, left, right))

        tags = []
        for token_left, token_right in tokens_borders:

            if token_left == -1 and token_right == -1:
                tags.append(TAGS[0])
                continue

            flag = False
            for tag, tag_left, tag_right in tags_borders:
                if tag_left <= token_left and token_right <= tag_right:
                    tags.append(tag)
                    flag = True

            if not flag:
                tags.append(TAGS[0])   

        gap_index = []
        if not pd.isna(sample["V"]):
            for borders in sample["V"].split(" "):
                left, right = list(map(int, borders.split(":")))
                gap_index.append(left)

        gaps = [TAGS[0]] * len(tokens_ids)
        for tag_left in gap_index:
            flag = False
            for i, (left, right) in enumerate(tokens_borders):
                if tag_left == left and not flag:
                    gaps[i] = "V"
                    flag = True

        #tags = [TAGS[0]] + tags + [TAGS[0]]
        #gaps = [TAGS[0]] + gaps + [TAGS[0]]
        #tokens_borders = [[-1, -1]] + tokens_borders + [[-1, -1]]
        tags_ids = [TAG2ID[tag] for tag in tags]
        gaps_ids = [GAP2ID[gap] for gap in gaps]
        attention_mask = tokenizer_out['attention_mask']
        label = int(sample['class'])

        text_data.append({
            'tokens_ids': tokens_ids,
            'attention_mask': attention_mask,
            'tags_ids': tags_ids,
            'gaps_ids': gaps_ids,
            'label': label,
            'tokens_borders': tokens_borders
        })
    
    return text_data

In [101]:
class ARRGDataset(torch.utils.data.Dataset):
    def __init__(self, text_data):
        self.text_data = text_data
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val, dtype=torch.long) for key, val in self.text_data[idx].items()}
        return item

    def __len__(self):
        return len(self.text_data)

In [102]:
train_data = make_text_data(train_dict)
arrg_dataset_train = ARRGDataset(train_data)

100%|██████████| 16406/16406 [00:11<00:00, 1483.17it/s]


In [103]:
test_data = make_text_data(test_dict)
arrg_dataset_test = ARRGDataset(test_data)

100%|██████████| 2045/2045 [00:00<00:00, 2150.88it/s]


In [98]:
arrg_dataset_test[0]

{'tokens_ids': tensor([   101,  19597,  33276,  40035,  16241,  34870,    551,  36681,  33580,
          76688,  22325,    549,  56280,  12016,  14317,  32243,  46672,  90068,
          32875,  91258,  53204,  63596,  10227,  94826,  86079,    117,  14028,
            570,  18705,  11078,  16111,  16583,  84190,  10625,  10913,  30148,
          23879,  35459,    543,  11613,  15755,  12861,  10122,  63596,  45803,
          20265,    549, 105805,  83013,  30148,  57935,  19364,  10970,  10439,
          10517,  10267,    549,    556,  29952,  83856,  10353,    552,  60545,
          10433,    117,  58742,  36069,  10385,  13686,  53204,  10990,  32532,
          35131,    119,    102,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

# Model

In [12]:
from transformers import BertTokenizerFast, BertModel, BertConfig, AutoConfig, AutoModel, DistilBertConfig
from transformers import BertTokenizerFast, BertForTokenClassification, BertForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [109]:
import torch.nn as nn

class BertAgrrModel(transformers.PreTrainedModel):

    def __init__(self, name='DeepPavlov/rubert-base-cased-sentence'):
        super(BertAgrrModel, self).__init__(config=AutoConfig.from_pretrained(name, output_last_hidden_state=True))
        #config = BertConfig.from_pretrained("distilbert-base-uncased", output_last_hidden_state=True)
        self.bert = AutoModel.from_pretrained(name)
        
        self.dropout = nn.Dropout(0.1)
        self.sentence_classifier = nn.Linear(1024, 1)
        self.full_annotation_classifier = nn.Linear(1024, 6)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask)
        
        sequence_output = output.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        #print(sequence_output[:, :1, :].size())
        pooled_output = sequence_output[:, 0, :]
        #print(output.last_hidden_state.size(), pooled_output.size())
        
        #sequence_output, pooled_output = self.dropout(sequence_output), self.dropout(pooled_output)
        #print(sequence_output[0])
        
        sentence_logits = self.sentence_classifier(pooled_output)
        full_annotation_logits = self.full_annotation_classifier(sequence_output)
        sentence_probs = torch.sigmoid(sentence_logits)
        full_annotation_probs = torch.nn.functional.softmax(full_annotation_logits, dim=2)
        
        return {
            'sentence_logits': sentence_logits,
            'full_annotation_logits': full_annotation_logits,
            'sentence_probs': sentence_probs,
            'full_annotation_probs': full_annotation_probs
        }

In [157]:
out = model(arrg_dataset_train[1]['tokens_ids'], arrg_dataset_train[1]['attention_mask'])

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [158]:
inputs_tmp = tokenizer("Hello, my dog is cute", return_tensors="pt")
inputs_tmp

{'input_ids': tensor([[  101, 31178,   117, 15127, 17835, 10124, 21610, 10112,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# Train

In [20]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [110]:
#from transformers import AdamW
from torch.optim import AdamW
from transformers import get_scheduler

In [111]:
EPOCH = 10
device = 'cuda:4'

In [112]:
model = BertAgrrModel(NAME_MODEL).to(device)

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

In [113]:
from torch.utils.data import Dataset, DataLoader
train_dataloader = DataLoader(arrg_dataset_train, shuffle=True, batch_size=24)
test_dataloader = DataLoader(arrg_dataset_test, shuffle=False, batch_size=4)

In [114]:
criterion = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(), lr=1e-5)
#num_training_steps = EPOCH * len(train_dataloader)

In [115]:
f1_scores = []
all_loss = []

model.train()
for epoch in range(EPOCH):
    num_batches = 0
    losses = []
    
    model.eval()
    print('EPOCH: {} Starting eval...'.format(epoch+1))
    pred_labeles = []
    logits = []
    for batch in tqdm(test_dataloader):
        token_ids = batch['tokens_ids']
        mask = batch['attention_mask']
        labels = batch['label']
        tags = batch['tags_ids']
        #print(token_ids, mask)
        out = model(token_ids.to(device), mask.to(device))

        probs = out['sentence_probs'].detach().to('cpu').numpy().flatten()
        logits += list(probs)
        probs[probs >= 0.5] = 1
        probs[probs < 0.5] = 0
        pred_labeles += list(probs)

    pred_df = pd.DataFrame({
        'class': pred_labeles,
        'cV': [0]*len(pred_labeles),
        'cR1': [0]*len(pred_labeles),
        'cR2': [0]*len(pred_labeles),
        'R1': [0]*len(pred_labeles),
        'R2': [0]*len(pred_labeles)
    })
    #print(logits)
    #print(pred_df)
    #print(test_df)

    metrics = gapping_metrics(test_df, pred_df, only_class=True)
    print('Eval metrics:', metrics)
    f1_scores.append(metrics)

    model.train()
    
    for i, batch in enumerate(train_dataloader):
        token_ids = batch['tokens_ids']
        mask = batch['attention_mask']
        labels = batch['label']
        tags = batch['tags_ids']
        
        labels = labels.type(torch.float32).to(device)
        out = model(token_ids.to(device), mask.to(device))
        
        #print(torch.squeeze(out['sentence_logits']))
        loss = criterion(torch.squeeze(out['sentence_probs']), labels)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad()
        
        if (i+1) % 64 == 0:
            print('Step: {}, Loss: {}'.format(i+1, np.mean(losses)))
            all_loss.append(np.mean(losses))
            losses = []

EPOCH: 1 Starting eval...


100%|██████████| 512/512 [00:13<00:00, 38.75it/s]


Eval metrics: {'f1_score': 0.39740119112073635, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.43169857980683446
Step: 128, Loss: 0.19843737816700013
Step: 192, Loss: 0.11548110321746208
Step: 256, Loss: 0.08184751464432338
Step: 320, Loss: 0.07952318972820649
Step: 384, Loss: 0.07970733606998692
Step: 448, Loss: 0.07972718431119574
Step: 512, Loss: 0.07541884650709108
Step: 576, Loss: 0.06741264771699207
Step: 640, Loss: 0.07584255089750513
EPOCH: 2 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.39it/s]


Eval metrics: {'f1_score': 0.9480337078651686, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.047294343981775455
Step: 128, Loss: 0.03343502437655843
Step: 192, Loss: 0.03488314957030525
Step: 256, Loss: 0.04183571723115165
Step: 320, Loss: 0.03297141372695478
Step: 384, Loss: 0.032480388439580565
Step: 448, Loss: 0.032637910577250295
Step: 512, Loss: 0.04767977667142986
Step: 576, Loss: 0.0509710059104691
Step: 640, Loss: 0.04645107939722948
EPOCH: 3 Starting eval...


100%|██████████| 512/512 [00:13<00:00, 38.58it/s]


Eval metrics: {'f1_score': 0.9740932642487046, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.018530450896832917
Step: 128, Loss: 0.024552059210691368
Step: 192, Loss: 0.022919905437447596
Step: 256, Loss: 0.02443382360797841
Step: 320, Loss: 0.018102619632372807
Step: 384, Loss: 0.015974085863490473
Step: 448, Loss: 0.03705477924995648
Step: 512, Loss: 0.019514679586791317
Step: 576, Loss: 0.015781521479766525
Step: 640, Loss: 0.020294165648010676
EPOCH: 4 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.78it/s]


Eval metrics: {'f1_score': 0.9794117647058823, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.00872164196380254
Step: 128, Loss: 0.017134509334027825
Step: 192, Loss: 0.01742696803421495
Step: 256, Loss: 0.0109619172424118
Step: 320, Loss: 0.018577032934899762
Step: 384, Loss: 0.013888313113966433
Step: 448, Loss: 0.018743955640729837
Step: 512, Loss: 0.02044701982413244
Step: 576, Loss: 0.013996967319599207
Step: 640, Loss: 0.02046492751514961
EPOCH: 5 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.63it/s]


Eval metrics: {'f1_score': 0.9539333805811482, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.010591029603801871
Step: 128, Loss: 0.012947628257506949
Step: 192, Loss: 0.014381381871203303
Step: 256, Loss: 0.005677099461763646
Step: 320, Loss: 0.015851959404471927
Step: 384, Loss: 0.003235668186334806
Step: 448, Loss: 0.01884469546780565
Step: 512, Loss: 0.016279509540254367
Step: 576, Loss: 0.021050382685189106
Step: 640, Loss: 0.02316238005141713
EPOCH: 6 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.73it/s]


Eval metrics: {'f1_score': 0.9748520710059171, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.004882376497675978
Step: 128, Loss: 0.010391893971245736
Step: 192, Loss: 0.002287986666942743
Step: 256, Loss: 0.009528517511512291
Step: 320, Loss: 0.021787045401879368
Step: 384, Loss: 0.010240421092021279
Step: 448, Loss: 0.012304847286713994
Step: 512, Loss: 0.010450490365087717
Step: 576, Loss: 0.016165025238137787
Step: 640, Loss: 0.010709836605201417
EPOCH: 7 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.83it/s]


Eval metrics: {'f1_score': 0.9772893772893773, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.0015368571311000778
Step: 128, Loss: 0.008778041982054674
Step: 192, Loss: 0.00596420283579846
Step: 256, Loss: 0.010154633904335242
Step: 320, Loss: 0.0023224200679123896
Step: 384, Loss: 0.011636080278265126
Step: 448, Loss: 0.007389838272729321
Step: 512, Loss: 0.024929474849272992
Step: 576, Loss: 0.023473969465158007
Step: 640, Loss: 0.015975149884980056
EPOCH: 8 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.72it/s]


Eval metrics: {'f1_score': 0.9808541973490426, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.0077903881995098345
Step: 128, Loss: 0.009944026243829285
Step: 192, Loss: 0.011886466596706668
Step: 256, Loss: 0.027510089993484144
Step: 320, Loss: 0.03388034669603712
Step: 384, Loss: 0.22922325442596048
Step: 448, Loss: 0.7217311272397637
Step: 512, Loss: 0.6412523603066802
Step: 576, Loss: 0.6585872732102871
Step: 640, Loss: 0.6317782048135996
EPOCH: 9 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.55it/s]


Eval metrics: {'f1_score': 0.8474576271186441, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.1581112303683767
Step: 128, Loss: 0.09823296128888614
Step: 192, Loss: 0.041417436754272785
Step: 256, Loss: 0.03580755866460095
Step: 320, Loss: 0.029602047197840875
Step: 384, Loss: 0.017332134325442894
Step: 448, Loss: 0.03483289979385518
Step: 512, Loss: 0.17107464175205678
Step: 576, Loss: 0.280030480469577
Step: 640, Loss: 0.14356238367327023
EPOCH: 10 Starting eval...


100%|██████████| 512/512 [00:09<00:00, 52.59it/s]


Eval metrics: {'f1_score': 0.9774872912127814, 'f1_symbolwise_score': 0}
Step: 64, Loss: 0.01685074011038523
Step: 128, Loss: 0.01614201113989111
Step: 192, Loss: 0.018197376672105747
Step: 256, Loss: 0.034175646868789045
Step: 320, Loss: 0.03508676506498887
Step: 384, Loss: 0.024866544133146817
Step: 448, Loss: 0.023352856333076488
Step: 512, Loss: 0.017216839281445573
Step: 576, Loss: 0.014935805354070908
Step: 640, Loss: 0.016104615825042856


In [53]:
#all_metrics = {}

In [116]:
all_metrics[NAME_MODEL] = {'loss': all_loss, 'scores': f1_scores}

In [117]:
all_metrics.keys()

dict_keys(['distilbert-base-multilingual-cased', 'DeepPavlov/rubert-base-cased-sentence', 'bert-base-multilingual-uncased', 'sberbank-ai/ruRoberta-large'])

In [118]:
import pickle
with open('all_metrics.pickle', 'wb') as f:
    pickle.dump(all_metrics, f)

In [34]:
metrics['deeppavlov_rubert'] = {'loss': all_loss, 'scores': f1_scores}

In [35]:
gapping_metrics(pred_df, test_df)

AttributeError: 'numpy.float64' object has no attribute 'split'

In [168]:
pred_df

Unnamed: 0,class,cV,cR1,cR2,R1,R2
0,0.238321,0,0,0,0,0
1,0.238321,0,0,0,0,0
2,0.238321,0,0,0,0,0
3,0.238321,0,0,0,0,0
4,0.238321,0,0,0,0,0
...,...,...,...,...,...,...
2040,0.238321,0,0,0,0,0
2041,0.238321,0,0,0,0,0
2042,0.238321,0,0,0,0,0
2043,0.238321,0,0,0,0,0


In [119]:
!git clone https://github.com/AlexeySorokin/Gapping.git

Cloning into 'Gapping'...
remote: Enumerating objects: 115, done.[K
remote: Total 115 (delta 0), reused 0 (delta 0), pack-reused 115[K
Receiving objects: 100% (115/115), 13.20 MiB | 11.25 MiB/s, done.
Resolving deltas: 100% (1/1), done.
