<a href="https://colab.research.google.com/github/IlyaGusev/NewsCausation/blob/main/BertCause.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [1]:
!pip install --upgrade transformers bertviz checklist

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.2 MB/s 
[?25hCollecting bertviz
  Downloading bertviz-1.2.0-py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 23.3 MB/s 
[?25hCollecting checklist
  Downloading checklist-0.0.11.tar.gz (12.1 MB)
[K     |████████████████████████████████| 12.1 MB 16.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.6 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |

# Data loading

In [2]:
!rm -rf ru_news_cause_v1.tsv*
!wget https://www.dropbox.com/s/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz
!tar -xzvf ru_news_cause_v1.tsv.tar.gz

--2021-07-31 23:57:42--  https://www.dropbox.com/s/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.18, 2620:100:6032:18::a27d:5212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz [following]
--2021-07-31 23:57:43--  https://www.dropbox.com/s/raw/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc9d2ba60d1aff86b827926cbce6.dl.dropboxusercontent.com/cd/0/inline/BTXhT6QRuuZtqo6wcYhiMKL_zVTjEHDOfEzWTbLg1V8T28hrkrE5Af5dpRPRygTxqtGautzNBDizxfQfW3OEfotBwrK5LDfE3dozC65ZAMVRLUX0NN40nk3EifF1lesVN3ivM8vjHulBFcwAE_uSGr1s/file# [following]
--2021-07-31 23:57:43--  https://uc9d2ba60d1aff86b827926cbce6.dl.dropboxusercontent.com/cd/0/inline/BTXhT6QRuuZtqo6wcYhiMKL_zVTjEHDOfEzWTbLg1V

In [3]:
!cat ru_news_cause_v1.tsv | wc -l
!head ru_news_cause_v1.tsv

3427
id	left_title	right_title	left_url	right_url	left_timestamp	right_timestamp	confidence	result	overlap
tg_98424	Разработчики TON запустили блокчейн-платформу без Дурова. Что всё это значит?	Блокчейн-платформу TON запустили без Павла Дурова и Telegram	https://hightech.fm/2020/05/08/free-ton-durov	https://www.rbc.ru/technology_and_media/07/05/2020/5eb3f1429a79470728554a3f	1588924784	1588843504	1.0	same	10
tg_97812	Путин и Нетаньяху провели телефонные переговоры	Телефонный разговор с Премьер-министром Израиля Биньямином Нетаньяху	https://russian.rt.com/world/news/744834-putin-netanyahu-peregovory	http://kremlin.ru/events/president/news/63317	1588922040	1588931100	1.0	same	10
tg_96412	Минтранс предложил платить авиакомпаниям по 365 рублей за каждого неперевезенного пассажира	Авиакомпаниям хотят выплатить по 365 рублей за каждого неперевезенного пассажира	https://www.kommersant.ru/doc/4340719	https://fedpress.ru/news/77/economy/2493196	1588911069	1588914420	1.0	same	10
tg_96271	В Татарс

# BertCause

## Training

In [4]:
import random
import torch
import numpy as np
import os

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(1337)

In [5]:
import csv

records = []
with open("ru_news_cause_v1.tsv", "r") as r:
    reader = csv.reader(r, delimiter="\t")
    header = next(reader)
    for row in reader:
        r = dict(zip(header, row))
        if float(r["confidence"]) < 0.69:
            continue
        result = r["result"]
        mapping = {
            "bad": 0,
            "rel": 0,
            "same": 0,
            "left_right_cause": 1,
            "left_right_cancel": 1,
            "right_left_cause": 2,
            "right_left_cancel": 2
        }
        if result not in mapping:
            continue
        r["label"] = mapping[result]
        records.append(r)

In [6]:
from collections import Counter

labels_counter = Counter([r["label"] for r in records])
labels_count = len(labels_counter)
labels_counter

Counter({0: 1957, 1: 706, 2: 763})

In [7]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NewsPairsDataset(Dataset):
    def __init__(self, records, max_tokens, model_name, labels_count):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=False
        )
        self.max_tokens = max_tokens
        self.records = records
        self.labels_count = labels_count
    
    def __len__(self):
        return len(self.records)
    
    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["left_title"],
            text_pair=record["right_title"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        label = record.get("label", None)
        if label is not None:
            output["labels"] = torch.tensor(label)
        return output

In [8]:
from collections import defaultdict

records_by_source = defaultdict(list)
for r in records:
    source = r["id"].split("_")[0]
    records_by_source[source].append(r)

train_records, val_records, test_records = [], [], []
for _, source_records in records_by_source.items():
    source_records.sort(key=lambda x: min(x["left_timestamp"], x["right_timestamp"]))
    val_border = int(0.8 * len(source_records))
    test_border = int(0.9 * len(source_records))
    train_records.extend(source_records[:val_border])
    val_records.extend(source_records[val_border:test_border])
    test_records.extend(source_records[test_border:])

print(len(train_records))
print(len(val_records))
print(len(test_records))

2740
343
343


In [9]:
MODEL_NAME = "DeepPavlov/rubert-base-cased"
TOKENIZER_NAME = MODEL_NAME
MAX_TOKENS = 80

In [10]:
from torch.utils.data import DataLoader, RandomSampler

train_data = NewsPairsDataset(train_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
val_data = NewsPairsDataset(val_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




In [11]:
for item in train_data:
    print(item)
    break

{'input_ids': tensor([  101, 35278, 10706, 11346,  1469,  9495,  3468, 60663,   851, 14245,
          869, 49010,   102,   815, 19843, 17514, 22415, 47023,  9551, 11031,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=labels_count)
model = model.to("cuda")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
!rm -rf checkpoints

In [14]:
from transformers import Trainer, TrainingArguments

EPOCHS = 8
EVAL_STEPS = 10
WARMUP_STEPS = 5
LR = 3e-05
BATCH_SIZE = 128
GRAD_ACCUM_STEPS = 1

training_args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()

***** Running training *****
  Num examples = 2740
  Num Epochs = 8
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 176


Step,Training Loss,Validation Loss
10,1.0243,0.963591
20,0.9448,0.833519
30,0.7822,0.642251
40,0.6489,0.582231
50,0.5298,0.466556
60,0.4337,0.420204
70,0.3661,0.371622
80,0.3281,0.381855
90,0.2526,0.338442
100,0.2213,0.330314


***** Running Evaluation *****
  Num examples = 343
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-10
Configuration saved in checkpoints/checkpoint-10/config.json
Model weights saved in checkpoints/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 343
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-20
Configuration saved in checkpoints/checkpoint-20/config.json
Model weights saved in checkpoints/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 343
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-30
Configuration saved in checkpoints/checkpoint-30/config.json
Model weights saved in checkpoints/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 343
  Batch size = 128
Saving model checkpoint to checkpoints/checkpoint-40
Configuration saved in checkpoints/checkpoint-40/config.json
Model weights saved in checkpoints/checkpoint-40/pytorch_mo

TrainOutput(global_step=176, training_loss=0.37207126515832817, metrics={'train_runtime': 452.1689, 'train_samples_per_second': 48.477, 'train_steps_per_second': 0.389, 'total_flos': 901163455718400.0, 'train_loss': 0.37207126515832817, 'epoch': 8.0})

## Inference

In [15]:
import numpy as np

test_data = NewsPairsDataset(test_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
y_true = [item["labels"].item() for item in test_data]
y_true = np.array(y_true, dtype=np.int32)
print(y_true)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

[0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 2 0 0 0 0 0 1 2 0 1 1
 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 2 1 2 2 2 0 1 0 0 0 0 0 0 1 1 1 0
 0 1 0 1 0 1 0 0 0 2 2 0 0 0 0 0 1 0 2 0 0 1 1 1 0 1 2 0 1 0 1 1 2 0 1 1 0
 2 2 1 1 1 2 1 1 0 1 0 1 0 0 2 0 2 1 2 0 1 2 2 2 0 0 2 2 2 0 0 0 2 1 0 0 2
 2 0 1 0 0 2 0 1 0 1 0 0 0 1 1 0 0 0 0 2 0 1 2 0 2 2 0 1 2 1 1 1 1 0 1 0 1
 0 0 0 1 0 1 0 1 0 0 2 1 1 0 2 0 1 2 2 2 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 1 1
 2 1 0 2 0 2 1 0 2 2 0 0 0 1 0 0 0 1 1 0 2 1 1 1 1 2 1 1 1 1 1 0 1 1 2 0 0
 1 0 2 1 1 2 0 1 0 0 1 2 1 1 0 1 1 2 2 0 0 0 1 2 2 1 2 0 0 1 1 0 1 2 0 1 1
 2 2 1 0 1 0 2 0 0 1]


In [16]:
y_pred = []
with torch.no_grad():
    for item in test_data:
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = model(**item, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        y_pred.append(pred)
y_pred = np.array(y_pred)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 2 1 0 0 0 0 1 0 0 0 0 2 1 1 1 1 1 0 0 0 1 2 0 0 0 0 0 1 0 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 2 2 2 2 1 0 0 0 0 0 2 1 1 1 0
 0 0 0 1 0 0 1 0 0 2 0 2 0 0 0 0 1 0 2 0 0 1 1 0 0 1 2 0 0 0 1 0 0 0 1 0 0
 2 2 1 1 1 0 1 1 2 0 1 0 0 0 2 0 2 0 2 0 0 2 2 2 0 0 2 2 2 0 2 0 0 1 0 0 2
 2 0 1 1 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 1 2 0 2 2 0 0 2 1 1 0 0 0 0 0 0
 0 0 0 1 0 1 0 0 0 0 2 0 1 0 2 0 1 2 2 2 0 0 0 0 0 1 2 0 0 0 0 0 2 0 0 1 1
 2 1 0 0 2 2 0 0 2 2 0 0 0 1 0 0 0 1 0 0 2 1 1 0 1 0 1 1 1 1 0 0 2 1 0 0 0
 1 0 2 1 1 2 0 0 1 0 1 2 1 1 0 1 1 2 2 0 0 0 1 2 2 1 0 0 0 1 1 0 1 2 0 1 1
 2 2 1 0 1 0 0 0 2 1]


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       186
           1       0.87      0.72      0.79        96
           2       0.79      0.79      0.79        61

    accuracy                           0.82       343
   macro avg       0.82      0.80      0.81       343
weighted avg       0.82      0.82      0.82       343



## Interpretation

### Errors

In [18]:
for i, r in enumerate(test_records):
    mapping = {
        0: "not_cause",
        1: "left_right",
        2: "right_left"
    }
    if y_true[i] != y_pred[i]:
        print("LEFT:", r["left_title"])
        print("RIGHT:", r["right_title"])
        print("LABELS: true:{}, pred:{}".format(mapping[y_true[i]], mapping[y_pred[i]]))
        print()

LEFT: Жительница Бурятии, возможно, повторно заболела коронавирусом
RIGHT: В Бурятии не подтвердился случай повторного заболевания коронавирусом
LABELS: true:left_right, pred:not_cause

LEFT: Заболели десятки детей: все подробности вспышки коронавируса в интернате Киева
RIGHT: Более 80 человек зараженных: что известно о вспышке коронавируса в интернате Киева
LABELS: true:not_cause, pred:right_left

LEFT: Агата Муцениеце подала на развод с Павлом Прилучным
RIGHT: Заявившая о побоях Муцениеце официально подала на развод с Прилучным
LABELS: true:not_cause, pred:left_right

LEFT: В «Роскосмосе» отреагировали на планы Трампа по добыче ископаемых на Луне
RIGHT: «Роскосмос» приготовился к переговорам с США по Луне
LABELS: true:not_cause, pred:right_left

LEFT: Зеленский поставил Саакашвили две задачи
RIGHT: Саакашвили назвал задачи от Зеленского
LABELS: true:not_cause, pred:left_right

LEFT: Иностранные спортсмены и тренеры могут возвращаться в Россию
RIGHT: Иностранным спортсменам и тренерам

### BertViz

In [19]:
from bertviz import head_view
with torch.no_grad():
    for r in test_records:
        if r["label"] == 2:
            inputs = test_data.tokenizer.encode_plus(r["left_title"], r["right_title"], return_tensors='pt', add_special_tokens=True)
            token_type_ids = inputs['token_type_ids'].cuda()
            input_ids = inputs['input_ids'].cuda()
            outputs = model(input_ids, token_type_ids=token_type_ids, return_dict=True, output_attentions=True)
            attention = outputs.attentions
            tokens = test_data.tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
            head_view(attention, tokens)
            break

<IPython.core.display.Javascript object>

### Checklist

In [20]:
%%writefile checklist.json
{
    "lexicons": {
        "famous_male_last_name": ["Путин", "Песков", "Меладзе", "Мясников", "Макрон", "Порошенко", "Зеленский", "Медведев", "Алибасов", "Трамп", "Байден"],
        "location_city": ["в Москве", "в Самаре", "в Париже", "в Дзержинске", "во Владимире", "в Стамбуле", "в Санкт-Петербурге", "в Сочи", "в Чикаго", "в Косово", "в Токио"],
        "location_country": ["в России", "во Франции", "в США", "в Казахстане", "в Японии", "в Германии", "в Китае", "в Украине", "в Великобритании", "в Испании", "в РФ"],
        "past_male_tell_verb": ["сообщил", "рассказал", "заявил"],
        "present_male_refute_verb": ["опроверг", "отрицает"],
        "log_number": ["1", "2", "5", "10", "20", "50", "100", "1000"],
        "local_bad_event_gent": ["пожара", "взрыва", "ДТП", "аварии", "задержания террористов"],
        "local_bad_event_loct": ["пожаре", "взрыве", "ДТП", "аварии", "задержании террористов"],
        "global_bad_event_loct": ["вводе военного положения", "подорожании продуктов"],
        "bad_event_loct": ["пожаре", "взрыве", "ДТП", "аварии", "задержании террористов", "вводе военного положения", "подорожании продуктов"],
        "bad_reason": ["из-за коронавируса", "из-за гриппа", "из-за погоды", "из-за проблем", "из-за войны", "из-за жары", "из-за болезни", "из-за Путина", "из-за Китая"],
        "regulation": ["карантин", "комендантский час", "ограничения", "запрет"],
        "date_future_duration": ["до 2023 года", "до 1 марта 2026 года", "до 31 декабря"],
        "date_future_year": ["в 2021 году", "в следующем году"],
        "date_future": ["до 2023 года", "до 1 марта 2026 года", "до 31 декабря", "в 2021 году", "в следующем году"]
    },
    "diffs": {
        "refutation": {
            "left_right": {
                "right_template": "{famous_male_last_name1} {present_male_refute_verb} {news} о {bad_event_loct} {location_city}",
                "left_template": "{famous_male_last_name2} {past_male_tell_verb} о {bad_event_loct} {location_city}",
                "label": 1,
                "custom_params": {
                    "news": ["информацию", "новость", "сообщение"]
                }
            },
            "right_left": {
                "left_template": "{famous_male_last_name1} {present_male_refute_verb} {news} о {bad_event_loct} {location_city}",
                "right_template": "{famous_male_last_name2} {past_male_tell_verb} о {bad_event_loct} {location_city}",
                "label": 2,
                "custom_params": {
                    "news": ["информацию", "новость", "сообщение"]
                }
            }
        },
        "expert": {
            "same_left_right": {
                "left_template": "{location_city} {future_introduce} {regulation}",
                "right_template": "{expert}: {location_city} {future_introduce} {regulation}",
                "label": 0,
                "custom_params": {
                    "expert": ["эксперт", "власти", "кремль"],
                    "future_introduce": ["введут", "установят"]
                }
            },
            "same_right_left": {
                "left_template": "{expert}: {location_city} {future_introduce} {regulation}",
                "right_template": "{location_city} {future_introduce} {regulation}",
                "label": 0,
                "custom_params": {
                    "expert": ["эксперт", "власти", "кремль"],
                    "future_introduce": ["введут", "установят"]
                }
            }
        }
    }
}

Writing checklist.json


In [21]:
import random

from checklist.editor import Editor

def fl_capitalize(s):
    return s[0].upper() + s[1:]

def data_to_diff(data, global_reason, local_reason="", label=None):
    result = []
    for example in data:
        result.append({
            "left_title": fl_capitalize(example["left_title"]),
            "right_title": fl_capitalize(example["right_title"]),
            "local_reason": example.get("local_reason", local_reason),
            "global_reason": global_reason,
            "label": label
        })
    return result


def gen_diff_by_config(editor, global_reason, local_reason, config):
    left_template = config.pop("left_template")
    right_template = config.pop("right_template")
    custom_params = config.get("custom_params", dict())
    label = config.pop("label")
    examples = editor.template(
        {"left_title": left_template, "right_title": right_template},
        remove_duplicates=True,
        **custom_params
    )
    return data_to_diff(examples.data, global_reason, local_reason, label)


def gen_numbers_diff(editor):
    # Generate bad_event_genitive
    # mask_template = "На месте {mask} в Москве погибло 5 человек"
    # print(editor.suggest(mask_template)[:20])
    template = "На месте {local_bad_event_gent} {location_city} погибло {log_number} человек"
    examples = editor.template(
        {
            "title": template,
            "action": "{local_bad_event_gent}",
            "location": "{location_city}",
            "number": "{log_number}"
        },
        remove_duplicates=True
    )
    records = []
    for sentence1 in examples.data:
        for sentence2 in examples.data:
            number1 = sentence1["number"]
            number2 = sentence2["number"]
            is_same_location = sentence1["location"] == sentence2["location"]
            is_same_action = sentence1["action"] == sentence2["action"]
            if is_same_location and is_same_action and number1 < number2:
                rnd = random.random() < 0.5
                records.append({
                    "left_title": sentence1["title"] if rnd else sentence2["title"],
                    "right_title": sentence2["title"] if rnd else sentence1["title"],
                    "label": 1 if rnd else 2,
                    "global_reason": "numbers",
                    "local_reason": "local_bad_event_gent"
                })
    return records

def generate(config_path):
    editor = Editor(language="russian", model_name="xlm-roberta-large")
    with open(config_path, "r") as r:
        config = json.load(r)

    lexicons = config.pop("lexicons")
    for key, words in lexicons.items():
        editor.add_lexicon(key, words)
        editor.add_lexicon(key + "_capitalize", [s[0].upper() + s[1:] for s in words])

    diffs_config = config.pop("diffs", dict())
    diffs = []
    for global_reason, local_reasons in diffs_config.items():
        for local_reason, diff_config in local_reasons.items():
            diffs.extend(gen_diff_by_config(editor, global_reason, local_reason, diff_config))
    diffs.extend(gen_numbers_diff(editor))

    header = ["left_title", "right_title", "label", "global_reason", "local_reason"]
    records = [{key: r[key] for key in header} for r in diffs]
    return records

In [22]:
from collections import defaultdict

checklist_records = generate("checklist.json")
checklist_records_by_class = defaultdict(list)
for r in checklist_records:
    checklist_records_by_class[r["label"]].append(r)
final_checklist_records = []
for label, class_records in checklist_records_by_class.items():
    random.shuffle(class_records)
    final_checklist_records.extend(class_records[:500])
print(final_checklist_records[0])

{'left_title': 'Мясников сообщил о подорожании продуктов в Чикаго', 'right_title': 'Трамп опроверг информацию о подорожании продуктов в Чикаго', 'label': 1, 'global_reason': 'refutation', 'local_reason': 'left_right'}


In [23]:
import numpy as np

y_true = [item["label"] for item in final_checklist_records]
y_true = np.array(y_true, dtype=np.int32)
print(y_true)

[1 1 1 ... 0 0 0]


In [24]:
y_pred = []
checklist_data = NewsPairsDataset(final_checklist_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
with torch.no_grad():
    for item in checklist_data:
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = model(**item, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        y_pred.append(pred)
y_pred = np.array(y_pred)
print(y_pred)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

[1 1 1 ... 1 1 1]


In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.30      0.15      0.20       500
           1       0.69      0.74      0.72       500
           2       0.63      0.89      0.74       500

    accuracy                           0.60      1500
   macro avg       0.54      0.60      0.55      1500
weighted avg       0.54      0.60      0.55      1500



In [26]:
for i, r in enumerate(final_checklist_records):
    mapping = {
        0: "not_cause",
        1: "left_right",
        2: "right_left"
    }
    if y_true[i] != y_pred[i]:
        print("LEFT:", r["left_title"])
        print("RIGHT:", r["right_title"])
        print("LABELS: true:{}, pred:{}".format(mapping[y_true[i]], mapping[y_pred[i]]))
        print()

LEFT: Меладзе рассказал о аварии в Санкт-Петербурге
RIGHT: Алибасов отрицает информацию о аварии в Санкт-Петербурге
LABELS: true:left_right, pred:not_cause

LEFT: Меладзе рассказал о пожаре в Санкт-Петербурге
RIGHT: Байден отрицает новость о пожаре в Санкт-Петербурге
LABELS: true:left_right, pred:not_cause

LEFT: Медведев рассказал о аварии в Чикаго
RIGHT: Байден опроверг информацию о аварии в Чикаго
LABELS: true:left_right, pred:not_cause

LEFT: Порошенко рассказал о аварии в Париже
RIGHT: Байден опроверг новость о аварии в Париже
LABELS: true:left_right, pred:not_cause

LEFT: Макрон рассказал о взрыве в Париже
RIGHT: Алибасов отрицает новость о взрыве в Париже
LABELS: true:left_right, pred:not_cause

LEFT: Байден заявил о взрыве в Косово
RIGHT: Медведев отрицает информацию о взрыве в Косово
LABELS: true:left_right, pred:not_cause

LEFT: Песков рассказал о взрыве в Дзержинске
RIGHT: Алибасов отрицает информацию о взрыве в Дзержинске
LABELS: true:left_right, pred:not_cause

LEFT: Песко

# Saving

In [None]:
import shutil
import os

OUT_DIR = "ru_bert_cause"
if os.path.isdir(OUT_DIR):
    shutil.rmtree(OUT_DIR) 
model.save_pretrained(OUT_DIR)
train_data.tokenizer.save_pretrained(OUT_DIR)

Configuration saved in ru_bert_cause/config.json
Model weights saved in ru_bert_cause/pytorch_model.bin
tokenizer config file saved in ru_bert_cause/tokenizer_config.json
Special tokens file saved in ru_bert_cause/special_tokens_map.json


('ru_bert_cause/tokenizer_config.json',
 'ru_bert_cause/special_tokens_map.json',
 'ru_bert_cause/vocab.txt',
 'ru_bert_cause/added_tokens.json',
 'ru_bert_cause/tokenizer.json')

In [None]:
!cd ru_bert_cause && tar -czvf ru_bert_cause.tar.gz .

./
./special_tokens_map.json
./config.json
./tokenizer_config.json
./ru_bert_cause.tar.gz
./pytorch_model.bin
./tokenizer.json
./vocab.txt


# Telegram pairs inference

In [None]:
!wget https://www.dropbox.com/s/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
!tar -xzvf tg_pairs.jsonl.tar.gz

--2021-07-31 14:38:07--  https://www.dropbox.com/s/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.18, 2620:100:6032:18::a27d:5212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz [following]
--2021-07-31 14:38:08--  https://www.dropbox.com/s/raw/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca9926be17b5b6caeab18871b26.dl.dropboxusercontent.com/cd/0/inline/BTVRc4UBpZTDpuk-v9WeTTsb-qnmwz53q6UUug3BGcsBRfgebqo6qWT6tSnhg2_06jCnTM1Mfi_nggaEHS8gfDGBBn3ajasQ5uVQ3aHhZys1TTwR1fmjdIFzX4SCYcJT7EIWzrWQXGodiph5nGaOaG_N/file# [following]
--2021-07-31 14:38:08--  https://uca9926be17b5b6caeab18871b26.dl.dropboxusercontent.com/cd/0/inline/BTVRc4UBpZTDpuk-v9WeTTsb-qnmwz53q6UUug3BGcsBRfgebqo6qWT6tSnh

In [None]:
!head -n 1 tg_pairs.jsonl

{"from_language": "en", "to_language": "en", "from_timestamp": 1587934800, "to_timestamp": 1587934800, "from_title": "Government Calls for Return of Premier League as Soon as Possible to Boost National Spirit", "to_title": "Premier League 'Project Restart': When Could the 19/20 Season Restart & Finish?", "from_url": "https://www.90min.com/posts/government-calls-for-return-of-premier-league-as-soon-as-possible-to-boost-national-spirit-01e6yc3w8ptr", "to_url": "https://www.90min.com/posts/premier-league-project-restart-when-could-the-19-20-season-restart-finish-01e6xwh1gp8v", "distance": 0.23587880211712153, "id": 2}


In [33]:
tg_records = []
with open("tg_pairs.jsonl", "r") as r:
    for line in r:
        r = json.loads(line)
        if not (r["from_language"] == r["to_language"] == "ru"):
            continue
        r["left_title"] = r.pop("from_title")
        r["right_title"] = r.pop("to_title")
        r["left_url"] = r.pop("from_url")
        r["right_url"] = r.pop("to_url")
        tg_records.append(r)

In [35]:
from tqdm.notebook import tqdm
from collections import Counter

tg_pairs_data = NewsPairsDataset(tg_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)

tg_labels = []
with torch.no_grad(): 
    for item in tqdm(tg_pairs_data):
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = model(**item, return_dict=True)
        logits = outputs.logits.squeeze(0)
        label = torch.argmax(logits).item()
        prob = torch.sigmoid(logits[label]).item()
        tg_labels.append((label, prob))

labels_cntr = Counter()
for (label, prob), r in zip(tg_labels, tg_records):
    r["bert_label"] = label
    labels_cntr[label] += 1
    r["bert_confidence"] = prob

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

HBox(children=(FloatProgress(value=0.0, max=55535.0), HTML(value='')))




In [36]:
print(labels_cntr.most_common())

[(0, 51149), (2, 3505), (1, 881)]


In [37]:
with open("ru_tg_pairs_with_bert.jsonl", "w") as w:
    for r in tg_records:
        w.write(json.dumps(r, ensure_ascii=False).strip() + "\n")

In [38]:
!rm -rf ru_tg_pairs_with_bert.jsonl.tar.gz
!tar -czvf ru_tg_pairs_with_bert.jsonl.tar.gz ru_tg_pairs_with_bert.jsonl

ru_tg_pairs_with_bert.jsonl


# NLI

In [None]:
!wget https://russiansuperglue.com/tasks/download/TERRa -O TERRa.zip

--2021-07-30 17:33:39--  https://russiansuperglue.com/tasks/download/TERRa
Resolving russiansuperglue.com (russiansuperglue.com)... 37.18.107.48
Connecting to russiansuperglue.com (russiansuperglue.com)|37.18.107.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 907346 (886K) [application/zip]
Saving to: ‘TERRa.zip’


2021-07-30 17:33:42 (608 KB/s) - ‘TERRa.zip’ saved [907346/907346]



In [None]:
!unzip TERRa.zip

Archive:  TERRa.zip
   creating: TERRa/
  inflating: TERRa/train.jsonl       
   creating: __MACOSX/
   creating: __MACOSX/TERRa/
  inflating: __MACOSX/TERRa/._train.jsonl  
  inflating: TERRa/.DS_Store         
  inflating: __MACOSX/TERRa/._.DS_Store  
  inflating: TERRa/test.jsonl        
  inflating: __MACOSX/TERRa/._test.jsonl  
  inflating: TERRa/val.jsonl         
  inflating: __MACOSX/TERRa/._val.jsonl  
  inflating: __MACOSX/._TERRa        


In [None]:
terra_train_records = []
terra_train_y_true = []
with open("TERRa/train.jsonl") as train:
    for line in train:
        r = json.loads(line)
        r["left_title"] = r.pop("premise")
        r["right_title"] = r.pop("hypothesis")
        r["label"] = int(r.pop("label") == "entailment")
        terra_train_y_true.append(r["label"])
        terra_train_records.append(r)

terra_val_records = []
terra_val_y_true = []
with open("TERRa/val.jsonl") as val:
    for line in val:
        r = json.loads(line)
        r["left_title"] = r.pop("premise")
        r["right_title"] = r.pop("hypothesis")
        r["label"] = int(r.pop("label") == "entailment")
        terra_val_y_true.append(r["label"])
        terra_val_records.append(r)

In [None]:
terra_train_data = NewsPairsDataset(terra_train_records, MAX_TOKENS, TOKENIZER_NAME, MODE)
terra_train_y_pred = []
with torch.no_grad():
    for item in terra_train_data:
        input_ids = item["input_ids"].unsqueeze(0).cuda()
        mask = item["attention_mask"].unsqueeze(0).cuda()
        outputs = model(input_ids, mask, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        terra_train_y_pred.append(pred == 1)
terra_train_y_pred = np.array(terra_train_y_pred)
print(terra_train_y_pred)

[0.04276323 0.06221786 0.03539146 ... 0.2690801  0.07501597 0.01802206]


In [None]:
terra_val_data = NewsPairsDataset(terra_val_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
terra_val_y_pred = []
with torch.no_grad():
    for item in terra_val_data:
        input_ids = item["input_ids"].unsqueeze(0).cuda()
        mask = item["attention_mask"].unsqueeze(0).cuda()
        outputs = model(input_ids, mask, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        terra_val_y_pred.append(int(pred > 0))
terra_val_y_pred = np.array(terra_val_y_pred)
print(terra_val_y_pred)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

[0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
 0 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 1 0 0 1 0]


In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(terra_val_y_true, terra_val_y_pred))

0.5211726384364821
