# Requirements

In [None]:
# Install/upgrade the Hugging Face Transformers, BertViz and CheckList packages used below.
# The recorded install log shows this run resolving transformers 4.9.1, bertviz 1.2.0 and checklist 0.0.11.
!pip install --upgrade transformers bertviz checklist

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.9 MB/s 
[?25hCollecting bertviz
  Downloading bertviz-1.2.0-py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 69.6 MB/s 
[?25hCollecting checklist
  Downloading checklist-0.0.11.tar.gz (12.1 MB)
[K     |████████████████████████████████| 12.1 MB 33.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 56.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 68.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |██████

# Data loading

In [None]:
# Russian pairs: remove any existing files with this prefix, then download and unpack the archive.
!rm -rf ru_news_cause_v2.tsv*
!wget https://www.dropbox.com/s/m1kb0dn6q5mcb6v/ru_news_cause_v2.tsv.tar.gz
!tar -xzvf ru_news_cause_v2.tsv.tar.gz

# English pairs: same steps.
!rm -rf en_news_cause_v0.tsv*
!wget https://www.dropbox.com/s/7w4zf8ogqonwb8i/en_news_cause_v0.tsv.tar.gz
!tar -xzvf en_news_cause_v0.tsv.tar.gz

--2021-08-07 16:06:34--  https://www.dropbox.com/s/m1kb0dn6q5mcb6v/ru_news_cause_v2.tsv.tar.gz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/m1kb0dn6q5mcb6v/ru_news_cause_v2.tsv.tar.gz [following]
--2021-08-07 16:06:34--  https://www.dropbox.com/s/raw/m1kb0dn6q5mcb6v/ru_news_cause_v2.tsv.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucb4c590fe4d9a68acc0d3534cc6.dl.dropboxusercontent.com/cd/0/inline/BTywhfHb3gYGROEsjBPcw9PkovW5sDjPaq-yrM1okWIrUdF-T-Yz4QBMGk7_N5tMuE1epr-19XiN01KkgVB7jFOOjgXYoBDbX9FX_o3h4NJBjR-hQ1j7LUDSgysGvPxvZphYAaqS46HPhdwRhyT0YxgX/file# [following]
--2021-08-07 16:06:35--  https://ucb4c590fe4d9a68acc0d3534cc6.dl.dropboxusercontent.com/cd/0/inline/BTywhfHb3gYGROEsjBPcw9PkovW5sDjPaq-yrM1okWIrU

In [None]:
# Line count (the header row is included in the count) and first rows of the Russian TSV.
!cat ru_news_cause_v2.tsv | wc -l
!head ru_news_cause_v2.tsv

# Same for the English TSV.
!cat en_news_cause_v0.tsv | wc -l
!head en_news_cause_v0.tsv

5406
id	left_title	right_title	left_url	right_url	left_timestamp	right_timestamp	confidence	result	overlap	mv_part
tg_9993	Рабочие устроили бунт на базе «Газпрома» для «Силы Сибири»	«Газпром» ответил на сообщения о бунте вахтовиков	https://lenta.ru/news/2020/04/28/bund/	https://lenta.ru/news/2020/04/28/gazprom_protest/	1588032000	1588032000	1.0	left_right_cause	10	1.0
tg_99296	Трехлетняя девочка выжила после падения с восьмого этажа в Электростали	Трехлетняя девочка выпала с восьмого этажа и выжила в Электростали	https://iz.ru/1008987/2020-05-08/trekhletniaia-devochka-vyzhila-posle-padeniia-s-vosmogo-etazha-v-elektrostali	https://mosregtoday.ru/sec/trehletnyaya-devochka-vypala-s-vosmogo-etazha-i-vyzhila-v-elektrostali/	1588928652	1588921081	1.0	same	10	1.0
tg_99290	МВД: В результате стрельбы на улице Павла Андреева никто не пострадал	Неизвестные устроили стрельбу в центре Москвы	https://www.mskagency.ru/materials/3001509	https://iz.ru/1008984/2020-05-08/neizvestnye-ustroili-strelbu-v-t

# BertCause

## Training

In [None]:
import random
import torch
import numpy as np
import os

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports the seed through the ``PL_GLOBAL_SEED`` and ``PYTHONHASHSEED``
    environment variables, sets ``CUBLAS_WORKSPACE_CONFIG`` and puts cuDNN
    into deterministic, non-benchmarking mode.

    Args:
        seed: Seed value handed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    os.environ.update({
        "CUBLAS_WORKSPACE_CONFIG": ":4096:2",
        "PL_GLOBAL_SEED": str(seed),
        "PYTHONHASHSEED": str(seed),
    })

    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(1337)

In [None]:
import csv

def read_records(file_name):
    """Read a tab-separated pair file and attach a 3-way class id to each row.

    The first row is used as the header. Each following row becomes a dict keyed
    by the header names (all values stay strings), with an extra integer
    ``"label"`` derived from the ``"result"`` column:

    * 0 -- ``bad``, ``rel``, ``same``
    * 1 -- ``left_right_cause``, ``left_right_cancel``
    * 2 -- ``right_left_cause``, ``right_left_cancel``

    Rows whose ``result`` is none of the above are dropped, as are completely
    blank rows.

    Args:
        file_name: Path to the TSV file.

    Returns:
        List of record dicts in file order; empty if the file has no header row.

    Raises:
        KeyError: If a non-blank row has no ``result`` column.
    """
    # Built once per call instead of once per row.
    mapping = {
        "bad": 0,
        "rel": 0,
        "same": 0,
        "left_right_cause": 1,
        "left_right_cancel": 1,
        "right_left_cause": 2,
        "right_left_cancel": 2
    }
    records = []
    # Explicit UTF-8 (the titles are Cyrillic) and newline="" as the csv module recommends.
    with open(file_name, "r", encoding="utf-8", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")
        header = next(reader, None)
        if header is None:
            return records
        for row in reader:
            if not row:
                continue  # csv yields [] for blank lines
            # Separate name for the row dict so the open file handle is never rebound.
            record = dict(zip(header, row))
            result = record["result"]
            if result not in mapping:
                continue
            record["label"] = mapping[result]
            records.append(record)
    return records

ru_records = read_records("ru_news_cause_v2.tsv")
en_records = read_records("en_news_cause_v0.tsv")

In [None]:
from collections import Counter

# Per-language class distribution of the loaded records.
ru_labels_counter = Counter(record["label"] for record in ru_records)
en_labels_counter = Counter(record["label"] for record in en_records)
print(ru_labels_counter)
print(en_labels_counter)

# Number of distinct labels present across both languages; used as the classifier's output size.
combined_labels_counter = ru_labels_counter + en_labels_counter
labels_count = len(combined_labels_counter)

Counter({0: 3605, 2: 925, 1: 875})
Counter({0: 457, 1: 190, 2: 166})


In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NewsPairsDataset(Dataset):
    """Map-style dataset that tokenizes (left_title, right_title) pairs on access.

    Every item is the tokenizer output for one record, padded/truncated to
    ``max_tokens``, plus a ``labels`` tensor when the record carries a label.
    """

    def __init__(self, records, tokenizer, max_tokens, labels_count):
        self.records = records
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        # Stored only; no method of this class reads it.
        self.labels_count = labels_count

    def __len__(self):
        return len(self.records)

    def embed_record(self, record):
        """Tokenize the record's two titles as one sequence pair of fixed length."""
        encoding = self.tokenizer(
            text=record["left_title"],
            text_pair=record["right_title"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        # In-place squeeze: removes dimension 0 of every returned tensor when it has size 1.
        for tensor in encoding.values():
            tensor.squeeze_(0)
        return encoding

    def __getitem__(self, index):
        record = self.records[index]
        item = self.embed_record(record)
        target = record.get("label", None)
        if target is None:
            # Unlabelled record (inference-only data): return features without "labels".
            return item
        item["labels"] = torch.tensor(target)
        return item

In [None]:
from collections import defaultdict

def split_with_source(records, val_border=0.8, test_border=0.9):
    """Split records into train/val/test chronologically, separately per source.

    The source is the part of ``record["id"]`` before the first underscore.
    Within each source, records are sorted by the smaller of their two
    timestamps; the first ``val_border`` fraction goes to train, the slice up to
    ``test_border`` to validation, and the rest to test.

    Args:
        records: Iterable of dicts with ``id``, ``left_timestamp`` and
            ``right_timestamp`` keys.
        val_border: Fraction of each source at which validation starts.
        test_border: Fraction of each source at which test starts.

    Returns:
        Tuple ``(train_records, val_records, test_records)`` of lists.

    Raises:
        ValueError: If the borders are not ordered within ``[0, 1]``.
    """
    if not 0.0 <= val_border <= test_border <= 1.0:
        raise ValueError(
            "Expected 0 <= val_border <= test_border <= 1, got {} and {}".format(val_border, test_border)
        )

    records_by_source = defaultdict(list)
    for record in records:
        source = record["id"].split("_")[0]
        records_by_source[source].append(record)

    train_records, val_records, test_records = [], [], []
    for source_records in records_by_source.values():
        # NOTE(review): timestamps are compared exactly as stored; read_records keeps them
        # as strings, so this ordering is lexicographic -- confirm all values have equal width.
        source_records.sort(key=lambda x: min(x["left_timestamp"], x["right_timestamp"]))
        # Use the caller's fractions; separate index names keep them from being overwritten per source.
        val_index = int(val_border * len(source_records))
        test_index = int(test_border * len(source_records))
        train_records.extend(source_records[:val_index])
        val_records.extend(source_records[val_index:test_index])
        test_records.extend(source_records[test_index:])
    return train_records, val_records, test_records

# Split each language on its own and report the train / val / test sizes.
ru_train_records, ru_val_records, ru_test_records = split_with_source(ru_records)
print("RU:")
for ru_part in (ru_train_records, ru_val_records, ru_test_records):
    print(len(ru_part))
print()

en_train_records, en_val_records, en_test_records = split_with_source(en_records)
print("EN:")
for en_part in (en_train_records, en_val_records, en_test_records):
    print(len(en_part))

RU:
4323
540
542

EN:
650
81
82


In [None]:
# Alternative checkpoint, currently disabled:
# MODEL_NAME = "DeepPavlov/rubert-base-cased"
MODEL_NAME = "xlm-roberta-large"
# The tokenizer is loaded from the same checkpoint as the model.
TOKENIZER_NAME = MODEL_NAME
# Fixed sequence length: NewsPairsDataset pads/truncates every title pair to this many tokens.
MAX_TOKENS = 80

In [None]:
import random

# Train and validate on the Russian and English splits combined.
train_records = ru_train_records + en_train_records
val_records = ru_val_records + en_val_records
# In-place shuffle driven by Python's global `random` state.
random.shuffle(train_records)

# One tokenizer instance is shared by the train and validation datasets.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, do_lower_case=False)
train_data = NewsPairsDataset(train_records, tokenizer, MAX_TOKENS, labels_count)
val_data = NewsPairsDataset(val_records, tokenizer, MAX_TOKENS, labels_count)

In [None]:
# Sanity check: print only the first tokenized training item, then stop.
for item in train_data:
    print(item)
    break

{'input_ids': tensor([     0, 181599,   3737, 122387,  28832, 129334, 174056,  92890,  11981,
        211247,  82626,  20017,    419,     49,   8568,      2,      2, 119123,
         44308,  93958,     59,  45472,  63062,    476, 122387,  53871, 129334,
         60637,     89,  30462,   1993, 211247,  82626,  20017,     59,      2,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head that has one output per distinct label.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=labels_count)
# The model is moved to the GPU unconditionally; the cells below also assume CUDA.
model = model.to("cuda")

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

In [None]:
!rm -rf checkpoints

In [None]:
#@title Training params
EPOCHS = 6#@param {type:"number"}
EVAL_STEPS = 16#@param {type:"number"}
WARMUP_STEPS = 8#@param {type:"number"}
LR = 0.00003#@param {type:"number"}
BATCH_SIZE = 32#@param {type:"number"}
GRAD_ACCUM_STEPS = 4#@param {type:"number"}

In [None]:
from transformers import Trainer, TrainingArguments

# Evaluate, log and checkpoint on the same EVAL_STEPS cadence, writing checkpoints to ./checkpoints.
# Gradients are accumulated over GRAD_ACCUM_STEPS batches of BATCH_SIZE
# (the training log below reports a total train batch size of 128).
training_args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
    # No metric_for_best_model is given, so the Trainer's default criterion picks the "best" checkpoint.
    load_best_model_at_end=True
)

# No compute_metrics is passed, so only the loss is reported during evaluation (see the table below).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()

***** Running training *****
  Num examples = 4973
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 234


Step,Training Loss,Validation Loss
16,0.9647,0.922024
32,0.8412,0.808545
48,0.6784,0.707206
64,0.6186,0.684942
80,0.5941,0.640631
96,0.5139,0.684243
112,0.4619,0.435086
128,0.2967,0.370078
144,0.2512,0.352107
160,0.2309,0.297958


***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-16
Configuration saved in checkpoints/checkpoint-16/config.json
Model weights saved in checkpoints/checkpoint-16/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-32
Configuration saved in checkpoints/checkpoint-32/config.json
Model weights saved in checkpoints/checkpoint-32/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-48
Configuration saved in checkpoints/checkpoint-48/config.json
Model weights saved in checkpoints/checkpoint-48/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to checkpoints/checkpoint-64
Configuration saved in checkpoints/checkpoint-64/config.json
Model weights saved in checkpoints/checkpoint-64/pytorch_model.

TrainOutput(global_step=234, training_loss=0.412297681101367, metrics={'train_runtime': 1731.7585, 'train_samples_per_second': 17.23, 'train_steps_per_second': 0.135, 'total_flos': 4344853433261760.0, 'train_loss': 0.412297681101367, 'epoch': 6.0})

## Inference

In [None]:
from transformers import pipeline

# Switch the fine-tuned model to evaluation mode and wrap it in a text-classification pipeline.
# return_all_scores=True makes the pipeline return a score for every label, which pipe_predict relies on;
# device=0 places the pipeline on GPU 0.
model.eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

In [None]:
import numpy as np

ru_y_true = np.array([r["label"] for r in ru_test_records], dtype=np.int32)
en_y_true = np.array([r["label"] for r in en_test_records], dtype=np.int32)
print(ru_y_true)
print(en_y_true)

[2 1 0 1 0 0 0 0 2 1 2 1 0 1 1 1 2 1 0 0 0 0 0 1 0 0 2 1 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 2 0 0 1 0 0 0 0 0 0 2 1 0 1 1 0 2 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 1 2 0 0 0 1 0 1 0 0 0 2 0 2 0 0 0 1 0 0 0 0 2 0 0 0 2 0 1 1 0 0 0 0 0 0
 2 0 1 1 0 1 1 2 0 0 1 0 0 0 0 1 1 0 0 0 2 0 1 0 2 0 0 0 2 0 2 2 1 0 1 0 0
 0 0 2 1 0 0 0 2 0 0 0 0 1 2 1 2 1 2 0 0 0 1 0 1 1 0 1 0 2 0 2 2 1 0 2 2 0
 0 0 0 1 1 2 0 0 0 2 0 1 1 0 1 2 2 0 1 0 2 0 1 2 1 1 1 2 0 2 2 0 0 0 0 0 0
 0 0 2 1 0 1 0 0 0 0 2 0 0 0 1 0 2 0 0 0 1 0 1 1 0 0 0 0 0 2 1 0 0 2 1 0 0
 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 2 2 2 1 1 2 0 0 0 0 0 1 0 0 1 1
 0 1 0 1 0 0 1 1 0 0 0 0 0 0 2 0 0 2 0 0 0 0 1 0 1 2 1 0 0 1 0 2 0 0 1 2 1
 1 0 0 0 1 1 0 2 2 1 2 1 1 1 0 1 1 0 0 0 1 2 1 2 0 1 2 0 2 2 0 2 0 0 2 0 2
 0 0 0 1 2 2 0 0 0 0 2 2 1 0 0 1 1 2 0 1 0 0 0 0 0 1 0 0 0 2 0 2 0 1 2 2 1
 2 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 2 1 0 2 0 1 2 2 0 0 2 0 0
 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 2 1 1 2 0 0 1 2 0 2 2 0 1 0 0 0 1 1 0 2
 1 1 1 1 2 1 1 2 1 1 1 1 

In [None]:
def pipe_predict(data, batch_size=64):
    """Classify inputs with the global ``pipe`` and return class ids plus all scores.

    Args:
        data: Inputs forwarded unchanged to ``pipe`` (this notebook passes lists
            of ``(left_title, right_title)`` tuples).
        batch_size: Batch size forwarded to ``pipe``.

    Returns:
        Tuple ``(preds, pp)``. ``preds`` is an array with, per input, the class id
        of the highest-scoring label; ``pp`` is an array with the scores of every
        label per input, in the order ``pipe`` returned them.

    Raises:
        ValueError: If a label name does not end in digits.
    """
    import re  # local import: only used to parse label names

    def label_id(label_name):
        # The class id is the run of digits at the end of the name ("LABEL_2" -> 2).
        # Reading only the last character would turn "LABEL_12" into 2.
        match = re.search(r"\d+$", label_name)
        if match is None:
            raise ValueError("Cannot extract a class id from label {!r}".format(label_name))
        return int(match.group())

    raw_preds = pipe(data, batch_size=batch_size)
    preds = np.array([label_id(max(labels, key=lambda x: x["score"])["label"]) for labels in raw_preds])
    pp = np.array([[l["score"] for l in labels] for labels in raw_preds])
    return preds, pp

# Predict on the held-out test pairs of each language; only the class ids (first return value) are kept.
ru_test_pairs = [(r["left_title"], r["right_title"]) for r in ru_test_records]
en_test_pairs = [(r["left_title"], r["right_title"]) for r in en_test_records]
ru_y_pred = pipe_predict(ru_test_pairs)[0]
en_y_pred = pipe_predict(en_test_pairs)[0]
# Store each prediction on its record; the CheckList "Logic" tests later filter on r["prediction"].
for p, r in zip(ru_y_pred, ru_test_records):
    r["prediction"] = p
for p, r in zip(en_y_pred, en_test_records):
    r["prediction"] = p
print(ru_y_pred)
print(en_y_pred)

[2 1 0 1 0 0 0 0 2 1 0 0 0 2 1 1 2 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 2 1 0 1 1 0 2 0 0 0 0 0 0 0 0 1 1 0 0 1 0
 1 1 2 0 0 0 1 0 1 0 2 0 0 1 2 0 0 0 1 2 0 0 0 2 0 0 0 0 0 1 1 0 1 0 1 0 0
 2 0 1 1 0 1 1 2 0 0 1 0 0 0 0 1 1 0 0 0 2 0 1 0 0 0 0 0 0 0 2 2 0 0 1 0 0
 0 0 2 1 0 0 0 2 0 1 0 0 1 0 1 2 1 2 0 0 0 0 0 1 1 0 1 0 2 0 2 0 1 0 2 2 0
 0 0 0 1 2 2 0 0 0 0 0 1 1 0 1 2 2 0 1 0 0 0 1 2 1 1 1 0 1 0 2 0 0 0 0 0 0
 0 0 2 1 0 1 0 0 2 0 2 0 0 0 1 0 2 0 0 0 1 0 1 1 0 0 0 0 0 2 1 0 0 2 1 0 0
 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 2 1 1 2 0 0 0 0 2 1 0 0 1 1
 0 1 0 0 0 0 1 1 0 0 0 0 0 0 2 0 0 2 0 2 0 0 1 0 1 2 1 0 0 0 0 2 0 0 2 0 1
 1 0 0 0 1 0 0 2 2 1 2 1 1 0 0 1 1 0 0 0 0 2 0 2 0 0 2 0 2 2 0 2 0 0 2 0 2
 0 2 0 1 2 2 0 0 0 0 2 2 0 1 0 0 0 2 0 0 0 0 0 0 0 1 0 0 0 2 0 2 0 1 2 2 1
 2 0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 2 1 0 2 0 1 2 2 0 0 2 0 0
 0 2 1 0 2 0 0 2 0 0 0 0 0 0 0 0 1 2 1 1 0 2 2 1 2 0 2 2 0 1 0 0 0 1 0 0 2
 1 1 1 1 2 1 1 2 1 1 1 0 

In [None]:
from sklearn.metrics import classification_report
print(classification_report(ru_y_true, ru_y_pred))
print(classification_report(en_y_true, en_y_pred))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       291
           1       0.91      0.85      0.88       148
           2       0.85      0.81      0.83       103

    accuracy                           0.88       542
   macro avg       0.88      0.86      0.87       542
weighted avg       0.88      0.88      0.88       542

              precision    recall  f1-score   support

           0       0.82      0.91      0.86        44
           1       0.70      0.70      0.70        20
           2       0.92      0.67      0.77        18

    accuracy                           0.80        82
   macro avg       0.81      0.76      0.78        82
weighted avg       0.81      0.80      0.80        82



## Interpretation

### Errors

In [None]:
# Human-readable names for the three class ids.
mapping = {
    0: "not_cause",
    1: "left_right",
    2: "right_left"
}
# List every Russian test pair whose predicted class differs from the gold class.
for r, true_label, pred_label in zip(ru_test_records, ru_y_true, ru_y_pred):
    if true_label == pred_label:
        continue
    print("LEFT:", r["left_title"])
    print("RIGHT:", r["right_title"])
    print("LABELS: true:{}, pred:{}".format(mapping[true_label], mapping[pred_label]))
    print()

LEFT: В Бурятии не подтвердился случай повторного заболевания коронавирусом
RIGHT: Жительница Бурятии, возможно, повторно заболела коронавирусом
LABELS: true:right_left, pred:not_cause

LEFT: Жительница Бурятии, возможно, повторно заболела коронавирусом
RIGHT: В Бурятии не подтвердился случай повторного заболевания коронавирусом
LABELS: true:left_right, pred:not_cause

LEFT: Уфимцы могут помочь с поисками пропавшего мужчины
RIGHT: В Уфе нашли живым 35-летнего Станислава Суркова
LABELS: true:left_right, pred:right_left

LEFT: Портников: Зеленский должен объяснить, почему снял санкции с внучки Муссолини, поддерживающей оккупацию Донбасса
RIGHT: Зеленский снял санкции с наблюдателей на «выборах ЛДНР». Среди них внучка диктатора Муссолини
LABELS: true:right_left, pred:not_cause

LEFT: Главврач больницы в Гудермесе сменился после протеста врачей
RIGHT: Даудов оправдал протест сотрудников Гудермесской больницы ошибками главврача
LABELS: true:not_cause, pred:left_right

LEFT: Украинские поляр

### BertViz

In [35]:
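# Disabled on purpose: BertViz head view of the attention for the first Russian test pair with label 2.
# Uncomment the whole block below to render it.
# from bertviz import head_view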
# with torch.no_grad():
#     for r in ru_test_records:
#         if r["label"] == 2:
#             inputs = tokenizer.encode_plus(r["left_title"], r["right_title"], return_tensors='pt', add_special_tokens=True)
#             input_ids = inputs["input_ids"].cuda()
#             outputs = model(input_ids, return_dict=True, output_attentions=True)
#             attention = outputs.attentions
#             tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
#             head_view(attention, tokens)
#             break

### Checklist

In [None]:
%%writefile ru_lexicons.json
{
    "lexicons": {
        "famous_male_last_name": ["Путин", "Песков", "Меладзе", "Мясников", "Макрон", "Порошенко", "Зеленский", "Медведев", "Алибасов", "Трамп", "Байден"],
        "location_city": ["в Москве", "в Самаре", "в Париже", "в Дзержинске", "во Владимире", "в Стамбуле", "в Санкт-Петербурге", "в Сочи", "в Чикаго", "в Косово", "в Токио"],
        "location_country": ["в России", "во Франции", "в США", "в Казахстане", "в Японии", "в Германии", "в Китае", "в Украине", "в Великобритании", "в Испании", "в РФ"],
        "past_male_tell_verb": ["сообщил", "рассказал", "заявил"],
        "future_male_tell_verb": ["сообщит", "расскажет", "заявит"],
        "present_male_refute_verb": ["опроверг", "отрицает"],
        "local_bad_event_gent": ["пожара", "взрыва", "ДТП", "аварии", "задержания террористов"],
        "local_bad_event_loct": ["пожаре", "взрыве", "ДТП", "аварии", "задержании террористов"],
        "global_bad_event_loct": ["вводе военного положения", "подорожании продуктов"],
        "bad_event_loct": ["пожаре", "взрыве", "ДТП", "аварии", "задержании террористов", "вводе военного положения", "подорожании продуктов"],
        "bad_reason": ["из-за коронавируса", "из-за гриппа", "из-за погоды", "из-за проблем", "из-за войны", "из-за жары", "из-за болезни", "из-за Путина", "из-за Китая"],
        "regulation": ["карантин", "комендантский час", "запрет"],
        "regulation_loct": ["карантине", "комендантском часе", "запрете"],
        "date_future_duration": ["до 2023 года", "до 1 марта 2026 года", "до 31 декабря"],
        "date_future_year": ["в 2021 году", "в следующем году"],
        "date_future": ["до 2023 года", "до 1 марта 2026 года", "до 31 декабря", "в 2021 году", "в следующем году"]
    }
}

Writing ru_lexicons.json


In [None]:
from checklist.editor import Editor
from checklist.test_types import MFT, INV, DIR
from checklist.test_suite import TestSuite
from checklist.perturb import Perturb
from checklist.expect import Expect

def pair_capitalize(template):
    """Upper-case the first character of both strings in every generated pair.

    Args:
        template: Object whose ``data`` attribute is an iterable of
            ``(left, right)`` string pairs.

    Returns:
        The same ``template`` object, with ``data`` replaced by a list of
        capitalized pairs. The rest of each string is left untouched.
    """
    def capitalize_first(text):
        # Slicing instead of indexing so an empty string stays empty rather than raising IndexError.
        return text[:1].upper() + text[1:]

    template.data = [
        (capitalize_first(left), capitalize_first(right))
        for left, right in template.data
    ]
    return template

editor = Editor(language="russian", model_name="xlm-roberta-large")
# The lexicon file contains Cyrillic text, so read it as UTF-8 explicitly instead of the platform default.
with open("ru_lexicons.json", "r", encoding="utf-8") as lexicons_file:
    lexicons = json.load(lexicons_file)["lexicons"]
for key, words in lexicons.items():
    editor.add_lexicon(key, words)
    # Also register a "<key>_capitalize" variant with the first character upper-cased
    # (slicing keeps an empty entry from raising IndexError).
    editor.add_lexicon(key + "_capitalize", [s[:1].upper() + s[1:] for s in words])
suite = TestSuite()

#### Robustness

In [None]:
# MFT: the right title repeats the left title behind a speaker prefix ("эксперт:", "власти:" or "кремль:").
# Every generated pair is expected to be class 0.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "{location_city} {future_introduce} {regulation}",
            "{expert}: {location_city} {future_introduce} {regulation}"
        ),
        expert=("эксперт", "власти", "кремль"),
        future_introduce=("введут", "установят"),
        remove_duplicates=True,
        nsamples=200,
    )), labels=0,
    name="Robustness to 'expert' and 'governement' additions",
    capability="Robustness",
    description="'Expert:' or 'Government:' in the beginning should not change model outputs"
))

#### Temporal understanding

In [None]:
# Class ids used as expected labels below: 0 = not_cause, 1 = left_right, 2 = right_left
# (the same names the error listing prints).

# Left: outages began for some reason; right: a named person refutes the reports. Expected class 1.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "В работе операторов начались сбои {bad_reason}",
            "{famous_male_last_name} опроверг данные о сбоях в работе операторов {bad_reason}"
        ),
        remove_duplicates=True,
        nsamples=200,
    )), labels=1,
    name="Explicit refutations: person names and bad reasons",
    capability="Temporal understanding",
    description="The cause should not be changed by different persons or reasons"
))

# Left: nobody was hurt in the shooting; right: the shooting itself, in the same city. Expected class 2.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "В результате стрельбы {location_city} никто не пострадал",
            "Неизвестные устроили стрельбу {location_city}"
        ),
        remove_duplicates=True,
        nsamples=200,
    )), labels=2,
    name="Implicit refutations: locations",
    capability="Temporal understanding",
    description="The cause should not be changed by different locations"
))

# The same person ({famous_male_last_name1} in both titles) refutes on the left what they reported on the right.
# Expected class 2.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "{famous_male_last_name1} {present_male_refute_verb} {news} о {bad_event_loct} {location_city}",
            "{famous_male_last_name1} {past_male_tell_verb} о {bad_event_loct} {location_city}"
        ),
        news=("информацию", "новость", "сообщение"),
        remove_duplicates=True,
        nsamples=200,
    )), labels=2,
    name="Explicit refutations: same person",
    capability="Temporal understanding",
    description="The same person event refutation"
))

# Left: a regulation was introduced (past tense); right: it stopped being in force, same city. Expected class 1.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "{location_city} {past_introduce} {regulation}",
            "{regulation} {location_city} перестал действовать"
        ),
        past_introduce=("ввели", "установили"),
        remove_duplicates=True,
        nsamples=200,
    )), labels=1,
    name="Explicit refutations: impersonal past verb",
    capability="Temporal understanding",
    description="Impersonal past verb refutation"
))

# Same as the previous test, but the left title uses a future-tense verb. Expected class 1.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "{location_city} {future_introduce} {regulation}",
            "{regulation} {location_city} перестал действовать"
        ),
        future_introduce=("введут", "установят"),
        remove_duplicates=True,
        nsamples=200,
    )), labels=1,
    name="Explicit refutations: impersonal future verb",
    capability="Temporal understanding",
    description="Impersonal future verb refutation"
))

# Negative control: the two titles use separate city placeholders ({location_city1} vs {location_city2}).
# Expected class 0.
suite.add(MFT(
    **pair_capitalize(editor.template(
        (
            "{location_city1} {past_introduce} {regulation}",
            "{regulation} {location_city2} перестал действовать"
        ),
        past_introduce=("ввели", "установили"),
        remove_duplicates=True,
        nsamples=200,
    )), labels=0,
    name="Explicit refutations: different locations",
    capability="Temporal understanding",
    description="Bad refutation: different locations"
))

#### Logic

In [None]:
# Bucket the Russian test pairs by the model's own prediction (not the gold label):
# predicted class 0 versus a predicted directional class (1 or 2).
test_zeros = [(r["left_title"], r["right_title"]) for r in ru_test_records if r["prediction"] == 0]
test_directional = [(r["left_title"], r["right_title"]) for r in ru_test_records if r["prediction"] in (1, 2)]

def change_order(x, *args, **kwargs):
    """Return the two-element pair ``x`` with its elements swapped, as a tuple.

    Extra positional and keyword arguments are accepted and ignored.
    """
    first, second = x
    swapped = (second, first)
    return swapped

# INV: each pair predicted as class 0 is tested together with its swapped version (keep_original=True);
# per the test name, the prediction is expected to stay the same after the swap.
suite.add(INV(
    **Perturb.perturb(test_zeros, change_order, keep_original=True),
    name="Zero invariant to swap order",
    capability="Logic",
    description=""
))


def changed_pred_nonzero(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    """Tell whether the prediction changed while both old and new classes are non-zero.

    Only ``orig_pred`` and ``pred`` are inspected; the remaining arguments are
    accepted but not used.
    """
    if pred == orig_pred:
        return False
    return pred != 0 and orig_pred != 0

# DIR: each pair predicted as class 1 or 2 is tested together with its swapped version.
# changed_pred_nonzero holds only if the swapped pair gets a different, still non-zero class.
suite.add(DIR(
    **Perturb.perturb(test_directional, change_order, keep_original=True),
    name="Directional change invariant",
    capability="Logic",
    description="",
    expect=Expect.pairwise(changed_pred_nonzero)
))

#### Run

In [None]:
suite.run(pipe_predict, overwrite=True)

Running Robustness to 'expert' and 'governement' additions
Predicting 200 examples
Running Explicit refutations: person names and bad reasons
Predicting 200 examples
Running Implicit refutations: locations
Predicting 200 examples
Running Explicit refutations: same person
Predicting 200 examples
Running Explicit refutations: impersonal past verb
Predicting 200 examples
Running Explicit refutations: impersonal future verb
Predicting 200 examples
Running Explicit refutations: different locations
Predicting 180 examples
Running Zero invariant to swap order
Predicting 610 examples
Running Directional change invariant
Predicting 474 examples


In [34]:
suite.summary()

Robustness

Robustness to 'expert' and 'governement' additions
Test cases:      200
Fails (rate):    0 (0.0%)




Logic

Zero invariant to swap order
Test cases:      305
Fails (rate):    18 (5.9%)

Example fails:
1.0 0.0 0.0 ('Дефицит цветов в России опровергли', 'России предсказали подорожание цветов из-за Белоруссии')
0.2 0.7 0.0 ('России предсказали подорожание цветов из-за Белоруссии', 'Дефицит цветов в России опровергли')

----
0.5 0.0 0.5 ('"Полиция убедила разойтись мигрантов, собравшихся около рынка ""Фуд Сити"""', '"Торговцы устроили акцию протеста у московского рынка ""Фуд Сити"" на Калужском шоссе"')
0.3 0.6 0.1 ('"Торговцы устроили акцию протеста у московского рынка ""Фуд Сити"" на Калужском шоссе"', '"Полиция убедила разойтись мигрантов, собравшихся около рынка ""Фуд Сити"""')

----
0.8 0.0 0.2 ('Отказ Ирана от встречи по ядерной сделке разочаровал Белый дом', 'WSJ узнал об отказе Ирана от прямых переговоров с ЕС и США по ядерной сделке')
0.1 0.9 0.0 ('WSJ узнал об отказе

# Saving

In [None]:
import shutil
import os

# Export the fine-tuned model and its tokenizer into one directory.
OUT_DIR = "ru_bert_cause"
# Remove any previous export so the directory only holds files from this run.
if os.path.isdir(OUT_DIR):
    shutil.rmtree(OUT_DIR) 
model.save_pretrained(OUT_DIR)
# train_data.tokenizer is the tokenizer object the training dataset was built with.
train_data.tokenizer.save_pretrained(OUT_DIR)

In [None]:
# Pack the export directory into ru_bert_cause/ru_bert_cause.tar.gz.
# The archive is created inside the directory that is being archived.
!cd ru_bert_cause && tar -czvf ru_bert_cause.tar.gz .

# Telegram pairs inference

In [None]:
!wget https://www.dropbox.com/s/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
!tar -xzvf tg_pairs.jsonl.tar.gz

--2021-07-31 14:38:07--  https://www.dropbox.com/s/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.82.18, 2620:100:6032:18::a27d:5212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.82.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz [following]
--2021-07-31 14:38:08--  https://www.dropbox.com/s/raw/u1f8zjgyuwvh4rr/tg_pairs.jsonl.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca9926be17b5b6caeab18871b26.dl.dropboxusercontent.com/cd/0/inline/BTVRc4UBpZTDpuk-v9WeTTsb-qnmwz53q6UUug3BGcsBRfgebqo6qWT6tSnhg2_06jCnTM1Mfi_nggaEHS8gfDGBBn3ajasQ5uVQ3aHhZys1TTwR1fmjdIFzX4SCYcJT7EIWzrWQXGodiph5nGaOaG_N/file# [following]
--2021-07-31 14:38:08--  https://uca9926be17b5b6caeab18871b26.dl.dropboxusercontent.com/cd/0/inline/BTVRc4UBpZTDpuk-v9WeTTsb-qnmwz53q6UUug3BGcsBRfgebqo6qWT6tSnh

In [None]:
!head -n 1 tg_pairs.jsonl

{"from_language": "en", "to_language": "en", "from_timestamp": 1587934800, "to_timestamp": 1587934800, "from_title": "Government Calls for Return of Premier League as Soon as Possible to Boost National Spirit", "to_title": "Premier League 'Project Restart': When Could the 19/20 Season Restart & Finish?", "from_url": "https://www.90min.com/posts/government-calls-for-return-of-premier-league-as-soon-as-possible-to-boost-national-spirit-01e6yc3w8ptr", "to_url": "https://www.90min.com/posts/premier-league-project-restart-when-could-the-19-20-season-restart-finish-01e6xwh1gp8v", "distance": 0.23587880211712153, "id": 2}


In [None]:
# Load the Telegram pairs, keep only those where both titles are Russian, and rename the
# from_*/to_* title and URL keys to the left_*/right_* names that NewsPairsDataset reads.
tg_records = []
with open("tg_pairs.jsonl", "r", encoding="utf-8") as tg_file:
    for line in tg_file:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        # Separate name for the parsed row so the open file handle is never rebound.
        record = json.loads(line)
        if not (record["from_language"] == record["to_language"] == "ru"):
            continue
        # Renamed in this order so the key order of the exported JSON stays as before.
        for old_key, new_key in (
            ("from_title", "left_title"),
            ("to_title", "right_title"),
            ("from_url", "left_url"),
            ("to_url", "right_url"),
        ):
            record[new_key] = record.pop(old_key)
        tg_records.append(record)

In [None]:
from tqdm.notebook import tqdm
from collections import Counter

# NewsPairsDataset expects (records, tokenizer, max_tokens, labels_count). The earlier call passed
# MAX_TOKENS as the tokenizer and the tokenizer *name* as max_tokens, which fails on first item access.
tg_pairs_data = NewsPairsDataset(tg_records, tokenizer, MAX_TOKENS, labels_count)

# One (label, confidence) tuple per Telegram pair, in tg_records order.
tg_labels = []
with torch.no_grad():
    for item in tqdm(tg_pairs_data):
        # Add a batch dimension of size 1 and move each tensor to the GPU.
        batch = {key: value.unsqueeze(0).cuda() for key, value in item.items()}
        outputs = model(**batch, return_dict=True)
        # The head is a single-label classifier over labels_count classes, so class
        # probabilities come from a softmax over the logits. A sigmoid of one logit
        # is not a probability over the classes.
        probs = torch.softmax(outputs.logits.squeeze(0), dim=-1)
        label = torch.argmax(probs).item()
        tg_labels.append((label, probs[label].item()))

# Attach the prediction to each record and count predictions per class.
labels_cntr = Counter()
for (label, prob), r in zip(tg_labels, tg_records):
    r["bert_label"] = label
    r["bert_confidence"] = prob
    labels_cntr[label] += 1

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_versi

HBox(children=(FloatProgress(value=0.0, max=55535.0), HTML(value='')))




In [None]:
print(labels_cntr.most_common())

[(0, 51149), (2, 3505), (1, 881)]


In [None]:
# Export the annotated records as JSON Lines. ensure_ascii=False writes Cyrillic as-is,
# so the file encoding is pinned to UTF-8 instead of depending on the platform default.
with open("ru_tg_pairs_with_bert.jsonl", "w", encoding="utf-8") as w:
    for r in tg_records:
        w.write(json.dumps(r, ensure_ascii=False).strip() + "\n")

In [None]:
!rm -rf ru_tg_pairs_with_bert.jsonl.tar.gz
!tar -czvf ru_tg_pairs_with_bert.jsonl.tar.gz ru_tg_pairs_with_bert.jsonl

ru_tg_pairs_with_bert.jsonl


# Single-sentence

In [None]:
# Turn causal pairs into single-title examples: each *_cause pair yields one record per title.
# For left_right_cause the left title gets label 0 and the right title label 1;
# for right_left_cause the labels are swapped. All other results are skipped.
#
# NOTE(review): this cell used to iterate an undefined `records` and compare an undefined
# `result`, so it raised NameError on a fresh kernel. The combined Russian + English records
# are used here as the source -- confirm this is the intended set.
singles = []
for r in ru_records + en_records:
    result = r["result"]
    if result == 'left_right_cause':
        left_label, right_label = 0, 1
    elif result == 'right_left_cause':
        left_label, right_label = 1, 0
    else:
        continue
    singles.append({'id': r['id'] + '_l', 'title': r['left_title'], 'timestamp': r['left_timestamp'], 'label': left_label})
    singles.append({'id': r['id'] + '_r', 'title': r['right_title'], 'timestamp': r['right_timestamp'], 'label': right_label})
print(len(singles))

In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NewsSinglesDataset(Dataset):
    """Map-style dataset that tokenizes a single ``title`` per record on access.

    Unlike a dataset that receives a ready tokenizer, this class loads its own
    tokenizer from ``model_name`` in the constructor.
    """

    def __init__(self, records, max_tokens, model_name, labels_count):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)
        self.records = records
        self.max_tokens = max_tokens
        # Stored only; no method of this class reads it.
        self.labels_count = labels_count

    def __len__(self):
        return len(self.records)

    def embed_record(self, record):
        """Tokenize the record's title into tensors of fixed length ``max_tokens``."""
        encoding = self.tokenizer(
            text=record["title"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        # In-place squeeze: removes dimension 0 of every returned tensor when it has size 1.
        for tensor in encoding.values():
            tensor.squeeze_(0)
        return encoding

    def __getitem__(self, index):
        record = self.records[index]
        item = self.embed_record(record)
        target = record.get("label", None)
        if target is None:
            # Unlabelled record: return features without a "labels" entry.
            return item
        item["labels"] = torch.tensor(target)
        return item