# Requirements

In [1]:
# !pip install --upgrade transformers bertviz checklist

# Data loading

In [2]:
# !rm -rf ru_news_cause_v1.tsv*
# !wget https://www.dropbox.com/s/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz
# !tar -xzvf ru_news_cause_v1.tsv.tar.gz

In [3]:
# !cat ru_news_cause_v1.tsv | wc -l
# !head ru_news_cause_v1.tsv

# BertCause

## Training

In [1]:
import csv

records = []
with open("ru_news_cause_v1.tsv", "r", encoding='utf-8') as r:
    reader = csv.reader(r, delimiter="\t")
    header = next(reader)
    for row in reader:
        r = dict(zip(header, row))
        if float(r["confidence"]) < 0.69:
            continue
        result = r["result"]
        mapping = {
#             "bad": 0,
#             "rel": 0,
#             "same": 0,
            "left_right_cause": 0,
            "left_right_cancel": 1,
            "right_left_cause": 0,
            "right_left_cancel": 1
        }
        if result not in mapping:
            continue
        if result.startswith('right'):
            r['left_title'], r['right_title'] = r['right_title'], r['left_title']
        r["label"] = mapping[result]
        records.append(r)

In [2]:
from collections import Counter

labels_counter = Counter([r["label"] for r in records])
labels_count = len(labels_counter)
labels_counter

Counter({1: 162, 0: 1307})

In [3]:
import numpy as np
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from collections import Counter, defaultdict

class NewsPairsDataset(Dataset):
    def __init__(self, records, max_tokens, model_name, labels_count):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=False
        )
        self.max_tokens = max_tokens
        self.records = records
        labels_counter = Counter([r["label"] for r in self.records])
        min_class = min(labels_counter.values())
        new_labels_counter = defaultdict(int)
        new_records = []
        np.random.shuffle(self.records)
        for r in self.records:
            if new_labels_counter[r['label']] < min_class:
                new_labels_counter[r['label']] += 1
                new_records.append(r)
        self.records = new_records
        print(Counter([r["label"] for r in self.records]))
        self.labels_count = labels_count
    
    def __len__(self):
        return len(self.records)
    
    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["left_title"],
            text_pair=record["right_title"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        label = record.get("label", None)
        if label is not None:
            output["labels"] = torch.tensor(label)
        return output

2021-08-01 15:06:53.557829: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [5]:
import random
from collections import defaultdict

records_by_source = defaultdict(list)
for r in records:
    source = r["id"].split("_")[0]
    records_by_source[source].append(r)

train_records, val_records, test_records = [], [], []
for _, source_records in records_by_source.items():
    source_records.sort(key=lambda x: min(x["left_timestamp"], x["right_timestamp"]))
    val_border = int(0.8 * len(source_records))
    test_border = int(0.9 * len(source_records))
    train_records.extend(source_records[:val_border])
    val_records.extend(source_records[val_border:test_border])
    test_records.extend(source_records[test_border:])

print(len(train_records))
print(len(val_records))
print(len(test_records))

1174
147
148


In [6]:
MODEL_NAME = "DeepPavlov/rubert-base-cased"
TOKENIZER_NAME = MODEL_NAME
MAX_TOKENS = 80

In [7]:
from torch.utils.data import DataLoader, RandomSampler

train_data = NewsPairsDataset(train_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
val_data = NewsPairsDataset(val_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)

Counter({0: 128, 1: 128})
Counter({0: 15, 1: 15})


In [8]:
for item in train_data:
    print(item)
    break

{'input_ids': tensor([  101, 70852, 20306, 50502, 45065,  2237,   612, 42630,  3005,  7897,
          842,   845,  8349,   102, 23269, 82900, 45065,  2237,   612, 42630,
         3005,  7897,   842,   845,  8349,  1703,   130,  1758, 31149, 75325,
         7957,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=labels_count)
model = model.to("cuda")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
!rm -rf checkpoints

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from transformers import Trainer, TrainingArguments

EPOCHS = 6
EVAL_STEPS = 10 #*8
WARMUP_STEPS = 5 # *8
LR = 3e-05
BATCH_SIZE = 128//8 # 
GRAD_ACCUM_STEPS = 1*8

EPOCHS = 30
EVAL_STEPS = 5 #*8


training_args = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
#     report_to="none",
    report_to="wandb",  # enable logging to W&B
    run_name="newscausation_cancel_balanced",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()

using `logging_steps` to initialize `eval_steps` to 5
PyTorch: setting up devices
***** Running training *****
  Num examples = 256
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 60
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
5,0.5328,0.460848
10,0.3345,0.279271
15,0.2063,0.200141
20,0.1547,0.138652
25,0.1288,0.37579
30,0.1089,0.35493
35,0.1013,0.198353
40,0.0853,0.113289
45,0.0735,0.088369
50,0.0625,0.069518


***** Running Evaluation *****
  Num examples = 30
  Batch size = 16
Saving model checkpoint to checkpoints/checkpoint-5
Configuration saved in checkpoints/checkpoint-5/config.json
Model weights saved in checkpoints/checkpoint-5/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16
Saving model checkpoint to checkpoints/checkpoint-10
Configuration saved in checkpoints/checkpoint-10/config.json
Model weights saved in checkpoints/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16
Saving model checkpoint to checkpoints/checkpoint-15
Configuration saved in checkpoints/checkpoint-15/config.json
Model weights saved in checkpoints/checkpoint-15/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 30
  Batch size = 16
Saving model checkpoint to checkpoints/checkpoint-20
Configuration saved in checkpoints/checkpoint-20/config.json
Model weights saved in checkpoints/checkpoint-20/pytorch_model.bin
***

TrainOutput(global_step=60, training_loss=0.15522093524535496, metrics={'train_runtime': 242.0035, 'train_samples_per_second': 31.735, 'train_steps_per_second': 0.248, 'total_flos': 315733266432000.0, 'train_loss': 0.15522093524535496, 'epoch': 30.0})

## Inference

In [14]:
import numpy as np

test_data = NewsPairsDataset(test_records, MAX_TOKENS, TOKENIZER_NAME, labels_count)
y_true = [item["labels"].item() for item in test_data]
y_true = np.array(y_true, dtype=np.int32)
print(y_true)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transforme

Counter({0: 19, 1: 19})
[0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1]


In [15]:
y_pred = []
with torch.no_grad():
    for item in test_data:
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = model(**item, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        y_pred.append(pred)
y_pred = np.array(y_pred)
print(y_pred)

[0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1]


In [16]:
from sklearn.metrics import classification_report, balanced_accuracy_score, confusion_matrix
print(classification_report(y_true, y_pred))
print('balanced_accuracy_score', balanced_accuracy_score(y_true, y_pred))
print('\nconfusion_matrix\n',confusion_matrix(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90        19
           1       0.94      0.84      0.89        19

    accuracy                           0.89        38
   macro avg       0.90      0.89      0.89        38
weighted avg       0.90      0.89      0.89        38

balanced_accuracy_score 0.894736842105263

confusion_matrix
 [[18  1]
 [ 3 16]]


## Interpretation

### Errors

In [17]:
for i, r in enumerate(test_records):
    mapping = {
        0: "cause",
        1: "cancel",
#         2: "right_left"
    }
    if y_true[i] != y_pred[i]:
        print("LEFT:", r["left_title"])
        print("RIGHT:", r["right_title"])
        print("LABELS: true:{}, pred:{}".format(mapping[y_true[i]], mapping[y_pred[i]]))
        print()

LEFT: Первый канал изменит формат вечернего шоу из-за болезни Урганта
RIGHT: Ургант вернулся на ТВ и рассказал о последствиях коронавируса
LABELS: true:cancel, pred:cause

LEFT: Россиянин Немков защитил титул чемпиона Bellator
RIGHT: Россиянин Немков оказался в больнице после защиты чемпионского титула Bellator
LABELS: true:cause, pred:cancel

LEFT: Самая дешевая рыба в России станет дороже
RIGHT: Россиян попросили не верить сообщениям о росте цен на самую дешевую рыбу
LABELS: true:cancel, pred:cause

LEFT: Тарасова назвала Роднину дурой
RIGHT: Роднина ответила назвавшей ее дурой Тарасовой
LABELS: true:cancel, pred:cause



IndexError: index 38 is out of bounds for axis 0 with size 38