# Requirements

In [1]:
# !pip install --upgrade transformers bertviz checklist

# Data loading

In [2]:
# !rm -rf ru_news_cause_v1.tsv*
# !wget https://www.dropbox.com/s/kcxnhjzfut4guut/ru_news_cause_v1.tsv.tar.gz
# !tar -xzvf ru_news_cause_v1.tsv.tar.gz

In [3]:
# !cat ru_news_cause_v1.tsv | wc -l
# !head ru_news_cause_v1.tsv

# GPT-gen

In [1]:
from transformers import AutoModelWithLMHead, AutoTokenizer
device = 'cuda'
model_id = 'sberbank-ai/rugpt3small_based_on_gpt2' 
cache_dir = '/media/altsoph/Volume/_transformers_cache/'
 
model = AutoModelWithLMHead.from_pretrained(model_id, cache_dir=cache_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

2021-08-08 18:50:40.629383: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import csv

records = []
with open("ru_news_cause_v1.tsv", "r", encoding='utf-8') as r:
    reader = csv.reader(r, delimiter="\t")
    header = next(reader)
    for row in reader:
        r = dict(zip(header, row))
        if float(r["confidence"]) < 0.69:
            continue
        result = r["result"]
        mapping = {
            "left_right_cause": 0,
            "left_right_cancel": 1,
            "right_left_cause": 0,
            "right_left_cancel": 1
        }
        if result not in mapping:
            continue
        if result.startswith('right'):
            r['left_title'], r['right_title'] = r['right_title'], r['left_title']
        r["label"] = mapping[result]
        records.append(r)

In [3]:
# we need to set a padding token
tokenizer.vocab['<|endoftext|>']
tokenizer.pad_token = tokenizer.eos_token

In [4]:
import torch
from torch.utils.data import Dataset

class LineByLineTextDataset(Dataset):
    def __init__(self, records, max_tokens, tokenizer):
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.records = records

    def __len__(self):
        return len(self.records)

    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["left_title"]+'. '+record["right_title"]+'. <|endoftext|>',
            add_special_tokens=True,
            max_length=self.max_tokens,
            truncation="longest_first",
            padding="max_length",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        return output


In [5]:
import random
from collections import defaultdict

records_by_source = defaultdict(list)
for r in records:
    source = r["id"].split("_")[0]
    records_by_source[source].append(r)

train_records, val_records, test_records = [], [], []
for _, source_records in records_by_source.items():
    source_records.sort(key=lambda x: min(x["left_timestamp"], x["right_timestamp"]))
    val_border = int(0.8 * len(source_records))
    test_border = int(0.9 * len(source_records))
    train_records.extend(source_records[:val_border])
    val_records.extend(source_records[val_border:test_border])
    test_records.extend(source_records[test_border:])

print(len(train_records))
print(len(val_records))
print(len(test_records))

1174
147
148


In [6]:
from torch.utils.data import DataLoader, RandomSampler

MAX_TOKENS = 80

train_data = LineByLineTextDataset(train_records, MAX_TOKENS, tokenizer)
val_data = LineByLineTextDataset(val_records, MAX_TOKENS, tokenizer)

In [7]:
for item in train_data:
    print(item)
    break

{'input_ids': tensor([ 4408,  1865,   701, 48450,   783,   281, 29075, 30025, 29111,   354,
        20886, 46404, 21291,  9621,  2198, 38639, 46792,    18, 38466, 20886,
         4929, 35447,   334,  5083, 48450,   553,    16,   282,  1754, 13731,
        30025,   505, 21291,  9621,  2198, 38639, 46792,    18,   225, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])}


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling 

EPOCHS = 6
EVAL_STEPS = 10 #*8
WARMUP_STEPS = 5 # *8
LR = 3e-05
BATCH_SIZE = 128//8 # 
GRAD_ACCUM_STEPS = 1*8

training_args = TrainingArguments(
    output_dir="./gpt2-gen1", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
#     report_to="wandb",  # enable logging to W&B
#     run_name="newscausation_gptgen",
    prediction_loss_only=True,
    load_best_model_at_end=True
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,    
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()


***** Running training *****
  Num examples = 1174
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 54


Step,Training Loss,Validation Loss
10,3.4196,2.831861
20,2.8191,2.671859
30,2.5671,2.621898
40,2.4347,2.596229
50,2.3567,2.593441


***** Running Evaluation *****
  Num examples = 147
  Batch size = 16
Saving model checkpoint to ./gpt2-gen1/checkpoint-10
Configuration saved in ./gpt2-gen1/checkpoint-10/config.json
Model weights saved in ./gpt2-gen1/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 147
  Batch size = 16
Saving model checkpoint to ./gpt2-gen1/checkpoint-20
Configuration saved in ./gpt2-gen1/checkpoint-20/config.json
Model weights saved in ./gpt2-gen1/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 147
  Batch size = 16
Saving model checkpoint to ./gpt2-gen1/checkpoint-30
Configuration saved in ./gpt2-gen1/checkpoint-30/config.json
Model weights saved in ./gpt2-gen1/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 147
  Batch size = 16
Saving model checkpoint to ./gpt2-gen1/checkpoint-40
Configuration saved in ./gpt2-gen1/checkpoint-40/config.json
Model weights saved in ./gpt2-gen1/checkpoint-40/pytorch_model.

TrainOutput(global_step=54, training_loss=2.6876883153562194, metrics={'train_runtime': 97.7637, 'train_samples_per_second': 72.051, 'train_steps_per_second': 0.552, 'total_flos': 422100675624960.0, 'train_loss': 2.6876883153562194, 'epoch': 5.97})

In [47]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50264, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [9]:
import numpy as np

np.random.seed(1337)

new_titles = []
with open("titles.txt", "r", encoding='utf-8') as r:
    for line in r:
        new_titles.append( line.strip() )
print(len(new_titles))

np.random.shuffle(new_titles)

118372


In [31]:
from transformers import pipeline
# gpt2-gen1/checkpoint-50
# chef = pipeline('text-generation',model='./gpt2-gerchef', tokenizer='anonymous-german-nlp/german-gpt2',config={'max_length':800})

gen = pipeline('text-generation',model=model, return_full_text=False,
                tokenizer=tokenizer,config={'max_length':MAX_TOKENS*2},device=0)

for item in new_titles[:30]:
    print(item)
    print(gen(item+'.'))
    print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Сотрудники ФСБ поймали заработавшую на иностранцах 10 миллионов рублей ОПГ
[{'generated_text': ' В России объяснили задержа'}]

Бывшую жену Грачевского обвинили в ранней смерти режиссера


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' Грачевский отреагировал на обвинения в ранней смерти жены'}]

В московском роддоме из пациенток «выдавливали» детей
[{'generated_text': ' Родственники пациенток'}]

Россиянам разъяснили новые правила оформления выплат на детей
[{'generated_text': ' В Госдуме прокомментировали нововведение о выплатах'}]

Россиянин похитил миллиард рублей и поплатился


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' Раскрыты подробности похищения россиянином миллиарда рублей'}]

Смертоносная кобра спряталась в кресло и напугала севшего в него мужчину
[{'generated_text': ' '}]

Описано новое осложнение при коронавирусе
[{'generated_text': ' Появились подробности о новом осложнении при корона'}]

США пошли на экстренные меры из-за коронавируса


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В России отреагировали на отмену'}]

В российском центре по борьбе с алкоголизмом пациентов били и держали на цепи
[{'generated_text': ' В российском центре по борьбе'}]

Раскрыты правила проживания в одной квартире с зараженным коронавирусом
[{'generated_text': ' В России объяснили правила проживания'}]

Дизайнер снял в рекламе свою бабушку в пиджаке на голое тело
[{'generated_text': ' Раскрыты подробности'}]

Роспотребнадзор сообщил о «замедлении снижения» заболеваемости COVID-19 в России


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 22, but ``max_length`` is set to 20.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 20, but ``max_length`` is set to 20.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' '}]

Глава белорусских католиков назвал изгнание из страны своим крестом
[{'generated_text': ' В Беларуси отреагировали на изгнание из'}]

Программа «Архитекторы.рф» запустила новый курс по урбанистике
[{'generated_text': ' '}]

«Ужасную» квартиру Ефремова оценили в 40 миллионов рублей


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В Госдуме оценили квартиру Ефремова'}]

Посол России подарил Лукашенко карту Белоруссии в Российской империи
[{'generated_text': ' Лукашенко ответил на подарок Белоруссии в России. '}]

Россиян задумали избавить от страховых платежей по ипотеке


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 21, but ``max_length`` is set to 20.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 20, but ``max_length`` is set to 20.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В России оценили идею отказаться от выплат'}]

ПСБ профинансирует социально значимые проекты Краснодарского края
[{'generated_text': ' В Краснодарском крае оценили идею профи'}]

Полицейские отобрали деньги у сына главреда Cosmopolitan со словами «Стой, сука»
[{'generated_text': ' В'}]

206-килограммовая женщина похудела на 138 килограммов и раскрыла секрет успеха
[{'generated_text': ' Ра'}]

Китайским строителям в России подыскали замену


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В России оценили слова Трампа о «сверст'}]

Путин раскрыл суть изменений в Конституции
[{'generated_text': ' Путин объяснил слова о «перестройке» в Конституции. '}]

Российская учительница исполнила эротический танец в клубе и уволилась из школы


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В школе объяснили увольнение'}]

Россиян предупредили о погодных аномалиях в течение недели
[{'generated_text': ' В России объяснили опасения россиян о погодных аном'}]

На поддержку рыбохозяйств в Подмосковье выделено 22 миллиона рублей


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В Подмосковье оценили помощь рыбохозяй'}]

В Раде спрогнозировали будущее Украины на фоне коронавируса
[{'generated_text': ' Раскрыты подробности'}]

Американцам предложат включить в рацион мясо питонов
[{'generated_text': ' В России отреагировали на предложение включить в рацион'}]

Подсчитан заработок торгующих на бирже россиян


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': ' В России оценили результаты подсчета заработка торгующих'}]

KFC откроет новый вид ресторанов в России
[{'generated_text': ' В России объяснили закрытие ресторанов в России.  '}]

Минздрав России одобрил еще один препарат от коронавируса
[{'generated_text': ' В Минздраве отреагировали на предложение'}]



## ^ standard pipeline performs pourly, ending sentences too fast

## let's play with some inference parameters and make a lot of candidates to select from

In [45]:
import transformers
transformers.logging.set_verbosity_error()

def sample(model, tokenizer, prefix, n):
    prefix_tokens = torch.LongTensor(tokenizer.encode(prefix+'.')).to(device)[None]
    preds = model.generate(
        prefix_tokens,
#         num_beams=5, 
#         no_repeat_ngram_size=2, 
#         early_stopping=False
#         max_length=50, 
        num_return_sequences=n, 
        do_sample=True, 
        top_k=0,
        temperature=0.7,
        top_p=0.92,
    )
    return [tokenizer.decode(preds[r].cpu().numpy()) for r in range(n)]

def simple_filter(items):
    return list(set([item.split('.')[1]+'.' for item in items if item.count('.')>1 and item.split('.')[1].count(' ')>4]))

for item in new_titles[:30]:
    if '.' in item: continue
    for _ in range(10):
        res = simple_filter(sample(model, tokenizer, item, 64))
        if res:
            print(item)
            for r in res:
                print(f'    => {r}')
            print()
            break


Бывшую жену Грачевского обвинили в ранней смерти режиссера
    =>  Жена режиссера рассказала о своей скорой смерти.
    =>  Грачевский ответил на обвинения в ранней смерти.
    =>  Раскрыты подробности смерти жены Грачевского.
    =>  Раскрыты подробности смерти режиссера Грачевского.
    =>  Раскрыта личность бывшего мужа Грачевского.
    =>  Раскрыта личность бывшей жены Грачевского.
    =>  Грачевский опроверг обвинения в ранней смерти режиссера.
    =>  В России назвали причину смерти режиссера Грачевского.
    =>  Раскрыто имя бывшей жены Грачевского.

Россиянам разъяснили новые правила оформления выплат на детей
    =>  В России объяснили новую систему выплат на детей.
    =>  У россиян появилась возможность оформить выплаты на детей.
    =>  РФ одобрила введение новых правил для россиян.
    =>  Появились новые правила оформления выплат на детей.
    =>  Власти объяснили новые правила оформления выплат на детей.
    =>  В России объяснили правила оформления выплат на детей.

Рос

Россиян предупредили о погодных аномалиях в течение недели
    =>  У жителей России выросли тарифы на электричество.

На поддержку рыбохозяйств в Подмосковье выделено 22 миллиона рублей
    =>  В Подмосковье задержан высокопоставленный чиновник.

Американцам предложат включить в рацион мясо питонов
    =>  В России объяснили отказ от мяса питонов.
    =>  Запрет на мясо питонов объяснили.
    =>  Назван новый тип мяса питонов.
    =>  Появились подробности отказа от мяса питонов.
    =>  В России предложили отказаться от мяса питонов.
    =>  Российские ученые объяснили отказ от мяса питонов.
    =>  Названа причина запрета на мясо питонов.
    =>  США ответили на предложение запретить мясо питонов.

Подсчитан заработок торгующих на бирже россиян
    =>  Задержаны участники торгов на бирже.
    =>   Зарплату россиян назвали обманом.
    =>  Заработок россиян на бирже россиян упал.
    =>  В России оценили заработки продавцов на бирже.
    =>  В Госдуме оценили работу россиян за счет ре

In [48]:
with open('gpt-gen-raw.tsv', 'a', encoding='utf-8') as ofh:
    for idx, item in enumerate(new_titles):
        if not idx%1000: print(idx, item)
        if '.' in item: continue
        for _ in range(10):
            res = simple_filter(sample(model, tokenizer, item, 64))
            if res:
                for r in res:
                    print(f'{item}\t{r}', file=ofh, flush=True)
                break


0 Состояние итальянского футболиста после кровоизлияния в мозг назвали критическим
1000 Пандемия коронавируса вызвала эпидемию психических расстройств
2000 Кинокритики выбрали лучший фильм 2019 года
3000 Мужчина попытался проникнуть в здание методом Санта-Клауса и застрял
4000 ФСБ изъяла более 100 золотых слитков при обысках по делу о хищении медпрепаратов


KeyboardInterrupt: 

#### I've stopped this after ~5K left titles (but not each of them has any results) 
#### Now we have ~39K generated pairs with 3K uniq left titles

## One of the problems -- sometimes it uses corefs from right side to the left one.
## I.e.:
#### США пошли на экстренные меры из-за коронавируса  =>  В России объяснили такое поведение США.
## The easiest solution for this and other problems -- to train an external filter-discriminator. We'll use only right parts

In [18]:
import csv
import numpy as np

rights = []
with open("ru_news_cause_v1.tsv", "r", encoding='utf-8') as r:
    reader = csv.reader(r, delimiter="\t")
    header = next(reader)
    for row in reader:
        r = dict(zip(header, row))
        if float(r["confidence"]) < 0.69:
            continue
        result = r["result"]
        if result == 'left_right_cause':
            rights.append( {'title':r['right_title'], 'label':1 } )
        elif result == 'left_right_cancel':
            rights.append( {'title':r['right_title'], 'label':1 } )
        elif result == 'right_left_cause':
            rights.append( {'title':r['left_title'], 'label':1 } )
        elif result == 'right_left_cancel':
            rights.append( {'title':r['left_title'], 'label':1 } )
print(len(rights))

fake_rights = []
for line in open('gpt-gen-raw.tsv', encoding='utf-8'):
    try:
        left, right = line.strip('\n').split('\t')
    except:
        continue
    fake_rights.append( right.strip().strip('.') )
    
np.random.seed(1337)
np.random.shuffle(fake_rights)
print(len(fake_rights))
print(fake_rights[:10])

sample_size = len(rights)
for item in fake_rights[:sample_size]:
    rights.append( {'title':item, 'label':0 } )
print(len(rights))
np.random.shuffle(rights)

1469
39327
['Глава ЦБ отреагировал на снижение евро', 'В Свердловской области опровергли информацию о ремонте дорог', 'Россия ответила на запуск поезда в Крым', 'Россия отреагировала на отмену западных санкций', 'Песков отреагировал на слова о частной клинике', 'Россия отреагировала на снижение курса евро до 90 рублей', 'Армия Азербайджана заняла Агдамский район', 'Иран обвинил США в нарушении воздушного пространства', 'В США прокомментировали обращение Мясникова к людям', 'В России оценили рост цен на нефть']
2938


In [19]:
import torch
from torch.utils.data import Dataset

class NewsSinglesDataset(Dataset):
    def __init__(self, records, max_tokens, model_name, labels_count):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=False
        )
        self.max_tokens = max_tokens
        self.records = records
        self.labels_count = labels_count
    
    def __len__(self):
        return len(self.records)
    
    def embed_record(self, record):
        inputs = self.tokenizer(
            text=record["title"],
            add_special_tokens=True,
            max_length=self.max_tokens,
            padding="max_length",
            truncation="longest_first",
            return_tensors='pt'
        )
        for key, value in inputs.items():
            value.squeeze_(0)
        return inputs
    
    def __getitem__(self, index):
        record = self.records[index]
        output = self.embed_record(record)
        label = record.get("label", None)
        if label is not None:
            output["labels"] = torch.tensor(label)
        return output

In [20]:
rights_train_records, rights_val_records, rights_test_records = [], [], []
val_border = int(0.8 * len(rights))
test_border = int(0.9 * len(rights))
rights_train_records.extend(rights[:val_border])
rights_val_records.extend(rights[val_border:test_border])
rights_test_records.extend(rights[test_border:])

print(len(rights_train_records))
print(len(rights_val_records))
print(len(rights_test_records))


2350
294
294


In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
DISCR_MODEL_NAME = "DeepPavlov/rubert-base-cased"
DISCR_TOKENIZER_NAME = DISCR_MODEL_NAME
device = 'cuda'
cache_dir = '/media/altsoph/Volume/_transformers_cache/'
MAX_TOKENS = 80

rights_model = AutoModelForSequenceClassification.from_pretrained(DISCR_MODEL_NAME, num_labels=2, cache_dir=cache_dir)
rights_model = rights_model.to("cuda")

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /media/altsoph/Volume/_transformers_cache/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_v

In [22]:
rights_train_data = NewsSinglesDataset(rights_train_records, MAX_TOKENS, DISCR_TOKENIZER_NAME, 2)
rights_val_data = NewsSinglesDataset(rights_val_records, MAX_TOKENS, DISCR_TOKENIZER_NAME, 2)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transforme

In [23]:
from transformers import Trainer, TrainingArguments

EPOCHS = 5
EVAL_STEPS = 10 #*8
WARMUP_STEPS = 5 # *8
LR = 3e-05
BATCH_SIZE = 128//4 # 
GRAD_ACCUM_STEPS = 1*4

discr_training_args = TrainingArguments(
    output_dir="rights_discriminator",
    overwrite_output_dir=True, #overwrite the content of the output directory
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    report_to="none",
#     report_to="wandb",  # enable logging to W&B
#     run_name="newscausation_basetask",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=rights_model,
    args=discr_training_args,
    train_dataset=rights_train_data,
    eval_dataset=rights_val_data
)

trainer.train()

PyTorch: setting up devices
***** Running training *****
  Num examples = 2350
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 90


Step,Training Loss,Validation Loss
10,0.6382,0.488425
20,0.4861,0.378773
30,0.3916,0.334482
40,0.3516,0.321026
50,0.2813,0.297469
60,0.2422,0.279285
70,0.227,0.28004
80,0.2096,0.264115
90,0.1781,0.270699


***** Running Evaluation *****
  Num examples = 294
  Batch size = 32
Saving model checkpoint to rights_discriminator/checkpoint-10
Configuration saved in rights_discriminator/checkpoint-10/config.json
Model weights saved in rights_discriminator/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 294
  Batch size = 32
Saving model checkpoint to rights_discriminator/checkpoint-20
Configuration saved in rights_discriminator/checkpoint-20/config.json
Model weights saved in rights_discriminator/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 294
  Batch size = 32
Saving model checkpoint to rights_discriminator/checkpoint-30
Configuration saved in rights_discriminator/checkpoint-30/config.json
Model weights saved in rights_discriminator/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 294
  Batch size = 32
Saving model checkpoint to rights_discriminator/checkpoint-40
Configuration saved in rights_disc

TrainOutput(global_step=90, training_loss=0.33397502899169923, metrics={'train_runtime': 177.2207, 'train_samples_per_second': 66.302, 'train_steps_per_second': 0.508, 'total_flos': 999175038005760.0, 'train_loss': 0.33397502899169923, 'epoch': 4.97})

In [29]:
rights_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [25]:
test_data = NewsSinglesDataset(rights_test_records, MAX_TOKENS, DISCR_TOKENIZER_NAME, 2)
y_true = [item["labels"].item() for item in test_data]
y_true = np.array(y_true, dtype=np.int32)
print(y_true)


loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transforme

[0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1
 1 1 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 1 1 0 0 0 1 0
 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1
 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 1 1 0 0
 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1
 1 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1]


In [26]:
y_pred = []
with torch.no_grad():
    for item in test_data:
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = rights_model(**item, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        y_pred.append(pred)
y_pred = np.array(y_pred)
print(y_pred)

[0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 1
 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 1 1 0 1
 0 1 1 0 1 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0 1 1 0
 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1
 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0
 1 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 1
 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 1]


In [27]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_true, y_pred))
confusion_matrix(y_true, y_pred)


              precision    recall  f1-score   support

           0       0.92      0.89      0.90       154
           1       0.88      0.91      0.90       140

    accuracy                           0.90       294
   macro avg       0.90      0.90      0.90       294
weighted avg       0.90      0.90      0.90       294



array([[137,  17],
       [ 12, 128]])

In [28]:
raw_pairs = []
markup_inputs = []
for line in open('gpt-gen-raw.tsv', encoding='utf-8'):
    try:
        left, right = line.strip('\n').split('\t')
    except:
        continue
    raw_pairs.append( (left,right) )
    markup_inputs.append( {'title':right.strip().strip('.'), 'label':0 }  )
   

infer_data = NewsSinglesDataset(markup_inputs, MAX_TOKENS, DISCR_TOKENIZER_NAME, 2)

loading configuration file https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/config.json from cache at /home/altsoph/.cache/huggingface/transformers/a43261a78bd9edbbf43584c6b00aa94c032301840e532839cb5989362562a5d5.e8f15c5aad2f4653e46ceeba0bb32c02a629d106a902c964bce60523d290ac8f
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transforme

In [31]:
y_pred = []
with torch.no_grad():
    for item in infer_data:
        for key, value in item.items():
            item[key] = value.unsqueeze_(0).cuda()
        outputs = rights_model(**item, return_dict=True)
        pred = torch.argmax(outputs.logits).item()
        y_pred.append(pred)
y_pred = np.array(y_pred)
print(y_pred)

[0 1 0 ... 0 0 1]


In [33]:
print(len(raw_pairs))
print(len(y_pred))
print(np.sum(y_pred))

39327
39327
4118


In [35]:
with open('gpt-gen-filtered-rights.tsv', 'a', encoding='utf-8') as ofh:
    for label,pair in zip(y_pred, raw_pairs):
        if label:
            print(f'{pair[0]}\t{pair[1]}', file=ofh, flush=True)


## check the generated pairs


In [1]:
from transformers import pipeline
import random
import torch
import numpy as np
import os
import random
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def pipe_predict(data, batch_size=64):
    raw_preds = pipe(data, batch_size=batch_size)
    preds = np.array([int(max(labels, key=lambda x: x["score"])["label"][-1]) for labels in raw_preds])
    pp = np.array([[l["score"] for l in labels] for labels in raw_preds])
    return preds, pp
    
set_random_seed(1337)

# MODEL_NAME = "DeepPavlov/rubert-base-cased"
MODEL_NAME = "xlm-roberta-large"
TOKENIZER_NAME = MODEL_NAME

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
MODEL_NAME = "./bertcause-task0/checkpoint-192"
MAX_TOKENS = 80
cache_dir = '/media/altsoph/Volume/_transformers_cache/'

# train_records = ru_train_records # + en_train_records
# val_records = ru_val_records #+ en_val_records
# random.shuffle(train_records)

# train_data = NewsPairsDataset(train_records, tokenizer, MAX_TOKENS, labels_count)
# val_data = NewsPairsDataset(val_records, tokenizer, MAX_TOKENS, labels_count)
3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model = model.to("cuda")


model.eval()
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, framework="pt", device=0, return_all_scores=True)

infer_records = []
for line in open('gpt-gen-filtered-rights.tsv', encoding='utf-8'):
    left, right = line.strip().split('\t')
    right = right.strip().strip('.')
    infer_records.append( {'left_title': left, 'right_title': right, 'label':-1})
# y_pred = pipe_predict()[0]
# infer_data = NewsPairsDataset(infer_records, tokenizer, MAX_TOKENS, labels_count)
# y_pred
# infer_records
# ru_test_pairs
infer_pairs = [(r["left_title"], r["right_title"]) for r in infer_records]
# infer_pairs
ru_y_pred = pipe_predict(infer_pairs)[0]
ru_y_pred

  '"sox" backend is being deprecated. '
2021-08-09 10:03:56.149129: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


array([2, 2, 0, ..., 0, 0, 2])

In [2]:
from collections import Counter
Counter(ru_y_pred)

Counter({2: 2153, 0: 1823, 1: 142})

In [8]:
code2label = {0:'other',1:'right2left',2:'left2right'}
with open('gpt-gen-filtered-rights-markup.tsv', 'a', encoding='utf-8') as ofh:
    for label, pair in zip(ru_y_pred, infer_pairs):
        print(f"{pair[0]}\t{pair[1]}\t{code2label[label]}", file=ofh, flush=True)
    