<a href="https://colab.research.google.com/github/Gaussiandra/AI-21-qual/blob/main/solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install and enable the Neptune and ipywidgets notebook extensions.
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# uses whichever `pip` executable happens to be first on PATH.
%pip install neptune-notebooks
!jupyter nbextension enable --py neptune-notebooks
%pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Libraries imported further down in the notebook. `%pip` (instead of `!pip`)
# guarantees they land in the environment of the running kernel.
%pip install transformers
%pip install catalyst
%pip install neptune-client

In [None]:
# Report host memory (in MiB) and the GPU state of the machine running the kernel.
!free -m
!nvidia-smi

              total        used        free      shared  buff/cache   available
Mem:        1546833       78117       87587         733     1381128     1461338
Swap:             0           0           0
Sun Feb 28 17:06:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.165.02   Driver Version: 418.165.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:5E:00.0 Off |                    0 |
| N/A   49C    P0    51W / 350W |      0MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------------

In [None]:
# Root directory; the data files and the log directory are addressed relative to it.
global_path = '/home/jovyan/AI_21/'

# Settings of the run. The same dict is handed to the Neptune logger as `params`.
experiment_params = dict(
    MODEL_NAME='DeepPavlov/rubert-base-cased',  # checkpoint for tokenizer and encoder
    MAX_LEN=512,                  # every encoded sample is padded to this length
    EFFECTIVE_BATCH_SIZE=16,      # divided by BATCH_SIZE to get accumulation steps
    BATCH_SIZE=16,                # per-step batch size of all DataLoaders
    RANDOM_SEED=665,
    NUM_CLASSES=2,                # size of the classification head
    LEARNING_RATE=5e-5,
    CHANGE_QUESTION_PROBA=0.05,   # chance of swapping in a random question
    CHANGE_ANSWER_PROBA=0.05,     # chance of swapping in a random answer
)

In [None]:
import json
import os
import sys
import re
import random
from tqdm.auto import tqdm
from collections import defaultdict

import numpy as np
import torch
import torch.nn as nn
import neptune
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from catalyst.dl import SupervisedRunner
from catalyst.core.callback import Callback, CallbackNode, CallbackOrder
from catalyst.utils import prepare_cudnn, set_global_seed
from catalyst import utils, dl

In [None]:
def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one object per non-empty line.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and decoded explicitly as UTF-8 so the result does not
    depend on the platform's default encoding.
    """
    with open(path, encoding='utf-8') as jsonl_file:
        # Blank lines (e.g. a stray trailing newline) are skipped instead of
        # crashing json.loads.
        return [json.loads(line) for line in jsonl_file if line.strip()]


raw_train = _read_jsonl(global_path+'data/train.jsonl')
raw_test = _read_jsonl(global_path+'data/test.jsonl')
raw_val = _read_jsonl(global_path+'data/val.jsonl')

### Statistics

In [None]:
# Number of passages in the train / test / validation splits.
tuple(len(split) for split in (raw_train, raw_test, raw_val))

(500, 322, 100)

In [None]:
# Peek at one raw training passage; its sentences are prefixed with "(N)" markers.
raw_train[43]['passage']['text']

'"(1) Лидер незарегистрированной партии "Другая Россия" Эдуард Лимонов обратился в Конституционный суд РФ с жалобой на новый закон о митингах. (2) Об этом 9 октября сообщает агентство "Интерфакс". (3) "У меня есть слабая надежда. (4) Закон о митингах противоречит Конституции", - заявил Лимонов. (5) По его мнению, среди неконституционных пунктов закона - запрет на организацию митингов людьми, которые более двух раз привлекались к административной ответственности за нарушение правил предвыборной агитации, неповиновение полиции, хулиганство, блокирование движения транспорта или производство экстремистских материалов. (6) Закон о митингах был применён лично против Лимонова. (7) Его оштрафовали на 12 тысяч рублей за акцию на Триумфальной площади 31 июля. (8) Федеральный закон "О собраниях, митингах, демонстрациях, шествиях и пикетированиях" вступил в силу 9 июня 2012 года. (9) О намерении оспорить закон в КС практически сразу объявили думские фракции "Справедливой России" и КПРФ. (10) В ито

In [None]:
# Flatten the training split once, then derive all counters from it.
train_questions = [
    question
    for sample in raw_train
    for question in sample['passage']['questions']
]
n_questions = len(train_questions)
n_answers = sum(len(question['answers']) for question in train_questions)
n_corrects = sum(
    answer['label']
    for question in train_questions
    for answer in question['answers']
)
# Passage lengths are measured in characters.
texts_lens = [len(sample['passage']['text']) for sample in raw_train]

# questions, answer options, answer options per correct answer
print(n_questions, n_answers, n_answers / n_corrects)
print(np.mean(texts_lens))

2897 11950 2.220364176885916
1380.538


### Datasets

In [None]:
class QADataset(Dataset):
    """Passage / question / answer triplets encoded as fixed-length model inputs.

    Every answer option of every question in ``samples`` becomes one item.
    An item is the tokenizer output for ``passage [SEP] question [SEP] answer
    [SEP]``, with the passage truncated so the whole sequence fits into
    ``max_len`` and every field padded to exactly ``max_len``.

    Args:
        samples: iterable of dicts with a ``passage`` entry holding ``text``
            and ``questions``; each question holds ``question`` and
            ``answers``; each answer holds ``text`` and optionally ``label``.
        model_name: name passed to ``AutoTokenizer.from_pretrained``.
        max_len: length every returned tensor is padded to.
        **kwargs: when both ``CHANGE_QUESTION_PROBA`` and
            ``CHANGE_ANSWER_PROBA`` are given, labeled items are randomly
            augmented: the question and/or the answer is replaced by one
            drawn from the whole dataset and the target is set to 0.
    """

    def __init__(
        self,
        samples,
        model_name,
        max_len,
        **kwargs
    ):
        self.model_name = model_name
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.kwargs = kwargs

        # text, question, answer
        self.raw_triplets = []
        self.labels = []

        for sample in samples:
            text = sample['passage']['text']
            # Strip the "(1)", "(2)", ... sentence markers. Raw string: the
            # pattern contains backslash escapes that are not valid in a
            # plain string literal.
            text = re.sub(r'\(\d+\)', '', text)

            for task in sample['passage']['questions']:
                question = task['question']
                for answer_info in task['answers']:
                    answer = answer_info['text']
                    if 'label' in answer_info:
                        label = answer_info['label']
                        self.labels.append(label)
                    self.raw_triplets.append([text, question, answer])

    def __len__(self):
        """Return the number of (text, question, answer) triplets."""
        return len(self.raw_triplets)

    def __getitem__(self, index):
        """Encode triplet ``index``.

        Returns the tokenizer output with ``input_ids``, ``attention_mask``
        (and ``token_type_ids`` when the tokenizer produces them) as tensors
        of length ``max_len``, plus ``targets`` when the dataset has labels.
        """
        text, question, answer = self.raw_triplets[index]

        is_augmented_sample = False
        # Augmentation overrides the target, so it is only meaningful for
        # labeled data; on unlabeled data it would merely corrupt the input.
        if (self.labels and
            'CHANGE_QUESTION_PROBA' in self.kwargs and
            'CHANGE_ANSWER_PROBA' in self.kwargs):
            if random.random() <= self.kwargs['CHANGE_QUESTION_PROBA']:
                new_question = random.choice(self.raw_triplets)[1]
                # Drawing the very same string leaves the sample unchanged,
                # so its original label must be kept.
                if new_question != question:
                    is_augmented_sample = True
                question = new_question

            if random.random() <= self.kwargs['CHANGE_ANSWER_PROBA']:
                new_answer = random.choice(self.raw_triplets)[2]
                if new_answer != answer:
                    is_augmented_sample = True
                answer = new_answer

        tokenization = self.tokenizer([text, question, answer])

        # Tokens left for the passage once question and answer (each without
        # its leading special token) are accounted for.
        remain_text_part = (
            self.max_len -
            len(tokenization['input_ids'][1][1:]) - # question
            len(tokenization['input_ids'][2][1:])   # answer
        )
        remain_text_part = min(
            remain_text_part,
            len(tokenization['input_ids'][0])
        )
        assert remain_text_part > 0

        # concatenate tokenized triplets; the last kept passage position is
        # always the separator, whether or not the passage was truncated
        tokenization['input_ids'] = (
            tokenization['input_ids'][0][:remain_text_part - 1] +
            [self.tokenizer.sep_token_id] +
            tokenization['input_ids'][1][1:] +
            tokenization['input_ids'][2][1:]
        )
        if 'token_type_ids' in tokenization:
            # Passage keeps the tokenizer's segment ids, question and answer
            # are marked as segment 1.
            tokenization['token_type_ids'] = (
                tokenization['token_type_ids'][0][:remain_text_part] +
                [1] * len(tokenization['token_type_ids'][1][1:]) +
                [1] * len(tokenization['token_type_ids'][2][1:])
            )
            assert len(tokenization['input_ids']) == len(tokenization['token_type_ids'])
        tokenization['attention_mask'] = [1] * len(tokenization['input_ids'])
        assert len(tokenization['input_ids']) == len(tokenization['attention_mask'])

        for k, v in tokenization.items():
            # Only token ids are padded with the pad token. The attention mask
            # (and segment ids) must be padded with 0, otherwise a tokenizer
            # whose pad id is not 0 would mark padding as attended positions.
            pad_value = self.tokenizer.pad_token_id if k == 'input_ids' else 0
            tokenization[k] = torch.tensor(np.pad(
                v,
                (0, self.max_len - len(v)),
                constant_values=pad_value
            ))
            assert tokenization[k].shape[0] == self.max_len

        if self.labels:
            if is_augmented_sample:
                tokenization['targets'] = torch.tensor(0)
            else:
                tokenization['targets'] = torch.tensor(self.labels[index])

        return tokenization

In [None]:
# Random question/answer swapping is a training-time augmentation, so only the
# training dataset receives the CHANGE_* probabilities.
train_qa_dataset = QADataset(
    samples=raw_train,
    model_name=experiment_params['MODEL_NAME'],
    max_len=experiment_params['MAX_LEN'],
    CHANGE_QUESTION_PROBA=experiment_params['CHANGE_QUESTION_PROBA'],
    CHANGE_ANSWER_PROBA=experiment_params['CHANGE_ANSWER_PROBA'],
)
# Validation and test data must be evaluated as-is: augmenting them would make
# the validation metrics noisy and would score test answers against randomly
# replaced questions/answers.
val_qa_dataset = QADataset(
    samples=raw_val,
    model_name=experiment_params['MODEL_NAME'],
    max_len=experiment_params['MAX_LEN'],
)
test_qa_dataset = QADataset(
    samples=raw_test,
    model_name=experiment_params['MODEL_NAME'],
    max_len=experiment_params['MAX_LEN'],
)

In [None]:
# Loaders handed to `runner.train`; only the training loader shuffles.
train_val_dataloaders = dict(
    train=DataLoader(
        train_qa_dataset,
        batch_size=experiment_params['BATCH_SIZE'],
        shuffle=True,
        num_workers=2,
        pin_memory=True,
    ),
    valid=DataLoader(
        val_qa_dataset,
        batch_size=experiment_params['BATCH_SIZE'],
        shuffle=False,
        num_workers=2,
        pin_memory=False,
    ),
)

# Loader handed to `runner.infer`; order is preserved so predictions can be
# matched back to the test answers.
test_dataloader = dict(
    infer=DataLoader(
        test_qa_dataset,
        batch_size=experiment_params['BATCH_SIZE'],
        shuffle=False,
    ),
)

### Model and training

In [None]:
class QAModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: name passed to ``AutoModel.from_pretrained``.
        num_classes: number of logits produced by the head.
        dropout_rate: dropout probability applied to the pooled encoder
            output before the linear layer. Defaults to 0.15, the value that
            was previously hard-coded.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.15):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        self.linear = nn.Linear(self.model.config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, kwargs):
        """Return logits for a batch.

        Args:
            kwargs: mapping of encoder inputs; it is unpacked as keyword
                arguments into the encoder call.
        """
        x = self.model(**kwargs).pooler_output
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [None]:
# Classifier built from the configured checkpoint and number of classes.
model = QAModel(
    experiment_params['MODEL_NAME'],
    experiment_params['NUM_CLASSES'],
)
# The optimizer is given every model parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=experiment_params['LEARNING_RATE'],
    amsgrad=True,
)
# Multiply the learning rate by 0.25 after 2 epochs without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.25,
    patience=2,
)

In [None]:
# Train only the top of the network: the last two encoder layers, the pooler
# and the classification head. The layer indices are derived from the encoder
# config instead of being hard-coded, so the selection follows the depth of
# whichever checkpoint MODEL_NAME points to.
num_encoder_layers = model.model.config.num_hidden_layers
trainable_prefixes = tuple(
    # The trailing dot keeps e.g. layer "1" from also matching "10", "11", ...
    f'model.encoder.layer.{layer_idx}.'
    for layer_idx in range(max(0, num_encoder_layers - 2), num_encoder_layers)
) + (
    'model.pooler',
    'linear',
)

for n, p in model.named_parameters():
    # str.startswith accepts a tuple and matches any of its prefixes.
    p.requires_grad = n.startswith(trainable_prefixes)


class FullUnreezeCallback(Callback):
    """Make every model parameter trainable after a number of train batches.

    Args:
        batches_to_unfreeze: number of training batches to process before
            all parameters get ``requires_grad = True``.
    """

    def __init__(self, batches_to_unfreeze):
        super().__init__(order=CallbackOrder.Metric + 1, node=CallbackNode.All)

        self.is_unfrozen = False
        self.cur_iteration = 0
        self.batches_to_unfreeze = batches_to_unfreeze

    def on_batch_end(self, runner):
        """Count train batches and unfreeze the model once the limit is hit."""
        # Batches of non-train loaders do not count.
        if not runner.is_train_loader:
            return

        self.cur_iteration += 1

        # Nothing to do before the threshold, or after the one-time unfreeze.
        if self.is_unfrozen or self.cur_iteration < self.batches_to_unfreeze:
            return

        for param in runner.model.parameters():
            param.requires_grad = True

        self.is_unfrozen = True
        print('Model was unfrozen.')

In [None]:
# Experiment tracker callback; it is later passed to `runner.train`.
neptune_logger = dl.NeptuneLogger(
    # Take the token from the environment so the secret never has to be typed
    # into (and saved with) the notebook; '-' remains the fallback placeholder.
    api_token=os.environ.get('NEPTUNE_API_TOKEN', '-'),
    project_name='gaussiandra/ai21-2-test',
    offline_mode=False,
    name='example',
    params=experiment_params,
    tags=['test'],
    upload_source_files=['*.ipynb'],
)

https://ui.neptune.ai/gaussiandra/ai21-2-test/e/AIT-3


In [None]:
# Fix the random seed and ask cuDNN for reproducible behaviour.
set_global_seed(experiment_params['RANDOM_SEED'])
# cuDNN benchmarking picks kernels by timing them, so the selection can differ
# between runs; it is switched off to stay consistent with deterministic=True.
prepare_cudnn(deterministic=True, benchmark=False)

In [None]:
# Directory passed to `runner.train` as `logdir`.
logdir = f'{global_path}logs/test02'

In [None]:
class CustomRunner(dl.Runner):
    """Runner that feeds the whole batch dict (minus targets) to the model.

    ``QADataset`` only adds ``targets`` when labels are available, so both
    methods treat that key as optional instead of raising ``KeyError`` on
    unlabeled batches.
    """

    def predict_batch(self, batch):
        """Return the model output for ``batch``, ignoring any targets."""
        x = dict(batch)
        x.pop('targets', None)

        return self.model(x)

    def _handle_batch(self, batch):
        """Run the model on ``batch`` and store inputs/outputs on the state."""
        x = dict(batch)
        y = x.pop('targets', None)

        logits = self.model(x)

        model_input = {"features": x}
        if y is not None:
            # Only labeled batches carry targets.
            model_input["targets"] = y

        self.state.input = model_input
        self.state.output = {"logits": logits}

runner = CustomRunner()

In [None]:
# Fine-tune the model; metrics, checkpoints and Neptune logging are all
# attached as callbacks.
runner.train(
    model=model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_dataloaders,
    callbacks=[
        dl.AccuracyCallback(num_classes=experiment_params['NUM_CLASSES']),
        dl.OptimizerCallback(
            # Gradient accumulation emulates EFFECTIVE_BATCH_SIZE. The value is
            # clamped to 1 so a BATCH_SIZE larger than EFFECTIVE_BATCH_SIZE
            # cannot produce 0 accumulation steps.
            accumulation_steps=max(
                1,
                experiment_params['EFFECTIVE_BATCH_SIZE'] //
                experiment_params['BATCH_SIZE']
            ),
            use_fast_zero_grad=True
        ),
        dl.F1ScoreCallback(),
        dl.CheckpointCallback(
            save_n_best=0,
            #resume=global_path+'logs/test28/checkpoints/last_full.pth'
        ),
        #FullUnreezeCallback(50),
        dl.CriterionCallback(),
        # The plateau scheduler is stepped on the 'loss' metric.
        dl.SchedulerCallback(reduced_metric='loss'),
        # NOTE(review): the patience (3) equals num_epochs, so early stopping
        # presumably never fires in this configuration — confirm intent.
        dl.EarlyStoppingCallback(3),
        neptune_logger
    ],
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    fp16=True,
    timeit=False,
)

### Prediction

In [None]:
# Predict logits for the test set from a saved checkpoint.
runner.infer(
    model=model,
    loaders=test_dataloader,
    # `InferCallback` is never imported under its bare name; it is reached
    # through the already imported `dl` namespace instead.
    callbacks=[dl.InferCallback()],
    verbose=True,
    # NOTE(review): this is a Google Drive path from a Colab session and does
    # not live under `global_path` — confirm it points at the intended checkpoint.
    resume='/content/drive/MyDrive/ML/AI 21/logs/test14/checkpoints/train.2.pth'
)
# One prediction per test triplet; compare against the dataset itself rather
# than a hard-coded count so the check survives a change of the test file.
assert len(runner.callbacks[0].predictions['logits']) == len(test_qa_dataset)

In [None]:
# NOTE(review): `submit_example` is not defined in any visible cell —
# presumably a loaded sample-submission file; confirm where it comes from.
test_logits = runner.callbacks[0].predictions['logits']

for sample in submit_example:
    for task in sample['passage']['questions']:
        for answer_info in task['answers']:
            # The answer's own `idx` selects its row of logits; the predicted
            # class is written back onto the answer entry in place.
            answer_info['label'] = int(np.argmax(test_logits[answer_info['idx']]))

In [None]:
# Serialise every sample as one JSON line and write the submission file.
json_lines = [json.dumps(s) for s in submit_example]
# The `with` block closes (and therefore flushes) the file deterministically;
# a bare open(...).write(...) leaves that to garbage collection.
with open(global_path+'data/submit.jsonl', 'w') as submit_file:
    submit_file.write('\n'.join(json_lines))