In [None]:
! pip -qq install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import gc
from tqdm import tqdm
import json
from pathlib import Path
from torch.utils.data import DataLoader
from transformers import  AutoModelForQuestionAnswering, AutoTokenizer, AdamW, get_cosine_schedule_with_warmup

In [None]:
class Config():
    """Hyper-parameters shared by the tokenization, training and inference cells."""
    # Number of full passes over the training set.
    num_epochs = 4
    # Learning rate handed to the optimizer.
    learning_rate = 4e-5
    # Checkpoint used for both the tokenizer and the QA model.
    model_name = "sberbank-ai/ruRoberta-large"
    # Examples per batch for every DataLoader in the notebook.
    batch_size = 2
    # Maximum tokenized sequence length. The tokenizer and label-alignment
    # cells read Config.max_len, so it has to be defined here.
    # NOTE(review): 512 is assumed to be the checkpoint's limit -- lower it if
    # GPU memory is tight.
    max_len = 512

In [None]:
%%capture
# Download the dataset archive from Dropbox and unpack it into the working
# directory; %%capture suppresses the output of both commands.
# NOTE(review): assumes --content-disposition saves the download as
# dataset.zip -- confirm against the actual server response.
! wget https://www.dropbox.com/sh/zhez7s5erqogogo/AAAf_lhCbjkMSA0hNEpQipj6a?dl=0 --content-disposition
! unzip dataset.zip

In [None]:
def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    The file must hold an object with a 'paragraphs' list; every paragraph has
    a 'context' and a 'qas' list, and every qa has a 'question' and an
    'answers' list. One entry is produced per answer, so a context or question
    is repeated once for each of its answers.

    Returns:
        (contexts, questions, answers) -- lists of equal length; the answer
        entries are the original answer objects from the file.
    """
    with open(Path(path), 'rb') as handle:
        payload = json.load(handle)

    def _rows():
        # Walk paragraph -> question -> answer, yielding one triple per answer.
        for paragraph in payload['paragraphs']:
            text = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                for answer in qa['answers']:
                    yield text, prompt, answer

    rows = list(_rows())
    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]
    return contexts, questions, answers

# Flatten the train and dev files into parallel context / question / answer lists.
train_contexts, train_questions, train_answers = read_squad('sbersquad_train_clean_final.json')
val_contexts, val_questions, val_answers = read_squad('sbersquad_dev_clean_final.json')

In [None]:
def add_end_idx(answers, contexts):
    """Annotate every answer dict in place with an 'answer_end' character offset.

    The end offset is ``answer_start + len(text)`` (exclusive). If the gold
    text is not found at the recorded start but is found one or two characters
    earlier, both 'answer_start' and 'answer_end' are moved back by that
    amount. When no candidate position matches, the unshifted offsets are kept.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        begin = answer['answer_start']
        finish = begin + len(gold_text)

        # Default: trust the recorded start offset.
        answer['answer_end'] = finish
        if context[begin:finish] == gold_text:
            continue

        # Otherwise try the span shifted back by one, then two characters.
        for shift in (1, 2):
            if context[begin - shift:finish - shift] == gold_text:
                answer['answer_start'] = begin - shift
                answer['answer_end'] = finish - shift
                break

# Add 'answer_end' to every train/dev answer in place (may also adjust 'answer_start').
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
model_name = Config.model_name
# Tokenizer for the configured checkpoint; its maximum length comes from Config.max_len.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=Config.max_len)

# Encode (context, question) pairs with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers, max_len=None):
    """Convert character-level answer spans into token-level labels, in place.

    Adds 'start_positions' and 'end_positions' lists to ``encodings``. The end
    label is the token containing the LAST character of the answer (inclusive),
    which is what the post-processing step expects when it adds 1 to the
    predicted end index before slicing.

    Args:
        encodings: tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(mapping)``.
        answers: list of dicts with 'answer_start' and an exclusive
            'answer_end' character offset.
        max_len: maximum sequence length; ``max_len - 1`` is used as the label
            when a character cannot be mapped to a token (e.g. the answer was
            truncated away). Defaults to ``Config.max_len``.

    Prints the number of answers whose end could not be mapped to a token.
    """
    if max_len is None:
        max_len = Config.max_len
    fallback = max_len - 1

    start_positions = []
    end_positions = []
    count = 0

    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        # 'answer_end' is exclusive, so the last answer character sits one
        # position earlier. Looking up 'answer_end' itself would hit the token
        # AFTER the answer, or whitespace (None). Clamp to the start so an
        # empty answer never yields an index before its own start.
        last_char = max(answer['answer_end'] - 1, start_char)

        start_token = encodings.char_to_token(i, start_char)
        end_token = encodings.char_to_token(i, last_char)

        if start_token is None:
            start_token = fallback
        if end_token is None:
            count += 1
            end_token = fallback

        start_positions.append(start_token)
        end_positions.append(end_token)

    print(count)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


# Attach token-level start/end labels to both encodings; each call prints its `count`.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenizer encodings as per-example tensor dicts."""

    def __init__(self, encodings):
        # Keep a reference only; tensors are built lazily in __getitem__.
        self.encodings = encodings

    def __len__(self):
        # One example per row of input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Pick row `idx` from every field and convert it to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap the train and dev encodings as datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
def loss_fn(preds, labels):
    """Average of the start-position and end-position cross-entropy losses.

    Args:
        preds: pair ``(start_logits, end_logits)``, each of shape
            (batch, seq_len). A tensor stacked along dim 0 works too.
        labels: pair ``(start_labels, end_labels)`` of integer token indices,
            each of shape (batch,).

    Labels equal to -1, or lying outside ``[0, seq_len)``, are ignored instead
    of raising. Such labels occur when an answer was truncated away and its
    label points past the padded sequence length. If every label in a batch is
    ignored the result is NaN.

    Returns:
        Scalar tensor: ``(start_loss + end_loss) / 2``.
    """
    start_logits, end_logits = preds
    start_labels, end_labels = labels

    def _span_loss(logits, targets):
        # Map every unscorable target onto the ignore index.
        seq_len = logits.size(-1)
        unscorable = (targets < 0) | (targets >= seq_len)
        targets = targets.masked_fill(unscorable, -1)
        return nn.functional.cross_entropy(logits, targets, ignore_index=-1)

    start_loss = _span_loss(start_logits, start_labels)
    end_loss = _span_loss(end_logits, end_labels)
    return (start_loss + end_loss) / 2

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model.to(device)

# Shuffle the training examples every epoch: they are stored grouped by
# paragraph, so unshuffled batches would contain highly correlated samples.
train_loader = DataLoader(train_dataset, batch_size=Config.batch_size, shuffle=True)
# Order does not affect the averaged validation loss; keep it deterministic.
val_loader = DataLoader(val_dataset, batch_size=Config.batch_size, shuffle=False)

In [None]:
# Release PyTorch's cached CUDA memory and force a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

In [None]:
def validate(valid_loader, model):
    """Compute and print the mean ``loss_fn`` value over ``valid_loader``.

    The model is put in eval mode and run without gradient tracking, then
    returned to whatever mode it was in before the call. This keeps a
    following training epoch from running with dropout disabled.

    Args:
        valid_loader: iterable of batches with 'input_ids', 'attention_mask',
            'start_positions' and 'end_positions' tensors.
        model: QA model whose output exposes 'start_logits' and 'end_logits'.

    Returns:
        Mean per-batch loss as a float (NaN when the loader is empty).
    """
    was_training = model.training
    model.eval()
    torch.cuda.empty_cache()
    gc.collect()

    loss_hist = 0.0
    try:
        # No backward pass happens here, so skip building the autograd graph.
        with torch.no_grad():
            for batch in tqdm(valid_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

                out = model(input_ids, attention_mask=attention_mask)
                start, end = out['start_logits'], out['end_logits']
                loss = loss_fn((start, end), (start_positions, end_positions))
                loss_hist += loss.item()
    finally:
        # Restore the caller's train/eval mode even if a batch fails.
        model.train(was_training)

    num_batches = len(valid_loader)
    mean_loss = loss_hist / num_batches if num_batches else float('nan')
    print('...................')
    print(f'Validation loss: {mean_loss}')
    return mean_loss


def train_epoch(train_loader, optim, model, scheduler):
    """Run one optimisation pass over ``train_loader`` and print the mean loss.

    Reads the notebook-level ``epoch`` loop variable and ``Config.num_epochs``
    for the progress banner, and ``device`` for tensor placement.

    Args:
        train_loader: iterable of batches with 'input_ids', 'attention_mask',
            'start_positions' and 'end_positions' tensors.
        optim: optimizer stepped once per batch.
        model: QA model that returns the loss as its first output when the
            label positions are supplied.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Mean per-batch training loss as a float.
    """
    print("Epoch %s of %s" %(epoch + 1, Config.num_epochs))
    # validate() switches the model to eval mode; make sure dropout is active
    # again for every training epoch, not only the first one.
    model.train()

    epoch_loss = 0.0
    for i, batch in enumerate(tqdm(train_loader)):
        gc.collect()
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        scheduler.step()

        # Keep a plain float so the running total holds no tensor or graph.
        step_loss = loss.item()
        epoch_loss += step_loss

        if i % 300 == 0:
            print(f'Step: {i}, loss: {step_loss}')

    mean_loss = epoch_loss / max(len(train_loader), 1)
    print('.......................................')
    print(f"Loss: {mean_loss}")
    return mean_loss

In [None]:
optim = AdamW(model.parameters(), lr=Config.learning_rate)

# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
max_train_steps = len(train_loader)*Config.num_epochs
scheduler = get_cosine_schedule_with_warmup(
            optim,
            num_warmup_steps=max_train_steps // 50,  # warm up over the first 2% of steps
            num_training_steps=max_train_steps
            )

model.train()

# Config defines `num_epochs`; `num_epoch` does not exist and raises AttributeError.
for epoch in range(Config.num_epochs):
    print(f'====== EPOCH #{epoch} ======')
    train_epoch(train_loader, optim, model, scheduler)
    # validate() takes (loader, model); passing the optimizer raises TypeError.
    validate(val_loader, model)

In [None]:
# Load the test set and run it through the same steps as the train/dev data:
# flatten, add 'answer_end', tokenize, wrap in a dataset.
test_contexts, test_questions, test_answers = read_squad('sbersquad_test_data_to_solve.json')
add_end_idx(test_answers, test_contexts)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)
test_dataset = SquadDataset(test_encodings)
# shuffle=False keeps batches in the same order as the encodings.
test_loader = DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False)

In [None]:
# get answers on test set
model.eval()
torch.cuda.empty_cache()
gc.collect()
start_ids = []
end_ids = []
# Inference only: without no_grad every forward pass would keep activations
# for a backward pass that never happens.
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Without label positions the first two outputs are the start and end
        # logits; softmax over the sequence dimension gives per-token probabilities.
        start_ids.append(torch.softmax(outputs[0], dim=1).cpu().numpy())
        end_ids.append(torch.softmax(outputs[1], dim=1).cpu().numpy())
    

In [None]:
# post-process answers
# Stitch the per-batch probability arrays back into one row per test example.
start = np.concatenate(start_ids, axis=0)
end = np.concatenate(end_ids, axis=0)
print(start.shape)

# Most probable start token, and one past the most probable end token so the
# pair can be used directly as a Python slice.
answer_starts = start.argmax(axis=1)
answer_ends = end.argmax(axis=1) + 1

# Decode the predicted token span of every example back into text.
answers = [
    tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(token_ids[first:stop])
    )
    for token_ids, first, stop in zip(test_encodings['input_ids'], answer_starts, answer_ends)
]

In [None]:
# create submission
def process_json(df, answers, ansers_start):
    """Write predictions back into the test-set frame (in place) and return it.

    Walks the 'paragraphs' column in row order, then each paragraph's 'qas'
    and each qa's 'answers', consuming one prediction per answer slot.

    Args:
        df: DataFrame whose 'paragraphs' column holds dicts with a 'qas' list;
            every qa holds an 'answers' list of dicts.
        answers: predicted answer strings, one per answer slot, in traversal
            order.
        ansers_start: predicted start indices aligned with ``answers``. The
            misspelled name is kept so existing callers keep working.

    Returns:
        The same ``df`` object, with every answer's 'answer_start' and 'text'
        overwritten.
    """
    k = 0
    # Iterate the column values directly so the walk does not depend on the
    # frame having a default 0..n-1 index.
    for paragraph in df['paragraphs']:
        for qa in paragraph['qas']:
            for answer in qa['answers']:
                # Use the argument, not a notebook global; int() turns numpy
                # integers into JSON-serialisable Python ints.
                answer['answer_start'] = int(ansers_start[k])
                answer['text'] = answers[k]
                k += 1

    return df


# Re-read the raw test file as a frame and fill in the predictions.
test_df = pd.read_json("sbersquad_test_data_to_solve.json")
new_df = process_json(test_df, answers, answer_starts)

# Rebuild the top-level JSON object from the frame's two columns.
sub = {
    'title': list(new_df['title'].values),
    'paragraphs': list(new_df['paragraphs'].values),
}
with open('submission.json', 'w') as f:
    json.dump(sub, f)

In [None]:
# Package the submission file into submission.zip.
! zip submission.zip submission.json