In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
import random
from tqdm import tqdm
from torch.utils.data import TensorDataset
import re
import string
from transformers import AdamW

In [None]:
def seed_everything(seed = 1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), and configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any child interpreters; the running interpreter's hash
    # seed was already fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Cover all visible GPUs, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick different kernels between runs; turn it off.
    torch.backends.cudnn.benchmark = False

In [None]:
# Fix all RNG seeds (default seed) before the model and data are loaded below.
seed_everything()

In [None]:
# Checkpoint to fine-tune. Alternative tried: 'cointegrated/rut5-base-paraphraser'
raw_model = 'sberbank-ai/ruT5-base'

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(raw_model)
model = T5ForConditionalGeneration.from_pretrained(
    raw_model,
    output_hidden_states=False,
)

In [None]:
# Prefer the first GPU; fall back to CPU so the notebook still runs without CUDA.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
def convert_to_dataset_torch(masked: pd.Series, source: pd.Series, tokenizer,
                             DEVICE, max_length: int = 512) -> TensorDataset:
    """Tokenize paired texts into a fixed-length ``TensorDataset``.

    Args:
        masked: Texts used as model input, one string per example.
        source: Target texts, aligned element-wise with ``masked``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``(1, max_length)``.
        DEVICE: Unused; kept so existing callers keep working. This function
            does not move any tensor to a device.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset(masked_ids, source_ids, masked_attention,
        source_attention)`` with every tensor cast to ``torch.long``.
    """
    def _encode(text):
        # Pad and truncate to one fixed length so rows can be concatenated.
        return tokenizer.encode_plus(text, max_length=max_length,
                                     padding='max_length', truncation=True,
                                     return_attention_mask=True,
                                     return_tensors='pt')

    def _stack(chunks):
        # `.to()` returns a new tensor, so the cast result must be kept.
        return torch.cat(chunks, dim=0).to(dtype=torch.long)

    input_ids1, input_ids2 = [], []
    att_masks1, att_masks2 = [], []

    # zip() has no length; give tqdm a total when the inputs are sized.
    total = None
    if hasattr(masked, '__len__') and hasattr(source, '__len__'):
        total = min(len(masked), len(source))

    for mask, sour in tqdm(zip(masked, source), total=total):
        encoded_mask = _encode(mask)
        input_ids1.append(encoded_mask['input_ids'])
        att_masks1.append(encoded_mask['attention_mask'])

        encoded_source = _encode(sour)
        input_ids2.append(encoded_source['input_ids'])
        att_masks2.append(encoded_source['attention_mask'])

    return TensorDataset(_stack(input_ids1), _stack(input_ids2),
                         _stack(att_masks1), _stack(att_masks2))

In [None]:
# Pretraining corpus; its `comm1` / `comm2` columns are tokenized into the dataset below.
df = pd.read_csv('../input/pretrain/pretrainnew.csv')

In [None]:
# `comm1` becomes the model input, `comm2` the target sequence.
dataset = convert_to_dataset_torch(
    masked=df['comm1'],
    source=df['comm2'],
    tokenizer=tokenizer,
    DEVICE=DEVICE,
)

In [None]:
# Mini-batch size used by the training loader.
bs = 4

# Reshuffle the examples on every pass; load batches in the main process.
loader = DataLoader(
    dataset=dataset,
    batch_size=bs,
    shuffle=True,
    num_workers=0,
)

adamw_optimizer = AdamW(
    model.parameters(),
    lr=3e-5,
    eps=1e-8,
)

In [None]:
def training(model, optimizer, dataloader=None, epochs=1, device=None,
             pad_token_id=None):
    """Fine-tune ``model`` on (input, target) token batches.

    Each batch is unpacked as ``(input_ids, target_ids, input_attention_mask,
    target_attention_mask)``; the target attention mask is not used.

    Args:
        model: Seq2seq model whose forward accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``.loss`` attribute.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloader: Iterable of batches; defaults to the notebook-level ``loader``.
        epochs: Number of passes over ``dataloader``.
        device: Target device; defaults to the notebook-level ``DEVICE``.
        pad_token_id: Id of the padding token to exclude from the loss;
            defaults to ``tokenizer.pad_token_id``.

    Returns:
        The same ``model`` instance, after training.
    """
    # Fall back to notebook globals so `training(model, optimizer)` still works.
    if dataloader is None:
        dataloader = loader
    if device is None:
        device = DEVICE
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    model.to(device)
    # `from_pretrained` hands back the model in eval mode; re-enable dropout.
    model.train()

    for epoch in range(epochs):
        print(epoch)
        # Reset per epoch so the reported mean covers this epoch only.
        running_loss = 0.
        progress = tqdm(dataloader)
        for batch in progress:
            maskinps, sourceinps, maskmask, _ = batch
            maskinps = maskinps.to(device)
            maskmask = maskmask.to(device)
            sourceinps = sourceinps.to(device)

            # Use the full target as labels: the model shifts them right
            # itself, so dropping the first token would lose a real target.
            lm_labels = sourceinps.clone()
            # -100 is the ignore index of the loss: skip padding positions.
            lm_labels[lm_labels == pad_token_id] = -100

            optimizer.zero_grad()
            loss = model(input_ids=maskinps, attention_mask=maskmask,
                         labels=lm_labels).loss
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            # Show the running batch loss on the bar instead of one line per step.
            progress.set_postfix(loss=batch_loss)

        # Guard against an empty loader.
        epoch_loss = running_loss / max(len(dataloader), 1)
        print('Loss: ' + str(epoch_loss))
    return model

In [None]:
# Bind the return value so the cell does not echo the full model repr as output.
model = training(model, adamw_optimizer)

In [None]:
# Serializes the whole model object (not only its state_dict) into the working directory.
# NOTE(review): loading this file back presumably needs a compatible `transformers`
# install at unpickle time; consider `model.save_pretrained(...)` if consumers allow — confirm.
torch.save(model, 'pretrainedT5.pth')