In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers import AdamW
from torch.nn import CrossEntropyLoss

In [None]:
# Checkpoint names: the T5 seq2seq model that later cells fine-tune, and a BERT
# sequence-classification checkpoint (a toxicity classifier, going by its name).
raw_model = 'cointegrated/rut5-small'
toxicity_checkpoint = 'SkolkovoInstitute/russian_toxicity_classifier'

# T5 tokenizer / model pair.
tokenizer = T5Tokenizer.from_pretrained(raw_model)
model = T5ForConditionalGeneration.from_pretrained(raw_model, output_hidden_states=False)

# BERT classifier / tokenizer pair, both loaded from the same checkpoint name.
bertmodel = BertForSequenceClassification.from_pretrained(toxicity_checkpoint)
berttokenizer = BertTokenizer.from_pretrained(toxicity_checkpoint)

In [None]:
# Scratch example: one sentence plus the two style-attribute prefixes.
text = 'тупой ты дурак'
sattr = 'toxic: '
dattr = 'civil: '

# Tokenizer options shared by both calls. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`; with `truncation=True` every sequence
# still comes out exactly `max_length` ids long.
bert_encode_kwargs = dict(max_length=400, padding='max_length', truncation=True,
                          return_attention_mask=False, return_tensors='pt')

encoded = berttokenizer.encode_plus(text, **bert_encode_kwargs)
# NOTE: from here on `sattr` holds the tokenizer output, not the prefix string.
sattr = berttokenizer.encode_plus(sattr, **bert_encode_kwargs)

In [None]:
# Echo the classifier object so the notebook prints its module structure.
bertmodel

In [None]:
# Disabled alternative: look the same ids up in the T5 encoder's embedding table.
#enc = model.encoder.embed_tokens(encoded.input_ids)
# Pass the BERT-tokenized ids through the BERT embedding layer.
bertenc = bertmodel.bert.embeddings(encoded.input_ids)

In [None]:
# Run the embeddings through the BERT encoder stack and show the shape of the
# hidden state at the first sequence position.
bertmodel.bert.encoder(bertenc).last_hidden_state[:, 0, :].shape

In [None]:
# `enc` has no active definition in the notebook (its only assignment is
# commented out), so build the input embeddings here: tokenize `text` with the
# T5 tokenizer and look the ids up in the T5 shared embedding table.
enc = model.shared(tokenizer(text, return_tensors='pt').input_ids)
# Hidden state of the T5 encoder at the first sequence position.
h0 = model.encoder(inputs_embeds=enc).last_hidden_state[:, 0, :]

In [None]:
# Extra projection applied to the pooled encoder state. Its width has to equal
# the model's hidden size (512 for the small checkpoint, 768 for base), so take
# it from the loaded config instead of hard-coding one of them.
model.dense = torch.nn.Linear(model.config.d_model, model.config.d_model)

In [None]:
# Training table; a later cell reads its `comment`, `sattribute` and `dattribute` columns.
df = pd.read_csv('../input/2500selfsupervised/comms.csv')

In [None]:
# Device used for the model and for every batch. Falls back to the CPU when
# CUDA is not available so the notebook still runs on machines without a GPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
def dae(inputs: torch.Tensor) -> torch.Tensor:
    """Return a randomly noised copy of a batch of token ids (denoising-autoencoder input).

    Two passes are applied to every row of the batch:

    1. Each id is replaced, with probability 0.15, by a random id drawn from
       [32000, 32100). Ids equal to 0, 1, 101 or 102 are never replaced.
    2. Each id that now lies in [32000, 32100] is replaced, with probability
       0.1, by a random id drawn from [1, 31999).

    The caller's tensor is left untouched: the function works on a clone, and
    the random draws are created on the same device as ``inputs``.

    NOTE(review): 0/1 look like the T5 pad/eos ids and 101/102 like BERT's
    [CLS]/[SEP]; the 32000+ range presumes the vocabulary has that many
    entries -- confirm against the tokenizer actually in use.

    Args:
        inputs: integer tensor of token ids; assumed to be (batch, seq_len).

    Returns:
        A new tensor with the same shape, dtype and device as ``inputs``.
    """
    noised = inputs.clone()

    # Pass 1: mask ordinary tokens with ids from the reserved range.
    rand = torch.rand(noised.shape, device=noised.device)
    maskable = (rand <= 0.15) & (noised != 101) & (noised != 102) & (noised != 0) & (noised != 1)
    noised = torch.where(maskable, torch.randint_like(noised, low=32000, high=32100), noised)

    # Pass 2: turn a fraction of the reserved-range ids into random ordinary tokens.
    rand = torch.rand(noised.shape, device=noised.device)
    replaceable = (rand <= 0.1) & (noised <= 32100) & (noised >= 32000)
    noised = torch.where(replaceable, torch.randint_like(noised, low=1, high=31999), noised)
    return noised

In [None]:
def waydae(comm, isxod, sattr):
    """Run the denoising-autoencoder forward pass and return the vocabulary logits.

    Uses the notebook-level ``model`` and ``dae``. The noised comment is
    encoded, the encoder state at the first position is projected by
    ``model.dense`` and added to the embeddings of ``sattr + isxod`` before
    they are fed to the decoder.

    Args:
        comm: token ids of the comment; passed to ``dae`` for noising.
        isxod: clean copy of the comment token ids.
        sattr: token ids of the source-attribute prefix.

    Returns:
        Logits from ``model.lm_head`` over the decoder output for the
        concatenated ``sattr + isxod`` sequence.
    """
    nx = dae(comm)
    xemb = model.shared(nx)
    henc = model.encoder(inputs_embeds=xemb)
    z = henc.last_hidden_state[:, 0, :]
    z = model.dense(z)
    # (batch, hidden) -> (batch, 1, hidden) so it broadcasts over the sequence
    # axis, without hard-coding the batch size or the hidden width.
    h = z.unsqueeze(1)
    emb = model.shared(torch.cat((sattr, isxod), 1))
    hdec = model.decoder(inputs_embeds=torch.add(emb, h))
    xdae = model.lm_head(hdec.last_hidden_state)
    return xdae

In [None]:
def convert_to_dataset_torch(comment: pd.Series, sattribute: pd.Series, dattribute: pd.Series, tokenizer,
                             DEVICE) -> TensorDataset:
    """Tokenize comments and their attribute prefixes into a ``TensorDataset``.

    Args:
        comment: comment texts; each is padded/truncated to 400 ids.
        sattribute: source-attribute prefixes; padded/truncated to 15 ids,
            encoded without special tokens.
        dattribute: destination-attribute prefixes; same treatment as
            ``sattribute``.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            an ``'input_ids'`` tensor (assumed to be one ``(1, max_length)``
            row per text).
        DEVICE: accepted for interface compatibility but not used; the
            tensors stay where the tokenizer created them and the training
            loop moves each batch to the device.

    Returns:
        ``TensorDataset(comment_ids, source_ids, destination_ids)`` with all
        three tensors cast to ``torch.long``.
    """
    def _encode(text, max_length, **extra):
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoded = tokenizer.encode_plus(text, max_length=max_length, padding='max_length',
                                        truncation=True, return_attention_mask=False,
                                        return_tensors='pt', **extra)
        return encoded['input_ids']

    comment_ids, source_ids, target_ids = [], [], []
    for comm, sattr, dattr in tqdm(zip(comment, sattribute, dattribute), total=len(comment)):
        comment_ids.append(_encode(comm, 400))
        source_ids.append(_encode(sattr, 15, add_special_tokens=False))
        target_ids.append(_encode(dattr, 15, add_special_tokens=False))

    # `.to()` returns a new tensor, so its result has to be kept for the cast to apply.
    stacked = [torch.cat(rows, dim=0).to(dtype=torch.long)
               for rows in (comment_ids, source_ids, target_ids)]
    return TensorDataset(*stacked)


In [None]:
# Tokenize the three training columns into one TensorDataset of (comment, source attr, destination attr) ids.
dataset = convert_to_dataset_torch(df.comment, df.sattribute, df.dattribute, tokenizer, DEVICE)

In [None]:
# Training batch size and a shuffling loader over the tokenized dataset.
bs = 4
loader = DataLoader(dataset, batch_size=bs, num_workers=0, shuffle=True)
# Built from model.parameters(), so it only covers layers attached to `model` before this cell runs.
adamw_optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)

In [None]:
def training(model, optimizer, epochs: int = 3):
    """Fine-tune ``model`` with a cycle-consistency (CC) and a denoising (DAE) loss.

    Relies on the notebook-level ``loader``, ``DEVICE`` and ``dae``.

    Per batch:
      * CC: generate ``y`` from ``dattr + comment``, then score how well the
        model reconstructs the comment from ``sattr + y``.
      * DAE: encode a noised comment, add the projected first-position encoder
        state to the embeddings of ``sattr + comment``, decode, and score the
        logits against the clean comment.

    Args:
        model: T5 model carrying an extra ``dense`` projection layer.
        optimizer: optimizer over the model's parameters.
        epochs: number of passes over ``loader`` (default 3).

    Returns:
        The same ``model`` object after training.

    NOTE(review): the function never calls ``model.train()``; if the model is
    in eval mode when passed in, dropout stays off -- confirm this is intended.
    """
    ldae = 1  # weight of the DAE loss
    lcc = 1   # weight of the CC loss
    model.to(DEVICE)
    dataloader = loader
    celoss = CrossEntropyLoss()
    for epoch in range(epochs):
        print(epoch)
        # Reset per epoch so the printed value is this epoch's mean, not a running total.
        running_loss = 0.
        for batch in tqdm(dataloader):
            # Comment, source attribute and destination attribute ids.
            comm, sattr, dattr = (t.to(DEVICE) for t in batch)
            isxod = comm.detach().clone()  # clean copy kept as the reconstruction target
            optimizer.zero_grad()

            # --- CC ---
            y = model.generate(torch.cat((dattr, isxod), 1))
            # Pass only `labels`: the model then builds the right-shifted decoder
            # inputs itself. Supplying the labels as decoder_input_ids as well would
            # let the decoder see each token it is asked to predict.
            ccloss = model(input_ids=torch.cat((sattr, y), 1), labels=isxod).loss

            # --- DAE ---
            nx = dae(comm)
            xemb = model.shared(nx)
            henc = model.encoder(inputs_embeds=xemb)
            z = model.dense(henc.last_hidden_state[:, 0, :])
            # (batch, hidden) -> (batch, 1, hidden); works for any batch size / hidden width.
            h = z.unsqueeze(1)
            emb = model.shared(torch.cat((sattr, isxod), 1))
            hdec = model.decoder(inputs_embeds=torch.add(emb, h))
            xdae = model.lm_head(hdec.last_hidden_state)
            # Keep the positions whose next token is a comment token: from the last
            # prefix position up to, but excluding, the final position.
            xdae = xdae[:, (-1+(xdae.shape[1]-isxod.shape[1])):-1, :]
            # Score every sample in the batch, not only the first one.
            daeloss = celoss(xdae.reshape(-1, xdae.shape[-1]), isxod.reshape(-1))

            loss = lcc * ccloss + ldae * daeloss
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        epoch_loss = running_loss / len(dataloader)

        print('Loss: ' + str(epoch_loss))
    return model

In [None]:
# Run fine-tuning. The call returns the model, so as the cell's last expression it is echoed as output.
training(model, adamw_optimizer)

In [None]:
# Serialize the whole model object (not just a state_dict) to a file in the working directory.
torch.save(model, 'T5selfsupervised3eps.pth')

In [None]:
import re
import string

In [None]:
# Load the evaluation file and keep only the toxic-comment column (a Series from here on).
df = pd.read_csv('../input/100comms/test.tsv', sep='\t')['toxic_comment']
# Disabled step (digits): df = df.apply(lambda x: re.sub('\w*\d\w*', ' ', x))
# Replace every ASCII punctuation character with a space.
punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
df = df.apply(lambda x: punct_pattern.sub(' ', x))
# Prepend the destination-attribute prefix to each comment.
df = 'civil: ' + df

In [None]:
def test_dataset(toxic: pd.Series, tokenizer,
                             DEVICE) -> TensorDataset:
    input_ids1 = []
    att_masks1 = []
    
    for tox in tqdm(toxic):
        encoded_dict1 = tokenizer.encode_plus(tox, max_length=512,
                                              pad_to_max_length=True,
                                              return_attention_mask=True, return_tensors='pt', truncation=True)
        input_ids1.append(encoded_dict1['input_ids'])
        att_masks1.append(encoded_dict1['attention_mask'])

    input_ids1 = torch.cat(input_ids1, dim=0)
    att_masks1 = torch.cat(att_masks1, dim=0)
  

    input_ids1.to(DEVICE, dtype=torch.long)
    att_masks1.to(DEVICE, dtype=torch.long)
    

    return TensorDataset(input_ids1, att_masks1) 

In [None]:
# Tokenize the prefixed test comments; this rebinds `dataset`, replacing the training dataset.
dataset = test_dataset(df, tokenizer, DEVICE)

In [None]:
# Inference goes one example at a time and without shuffling, so the predictions
# come out in the same order as the rows of the input.
bs = 1
test_loader = DataLoader(dataset, batch_size=bs, num_workers=0, shuffle=False)

In [None]:
# Generate one output string per test example with beam search, gradients disabled.
predictions = []
generation_kwargs = dict(
    max_length=150,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        # Each batch carries the input ids and the matching attention mask.
        toxinps, toxmask = (part.to(DEVICE) for part in batch)
        generated_ids = model.generate(input_ids=toxinps, attention_mask=toxmask,
                                       **generation_kwargs)
        # Batch size is 1, so only the first generated sequence is decoded.
        pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        predictions.append(pred)

In [None]:
# Put inputs and predictions side by side. concat aligns the two Series on their index labels,
# which presumably are the default 0..n-1 on both sides -- confirm if the input index ever changes.
df1 = pd.concat([df, pd.Series(predictions)], axis=1)

In [None]:
# Name the two columns: the model input and the generated output.
df1.columns = ['toxic', 'civil']

In [None]:
!pip install openpyxl

In [None]:
# Write the comparison table to an Excel file in the working directory (uses the openpyxl package installed above).
df1.to_excel('t5self-supervised3eps.xlsx')