In [None]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import os
import random
from tqdm import tqdm
from torch.utils.data import TensorDataset
import re
import string
from transformers import AdamW

In [None]:
def seed_everything(seed=1234):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Value handed to ``random.seed``, ``np.random.seed``,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed``; its string
            form is also stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    # Request deterministic cuDNN kernels.
    torch.backends.cudnn.deterministic = True

In [None]:
# Seed the RNGs (with the function's default seed) before the model is loaded
# and before any shuffling DataLoader is created.
seed_everything()

In [None]:
# Checkpoint name shared by the model load and the tokenizer load below.
raw_model = 'sberbank-ai/ruT5-base'
model = T5ForConditionalGeneration.from_pretrained(raw_model, output_hidden_states=False)
tokenizer = T5Tokenizer.from_pretrained(raw_model)

In [None]:
# Load the Pikabu/2ch corpus and keep only the comment text and its toxicity label.
df = pd.read_csv('../input/toxic-russian-comments-from-pikabu-and-2ch/russian_comments_from_2ch_pikabu.csv')
df = df[['comment', 'toxic']]

# Raw string: '\w' and '\d' are invalid escapes in an ordinary string literal and
# trigger a warning on recent Python versions. Patterns are compiled once up front.
digit_word_pattern = re.compile(r'\w*\d\w*')
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

# Replace every token that contains a digit, then every punctuation character, with a space.
df['comment'] = df['comment'].apply(lambda x: digit_word_pattern.sub(' ', x))
df['comment'] = df['comment'].apply(lambda x: punctuation_pattern.sub(' ', x))
# Task prefix for the classification stage.
df['comment'] = 'classify: ' + df['comment']
# Turn the 1/0 label into the word the decoder is trained to generate.
df['toxic'] = df['toxic'].replace({1: 'toxic', 0: 'civil'})

In [None]:
# Use the first CUDA GPU when one is present; otherwise fall back to the CPU so
# the notebook does not crash on the first .to(DEVICE) call.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
def convert_to_dataset_torch(comment: pd.Series, toxic: pd.Series, tokenizer,
                             DEVICE, max_length: int = 512) -> TensorDataset:
    """Tokenize (comment, label) text pairs into a fixed-length ``TensorDataset``.

    Args:
        comment: Source texts (encoder side).
        toxic: Target texts (decoder side), paired positionally with ``comment``.
        tokenizer: Object whose ``encode_plus`` returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors when called with
            ``return_tensors='pt'``.
        DEVICE: Unused; kept so existing call sites keep working. The returned
            tensors are not moved to any device here.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset(source_ids, target_ids, source_mask, target_mask)``
        with every tensor cast to ``torch.long``.
    """
    def encode(text):
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        return tokenizer.encode_plus(text, max_length=max_length,
                                     padding='max_length', truncation=True,
                                     return_attention_mask=True, return_tensors='pt')

    source_ids, target_ids = [], []
    source_masks, target_masks = [], []

    # total= lets the progress bar show a percentage; zip() alone has no length.
    for comm, tox in tqdm(zip(comment, toxic), total=len(comment)):
        encoded_source = encode(comm)
        source_ids.append(encoded_source['input_ids'])
        source_masks.append(encoded_source['attention_mask'])

        encoded_target = encode(tox)
        target_ids.append(encoded_target['input_ids'])
        target_masks.append(encoded_target['attention_mask'])

    # Tensor.to() returns a new tensor, so the cast has to be kept rather than discarded.
    stacked = [torch.cat(chunks, dim=0).to(dtype=torch.long)
               for chunks in (source_ids, target_ids, source_masks, target_masks)]

    return TensorDataset(*stacked)

In [None]:
# Tokenize the prefixed comments and their 'toxic'/'civil' label words for the
# classification stage.
dataset = convert_to_dataset_torch(df.comment, df.toxic, tokenizer, DEVICE)

In [None]:
# Mini-batch size for the classification stage.
bs = 4
loader = DataLoader(dataset, batch_size=bs, num_workers=0, shuffle=True)
# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# weight_decay=0.0 keeps the old class's default (torch's own default is 0.01).
adamw_optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)

In [None]:
def training(model, optimizer, dataloader=None, epochs=1, device=None):
    """Fine-tune ``model`` on (source, target) token batches and return it.

    Args:
        model: Seq2seq model called as ``model(input_ids=..., attention_mask=...,
            decoder_attention_mask=..., labels=...)`` whose output exposes ``.loss``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloader: Iterable of ``(source_ids, target_ids, source_mask, target_mask)``
            batches. Defaults to the notebook-level ``loader``.
        epochs: Number of passes over ``dataloader``.
        device: Device to train on. Defaults to the notebook-level ``DEVICE``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    if dataloader is None:
        dataloader = loader
    if device is None:
        device = DEVICE

    model.to(device)
    model.train()
    for epoch in range(epochs):
        print(epoch)
        # Reset per epoch so the printed value is this epoch's mean, not a running total.
        running_loss = 0.
        for batch in tqdm(dataloader):
            toxinps, civinps, toxmask, civmask = (t.to(device) for t in batch)
            # Padded target positions must not be scored: -100 is the label value
            # the loss ignores, and the mask is 0 exactly where the target is padding.
            labels = civinps.masked_fill(civmask == 0, -100)
            loss = model(input_ids=toxinps, attention_mask=toxmask,
                         decoder_attention_mask=civmask, labels=labels).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
        epoch_loss = running_loss / max(len(dataloader), 1)

        print('Loss: ' + str(epoch_loss))
    return model

In [None]:
# Stage 1: fine-tune on the classification data. training() returns the model,
# which is this cell's last expression.
training(model, adamw_optimizer)

In [None]:
# Sanity-check the classification stage on a single sentence.
model.eval()
text = 'classify: ты самый красивый'
# BatchEncoding.to() moves every tensor it holds. Assigning comm.input_ids = ...
# would only set an attribute and leave comm['input_ids'] on the CPU.
comm = tokenizer.encode_plus(text, return_attention_mask=True, return_tensors='pt').to(DEVICE)

generated_ids = model.generate(
    input_ids=comm.input_ids,
    attention_mask=comm.attention_mask,
    max_length=150,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)
tokenizer.decode(generated_ids[0])

In [None]:
# Load the parallel corpus: keep the toxic sentence and its first neutral rewrite,
# drop rows missing either one, and rename the columns to 'toxic' / 'civil'.
detox_columns = {'toxic_comment': 'toxic', 'neutral_comment1': 'civil'}
df = (
    pd.read_csv('../input/labeled-toxic-comments/train.tsv', sep='\t')
    .loc[:, list(detox_columns)]
    .dropna()
    .rename(columns=detox_columns)
)

# Raw string: '\w' and '\d' are invalid escapes in an ordinary string literal and
# trigger a warning on recent Python versions. Patterns are compiled once up front.
digit_word_pattern = re.compile(r'\w*\d\w*')
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

# Same cleaning for both sides: digit-bearing tokens first, then punctuation,
# each replaced with a space.
for column in ('toxic', 'civil'):
    df[column] = df[column].apply(
        lambda x: punctuation_pattern.sub(' ', digit_word_pattern.sub(' ', x))
    )
# Task prefix for the detoxification stage.
df['toxic'] = 'detox: ' + df['toxic']

In [None]:
def to_torch_dataset(toxic: pd.Series, civil: pd.Series, tokenizer,
                     DEVICE, max_length: int = 512) -> TensorDataset:
    """Tokenize (toxic, civil) sentence pairs into a fixed-length ``TensorDataset``.

    Args:
        toxic: Source sentences (encoder side).
        civil: Target sentences (decoder side), paired positionally with ``toxic``.
        tokenizer: Object whose ``encode_plus`` returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors when called with
            ``return_tensors='pt'``.
        DEVICE: Unused; kept so existing call sites keep working. The returned
            tensors are not moved to any device here.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset(toxic_ids, civil_ids, toxic_mask, civil_mask)`` with
        every tensor cast to ``torch.long``.
    """
    def encode(text):
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        return tokenizer.encode_plus(text, max_length=max_length,
                                     padding='max_length', truncation=True,
                                     return_attention_mask=True, return_tensors='pt')

    toxic_ids, civil_ids = [], []
    toxic_masks, civil_masks = [], []

    # total= lets the progress bar show a percentage; zip() alone has no length.
    for tox, civ in tqdm(zip(toxic, civil), total=len(toxic)):
        encoded_toxic = encode(tox)
        toxic_ids.append(encoded_toxic['input_ids'])
        toxic_masks.append(encoded_toxic['attention_mask'])

        encoded_civil = encode(civ)
        civil_ids.append(encoded_civil['input_ids'])
        civil_masks.append(encoded_civil['attention_mask'])

    # Tensor.to() returns a new tensor, so the cast has to be kept rather than discarded.
    stacked = [torch.cat(chunks, dim=0).to(dtype=torch.long)
               for chunks in (toxic_ids, civil_ids, toxic_masks, civil_masks)]

    return TensorDataset(*stacked)

In [None]:
# Tokenize the toxic/civil sentence pairs for the detoxification stage
# (rebinds the notebook-level `dataset`).
dataset = to_torch_dataset(df.toxic, df.civil, tokenizer, DEVICE)

In [None]:
# Mini-batch size for the detoxification stage.
bs = 2
loader = DataLoader(dataset, batch_size=bs, num_workers=0, shuffle=True)
# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# weight_decay=0.0 keeps the old class's default (torch's own default is 0.01).
adamw_optim = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)

In [None]:
def training_detox(model, optimizer, dataloader=None, epochs=3, device=None):
    """Fine-tune ``model`` on (toxic, civil) token batches and return it.

    Args:
        model: Seq2seq model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        dataloader: Iterable of ``(toxic_ids, civil_ids, toxic_mask, civil_mask)``
            batches. Defaults to the notebook-level ``loader``.
        epochs: Number of passes over ``dataloader``.
        device: Device to train on. Defaults to the notebook-level ``DEVICE``.

    Returns:
        The same ``model`` object, left in training mode.
    """
    if dataloader is None:
        dataloader = loader
    if device is None:
        device = DEVICE

    model.to(device)
    # An earlier cell calls model.eval(); switch back explicitly so training does
    # not silently run in evaluation mode.
    model.train()

    for epoch in range(epochs):
        print(epoch)
        # Reset per epoch so the printed value is this epoch's mean, not a running total.
        running_loss = 0.
        for batch in tqdm(dataloader):
            toxinps, civinps, toxmask, civmask = (t.to(device) for t in batch)
            # Pass the full target as labels and let the model derive the decoder
            # inputs. Slicing y[:, :-1] / y[:, 1:] by hand never trains the first
            # target token from the start token that generate() begins with.
            # Padded positions get -100, the label value the loss ignores.
            lm_labels = civinps.masked_fill(civmask == 0, -100)
            loss = model(input_ids=toxinps, attention_mask=toxmask, labels=lm_labels).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
        epoch_loss = running_loss / max(len(dataloader), 1)

        print('Loss: ' + str(epoch_loss))
    return model

In [None]:
# Stage 2: continue fine-tuning the same model on the detoxification pairs.
# training_detox() returns the model, which is this cell's last expression.
training_detox(model, adamw_optim)

In [None]:
# Sanity-check the detoxification stage on a single sentence.
model.eval()
text = 'detox: блять иди ты нахуй, я твою мать ебал'
# BatchEncoding.to() moves every tensor it holds. Assigning comm.input_ids = ...
# would only set an attribute and leave comm['input_ids'] on the CPU.
comm = tokenizer.encode_plus(text, return_attention_mask=True, return_tensors='pt').to(DEVICE)

generated_ids = model.generate(
    input_ids=comm.input_ids,
    attention_mask=comm.attention_mask,
    max_length=150,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)
tokenizer.decode(generated_ids[0])

In [None]:
# Read the test split and keep only the toxic sentences (a Series, not a frame).
df = pd.read_csv('../input/labeled-toxic-comments/test.tsv', sep='\t')['toxic_comment']
# Digit stripping is disabled for this split:
# df = df.apply(lambda x: re.sub('\w*\d\w*', ' ', x))  # digits
punctuation_class = '[%s]' % re.escape(string.punctuation)
# Replace each punctuation character with a space.
df = df.map(lambda sentence: re.sub(punctuation_class, ' ', sentence))
# Task prefix for the detoxification stage.
df = 'detox: ' + df

In [None]:
def test_dataset(toxic: pd.Series, tokenizer,
                             DEVICE) -> TensorDataset:
    input_ids1 = []
    att_masks1 = []
    
    for tox in tqdm(toxic):
        encoded_dict1 = tokenizer.encode_plus(tox, max_length=512,
                                              pad_to_max_length=True,
                                              return_attention_mask=True, return_tensors='pt', truncation=True)
        input_ids1.append(encoded_dict1['input_ids'])
        att_masks1.append(encoded_dict1['attention_mask'])

    input_ids1 = torch.cat(input_ids1, dim=0)
    att_masks1 = torch.cat(att_masks1, dim=0)
  

    input_ids1.to(DEVICE, dtype=torch.long)
    att_masks1.to(DEVICE, dtype=torch.long)
    

    return TensorDataset(input_ids1, att_masks1) 

In [None]:
# Tokenize the prefixed test sentences (rebinds the notebook-level `dataset`).
dataset = test_dataset(df, tokenizer, DEVICE)

In [None]:
# One sentence per batch for generation.
bs = 1
# shuffle=False keeps batches in dataset order.
test_loader = DataLoader(dataset, batch_size=bs, num_workers=0, shuffle=False)

In [None]:
# Generate a rewrite for every test sentence, in loader order.
predictions = []
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        toxinps, toxmask = (t.to(DEVICE) for t in batch)

        generated_ids = model.generate(
            input_ids=toxinps,
            attention_mask=toxmask,
            max_length=150,
            num_beams=2,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
        )
        # Decode every sequence in the batch, not only row 0, so no prediction is
        # dropped if the loader's batch size is ever raised above 1.
        predictions.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

In [None]:
# Display the generated texts as a Series (the cell's last expression).
pd.Series(predictions)

In [None]:
# Put the input sentences and the generated texts side by side as two columns.
# NOTE(review): concat pairs rows by index label, so this relies on `df` having
# the same default 0..n-1 index as the new Series — confirm if `df` is ever filtered.
df1 = pd.concat([df, pd.Series(predictions)], axis=1)

In [None]:
# Name the two columns: first the model input, second the generated output.
df1.columns = ['toxic', 'civil']

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install openpyxl

In [None]:
# Write the input/output table to an .xlsx file in the working directory.
df1.to_excel('t5supervisedwithpretrain.xlsx')

In [None]:
# Serialize the whole model object (not only its state_dict) to disk.
# NOTE(review): a file saved this way is a pickle of the object, so loading it
# presumably needs the same model class importable — consider
# model.save_pretrained(...) after confirming what downstream loaders expect.
torch.save(model, 'modeldetox.pth')