In [None]:
# ! pip install -qq transformers
# ! pip install -qq wandb
import pandas as pd
import wandb
from transformers import (
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    Trainer,
    TrainingArguments,
)

In [None]:
# Load labeled.csv and keep only the rows whose `toxic` value is positive;
# the cell then displays how many such rows there are.
labeled_comments = pd.read_csv('labeled.csv')
toxic1 = labeled_comments.loc[lambda frame: frame['toxic'] > 0]
len(toxic1)

In [None]:
# https://www.kaggle.com/alexandersemiletov/starter-read-toxic-russian-comments-dataset
# Each non-empty line of dataset.txt is parsed as "<label>[,<label>...] <text>":
# the first whitespace-delimited token is a comma-separated label list and the
# rest of the line is the comment itself.
LABEL_NAMES = ("__label__NORMAL", "__label__INSULT", "__label__THREAT", "__label__OBSCENITY")

data_list = []
# Opened as UTF-8 explicitly so parsing does not depend on the platform's
# default encoding (assumes the downloaded file is UTF-8 -- TODO confirm).
with open("dataset.txt", encoding="utf-8") as file:
    for line in file:
        parts = line.split(maxsplit=1)
        if not parts:
            # Blank / whitespace-only line: there is no label token to read,
            # and indexing the empty split result would raise IndexError.
            continue
        labels = parts[0].split(",")
        # Everything after the label token is the comment; a line holding
        # only labels yields an empty string.
        text = parts[1].strip() if len(parts) > 1 else ""
        # One 0/1 flag per label, in the same order as the DataFrame columns below.
        mask = [1 if name in labels else 0 for name in LABEL_NAMES]
        data_list.append((text, *mask))

toxic2 = pd.DataFrame(data_list, columns=["text", "normal", "insult", "threat", "obscenity"])
# `toxic` counts how many of the three toxic labels a row carries.
toxic2['toxic'] = toxic2[['insult', 'threat', 'obscenity']].sum(axis=1)
print(len(toxic2))  # number of rows parsed
toxic2 = toxic2[toxic2.toxic > 0]
print(len(toxic2))  # number of rows with at least one toxic label

In [None]:
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixed so the train/validation split is identical on every run


def write_texts(texts, path):
    """Write every non-null entry of `texts` to `path` as UTF-8, each followed by a newline.

    Plain file writes are used instead of `Series.to_csv` with a newline
    separator so that the corpus contains neither a CSV header row nor CSV
    quoting (wrapping quotes / doubled quote characters) around the comments.

    Args:
        texts: pandas Series of comment strings.
        path: destination file path; an existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as out:
        for text in texts.dropna():
            out.write(f"{text}\n")


# Pool the toxic comments of both datasets into one Series of raw text.
toxic = pd.concat([toxic2['text'], toxic1['comment']])
# Hold out 10% of the comments for evaluation.
toxic_train, toxic_val = train_test_split(toxic, test_size=0.1, random_state=SPLIT_SEED)
write_texts(toxic_train, 'toxic_only_train.txt')
write_texts(toxic_val, 'toxic_only_val.txt')

In [None]:
# Settings for the fine-tuning run. The whole dict is also handed to
# wandb.init as the run config, so every value here is logged with the run.
cfg = {
    # Training / validation text files. These are the same relative names the
    # split cell writes, so the notebook no longer requires the working
    # directory to be Colab's /content.
    'text_path': 'toxic_only_train.txt',
    'text_path_dev': 'toxic_only_val.txt',
    # Output locations.
    'output_path': './models/gpt/',
    'tokenizer_output_path': './tokenizers/gpt',
    # Passed as block_size to both TextDataset instances.
    'block_size': 128,
    # Values forwarded to TrainingArguments.
    'epochs': 2,
    'batch_size': 20,
    'warmup_steps': 400,
    'save_steps': 700,
    'logging_steps': 100,
    'weight_decay': 1e-6,
    'learning_rate': 4e-5,
    'lr_scheduler_type': 'cosine_with_restarts',
    # Values forwarded to from_pretrained when loading the model / tokenizer.
    'max_length': 100,
    'model_name': "sberbank-ai/rugpt3small_based_on_gpt2",
}

In [None]:
# Trainer hyper-parameters; every tunable value is read from `cfg`.
training_args = TrainingArguments(
    output_dir=cfg['output_path'],
    num_train_epochs=cfg['epochs'],
    per_device_train_batch_size=cfg['batch_size'],
    warmup_steps=cfg['warmup_steps'],
    save_steps=cfg['save_steps'],
    logging_steps=cfg['logging_steps'],
    weight_decay=cfg['weight_decay'],
    lr_scheduler_type=cfg['lr_scheduler_type'],
    evaluation_strategy="steps",
    learning_rate=cfg['learning_rate'],
    report_to="wandb",
)

# Start the Weights & Biases run and record the full cfg dict with it.
run = wandb.init(project="detox_russe_gpt", config=cfg, entity="mikezz1")

# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; the
# checkpoint name and the max_length keyword are passed through unchanged.
model = AutoModelForCausalLM.from_pretrained(cfg['model_name'], max_length=cfg['max_length'])
tokenizer = AutoTokenizer.from_pretrained(cfg['model_name'])
# mlm=False: batches are collated without the masked-LM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Train and validation datasets are built from the two text files, both with
# the same block_size.
train_dataset = TextDataset(tokenizer=tokenizer, file_path=cfg['text_path'], block_size=cfg['block_size'])
dev_dataset = TextDataset(tokenizer=tokenizer, file_path=cfg['text_path_dev'], block_size=cfg['block_size'])

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [None]:
# Run training with the `trainer` built in the previous cell. The call is the
# cell's last expression, so the notebook displays whatever it returns.
trainer.train()

In [None]:
# Save each artifact to the directory configured for it: the tokenizer to
# cfg['tokenizer_output_path'] and the model to cfg['output_path'].
# (The two targets were previously swapped.)
tokenizer.save_pretrained(cfg['tokenizer_output_path'])
model.save_pretrained(cfg['output_path'])