In [None]:
# ! pip install -qq transformers
# ! pip install -qq wandb
import pandas as pd
from transformers import (
                          AutoModelWithLMHead,
                          Trainer,
                          AutoTokenizer,
                          TextDataset,
                          DataCollatorForLanguageModeling,
                          TrainingArguments,)
import wandb

In [None]:
# Read the tab-separated train / dev splits. Missing cells are replaced by
# empty strings so that prepare_df can test references with len().
df = pd.read_csv('train.tsv', delimiter='\t').fillna(value='')
val_df = pd.read_csv('dev.tsv', delimiter='\t').fillna(value='')

In [None]:
def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand each source row into (toxic, neutral) pairs and build the model input string.

    For every row, the columns 'neutral_comment1', 'neutral_comment2' and
    'neutral_comment3' are read in that order. Each non-empty reference yields
    one output row paired with the row's 'toxic_comment'; the first empty
    reference stops the scan for that row, so later references are ignored.

    Returns a new dataframe with columns 'toxic_comment', 'neutral_comment'
    and 'input', where 'input' wraps the neutral text in <s>...</s> and the
    toxic text in <p>...</p>, joined by the '>>>>' marker.
    """
    reference_columns = ['neutral_comment1', 'neutral_comment2', 'neutral_comment3']

    pairs = []
    for record in df.to_dict(orient='records'):
        # Fetch all three references up front so a missing column is reported
        # even when an earlier reference is empty.
        candidates = [record[column] for column in reference_columns]
        for candidate in candidates:
            if len(candidate) == 0:
                break
            pairs.append((record['toxic_comment'], candidate))

    toxic_side = [toxic for toxic, _ in pairs]
    neutral_side = [neutral for _, neutral in pairs]
    paired = pd.DataFrame({'toxic_comment': toxic_side, 'neutral_comment': neutral_side})

    neutral_part = '<s>' + paired['neutral_comment'] + '</s>>>>><p>'
    toxic_part = paired['toxic_comment'] + '</p>'
    paired['input'] = neutral_part + toxic_part
    return paired

SHUFFLE_SEED = 42  # fixed so the shuffled training order is the same on every run


def write_examples(examples: pd.Series, path: str) -> None:
    """
    Write one example per line to a plain UTF-8 text file.

    The file is written directly rather than through Series.to_csv so that
    it contains only the examples: no 'input' header line and no CSV quoting
    (to_csv would wrap and double any example containing a double quote).

    Args:
        examples: the strings to write, in order.
        path: destination file; overwritten if it already exists.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(f'{example}\n' for example in examples)


# Pair up toxic/neutral texts; only the training split is shuffled.
df = prepare_df(df).sample(frac=1, random_state=SHUFFLE_SEED)
val_df = prepare_df(val_df)

write_examples(df['input'], 'combined.txt')
write_examples(val_df['input'], 'combined_dev.txt')

In [None]:
# Central run configuration: file locations plus training hyper-parameters.
# The same dict is passed to wandb.init below, so the logged run carries the
# settings it was launched with.
cfg = dict(
    # Plain-text corpora for training and evaluation.
    # NOTE(review): these differ from the 'combined*.txt' files written by the
    # data-preparation cell — confirm which files are intended.
    text_path='/content/toxic_only_train.txt',
    text_path_dev='/content/toxic_only_val.txt',
    # Output locations for the trainer / saved artefacts.
    output_dir='/models/gpt/',
    tokenizer_output_path='/tokenizers/gpt',
    # Dataset block size, schedule and logging cadence.
    block_size=128,
    epochs=6,
    batch_size=10,
    warmup_steps=600,
    save_steps=600,
    logging_steps=100,
    max_length=100,
    # Starting model and tokenizer (alternative: '/content/output/checkpoint-1800/').
    # NOTE(review): tokenizer_path is relative while tokenizer_output_path is
    # absolute — confirm they are meant to point at the same directory.
    model_path="/models/gpt/",
    tokenizer_path='tokenizers/gpt/',
    # Optimiser settings.
    weight_decay=1e-6,
    learning_rate=3e-5,
    lr_scheduler_type='cosine_with_restarts',
)

run = wandb.init(entity="username", project="toxify", config=cfg)

In [None]:
# Every tunable value below is taken from cfg so the wandb-logged config
# matches what the Trainer actually uses.
training_args = TrainingArguments(
    # Where the trainer writes, and where it reports metrics.
    output_dir=cfg['output_dir'],
    report_to="wandb",
    # Cadence of checkpointing, logging and evaluation.
    save_steps=cfg['save_steps'],
    logging_steps=cfg['logging_steps'],
    evaluation_strategy="steps",
    # Optimisation schedule.
    num_train_epochs=cfg['epochs'],
    per_device_train_batch_size=cfg['batch_size'],
    learning_rate=cfg['learning_rate'],
    lr_scheduler_type=cfg['lr_scheduler_type'],
    warmup_steps=cfg['warmup_steps'],
    weight_decay=cfg['weight_decay'],
)

# Load the starting model and its tokenizer from the configured locations.
model = AutoModelWithLMHead.from_pretrained(
    cfg['model_path'],
    max_length=cfg['max_length'],
)
tokenizer = AutoTokenizer.from_pretrained(cfg['tokenizer_path'])

# mlm=False turns off masked-language-model collation.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Both splits share the tokenizer and block size; only the source file differs.
dataset_kwargs = dict(tokenizer=tokenizer, block_size=cfg['block_size'])
train_dataset = TextDataset(file_path=cfg['text_path'], **dataset_kwargs)
dev_dataset = TextDataset(file_path=cfg['text_path_dev'], **dataset_kwargs)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

In [None]:
# Start fine-tuning with the Trainer built in the previous cell
# (its arguments set report_to="wandb").
trainer.train()

In [None]:
# Persist the fine-tuned artefacts, each to its own configured directory:
# the model goes to cfg['output_dir'] (the same path cfg['model_path'] loads
# from) and the tokenizer goes to cfg['tokenizer_output_path'].
model.save_pretrained(cfg['output_dir'])
tokenizer.save_pretrained(cfg['tokenizer_output_path'])