# Bart-paraphrase

In [1]:
import pandas as pd
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [5]:
df = pd.read_csv("../data/raw/suitable.csv", index_col=0)
df.head()

Unnamed: 0,toxic,non-toxic
5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...
6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.
13,"Come on, Cal, leave that shit alone.","come on, Cal, put it down."
22,"Real life starts the first time you fuck, kid.","boy, real life starts up first."
25,"Shit, this one I can't even pronounce.","gosh, I can't even pronounce this."


In [6]:
seed = 177013
train = df.sample(10000, random_state=seed)
val = df.drop(train.index).sample(1000, random_state=seed)
print(len(train), len(val))

10000 1000


## Model training

In [14]:
model_name = "eugenesiow/bart-paraphrase"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [30]:
def tokenize_function(data):
    inputs = ["paraphrase to be nontoxic: \n" + text for text in data["toxic"]]
    targets = data["non-toxic"]
    return tokenizer(inputs, text_target=targets, max_length=64, truncation=True)

train_dataset = Dataset.from_pandas(train).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val).map(tokenize_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [32]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/bart",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_strategy="no",
    num_train_epochs=10,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.955216
2,0.348700,3.058694
3,0.348700,3.259816
4,0.242000,3.297467
5,0.160400,3.559584
6,0.160400,3.425686
7,0.106700,3.573632
8,0.070000,3.589448
9,0.070000,3.651816
10,0.046300,3.711815


TrainOutput(global_step=3130, training_loss=0.157196182031601, metrics={'train_runtime': 2030.6712, 'train_samples_per_second': 49.245, 'train_steps_per_second': 1.541, 'total_flos': 1.01777729028096e+16, 'train_loss': 0.157196182031601, 'epoch': 10.0})

In [33]:
trainer.save_model("../models/bart")

## Check model

In [2]:
my_model = AutoModelForSeq2SeqLM.from_pretrained("../models/bart")
my_tokenizer = AutoTokenizer.from_pretrained("../models/bart")

In [3]:
my_model.eval()
my_model.config.use_cache = False

In [4]:
def generate(model, tokenizer, prompt):
    prompt = "paraphrase to be nontoxic: \n" + prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


generate(my_model, my_tokenizer, "You stupid bastards, I said to you, go to the fucking mid. Retard!")

'I said to you, go to the mid.'

In [7]:
test = df.drop(train.index).drop(val.index).sample(1000, random_state=seed)
print(len(test))

1000


In [8]:
test.head()

Unnamed: 0,toxic,non-toxic
399522,I'll show you who's got bigger butts on this s...,I'll show you who's got The bigger bottom roun...
32832,"""I sit on the floor and pick my nose and think...","""I sit down and I pick my nose, and on my nose..."
126205,did you know I really thought this grease monk...,"You know, I thought that greaser really did tu..."
151854,"Just go, tyler, get the hell away from me.","go, Tyler, get away from me."
218096,"'You're putrid, you know that?' Danny said.","""you're cute, you know?"" Said Danny."


In [9]:
test['generated'] = test['toxic'].map(lambda prompt: generate(my_model, my_tokenizer, prompt))
test.head()

Unnamed: 0,toxic,non-toxic,generated
399522,I'll show you who's got bigger butts on this s...,I'll show you who's got The bigger bottom roun...,I'll show you who's got more cigarettes on thi...
32832,"""I sit on the floor and pick my nose and think...","""I sit down and I pick my nose, and on my nose...","""I sit on the floor and pick at people's shoes..."
126205,did you know I really thought this grease monk...,"You know, I thought that greaser really did tu...",Did you know I really thought that this crooke...
151854,"Just go, tyler, get the hell away from me.","go, Tyler, get away from me.","go, tyler, get away from me."
218096,"'You're putrid, you know that?' Danny said.","""you're cute, you know?"" Said Danny.","""you're in trouble, you know that?"" Danny said."


## Save results

In [10]:
test.to_csv("../data/interim/bart_pred.csv")