In [5]:
import pandas as pd
from datasets import Dataset, load_dataset, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score

df = pd.read_csv('edu_all.csv')
# max_len_text = df['source_article'].apply(len).idxmax()
# max_len_label =  df['updated_label'].apply(len).idxmax()

ds = Dataset.from_pandas(df)

model_checkpoint="google/t5-v1_1-base"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
task_prefix = "Detect Fallacy: "

In [6]:
def tokenize(text):
    t5_prepared_text = task_prefix + text['source_article']
    tokenized_text = tokenizer(t5_prepared_text, padding="max_length", truncation=True, max_length=256)
    tokenized_labels = tokenizer(text['updated_label'], padding="max_length", truncation=True, max_length=22)
    tokenized_text['label'] = tokenized_labels['input_ids']

    return tokenized_text

print(tokenize(ds[0]))

{'input_ids': [3, 31636, 2589, 4710, 10, 96, 188, 11113, 398, 114, 24474, 250, 66, 872, 3567, 114, 24474, 535, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
enc_ds = ds.map(tokenize, load_from_cache_file=False)
enc_ds = enc_ds.remove_columns(["updated_label", 'original_url', 'old_label', 'source_article', 'explanations', 'rationale'])

100%|██████████| 2452/2452 [00:01<00:00, 2420.47ex/s]


In [8]:
enc_ds = enc_ds.train_test_split(test_size=0.1)
train_ds = enc_ds['train']
eval_ds = enc_ds['test']

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=2136,
)
training_args = TrainingArguments(
    output_dir="FallacyDetectorT5",
    evaluation_strategy="epoch",
    report_to="none",
    learning_rate=1e-4,
    save_steps=100,
    save_total_limit=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
)

In [10]:
trainer.train()
trainer.save_model("./FallacyDetectorT5/")

***** Running training *****
  Num examples = 2206
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 6618
  0%|          | 0/6618 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.00 GiB total capacity; 9.16 GiB already allocated; 0 bytes free; 9.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF