In [None]:
import wandb
from kaggle_secrets import UserSecretsClient
wandb_key = UserSecretsClient().get_secret("wannabe")

wandb.login(key=wandb_key)

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Using device: cpu


In [3]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files='QAG_Train_wop.csv')

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'context'],
        num_rows: 2803
    })
})


In [4]:
def get_feature(batch):
    encodings = tokenizer(batch['question'], text_target=batch['context'], max_length=1024, truncation=True)
    encodings = {'input_ids': encodings['input_ids'], 'attention_mask': encodings['attention_mask'], 'labels': encodings['labels']}
    return encodings

dataset_enc = dataset.map(get_feature, batched=True)
columns=['input_ids', 'labels', 'attention_mask']
dataset_enc.set_format(type='torch',columns=columns)

In [6]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, 
                                       model=model)

training_args = TrainingArguments(
    run_name='bart-large-finetuning',
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=8,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=100,
    eval_steps=100,
    save_steps=500,
    gradient_accumulation_steps=16
)
trainer = Trainer(model=model,
                args=training_args,
                processing_class=tokenizer,
                data_collator=data_collator,
                train_dataset=dataset_enc['train']
                )

trainer.train()

Step,Training Loss
100,5.9297
200,2.8821
300,2.7376
400,2.5402
500,2.4528
600,2.2112
700,2.1571
800,1.858
900,1.7588
1000,1.6436




TrainOutput(global_step=1400, training_loss=2.2768532998221263, metrics={'train_runtime': 54030.9177, 'train_samples_per_second': 0.415, 'train_steps_per_second': 0.026, 'total_flos': 531998784061440.0, 'train_loss': 2.2768532998221263, 'epoch': 7.95897252943275})

In [7]:
dialogue = "Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"
input_ids = tokenizer(dialogue, return_tensors='pt', truncation=True).input_ids
output = model.generate(input_ids)
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Summary: {summary}")

Summary: Amanda: I baked cookies. Do you want some?Jerry: Sure, I would


In [None]:
model.save_pretrained("bart-baseline")