In [1]:
import wandb
from kaggle_secrets import UserSecretsClient
wandb_key = UserSecretsClient().get_secret("wannabe")
wandb.login(key=wandb_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mngjabach[0m ([33mngjabach-hanoi-university-of-science-and-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Using device: cuda


In [3]:
import pandas as pd

class TokenDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item
    def __len__(self):
        return len(self.labels['input_ids'])

def tokenize_data(train_question, train_context):
    encodings = tokenizer(train_context, truncation=True, padding=True, max_length = 512)
    decodings = tokenizer(train_question, truncation=True, padding=True, max_length = 512)
    dataset_tokenized = TokenDataset(encodings, decodings)
    return dataset_tokenized

df = pd.read_csv("/kaggle/input/qag-wop/QAG_Train_wop.csv")
df.rename(columns = {'question':'question', 'context':'context'}, inplace = True)
df.keys()

train_question, train_context = (list(df['question'])), (list(df['context']))
train_data = tokenize_data(train_question, train_context)

print(train_data)

<__main__.TokenDataset object at 0x7aceb8f0d720>


In [4]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    run_name='bart-large-finetuning',
    output_dir='./results',
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=2000,
    logging_dir=None,
    per_device_train_batch_size=1,
    num_train_epochs=8,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_data
)

trainer.train()

model.save_pretrained("bart-baseline")

Step,Training Loss
2000,1.2313
4000,0.2969
6000,0.2699
8000,0.2112
10000,0.1631
12000,0.1105
14000,0.0905
16000,0.0597
18000,0.05
20000,0.039




In [10]:
dialogue = "Fox running over white fox."

input_ids = tokenizer(dialogue, return_tensors='pt', 
                      max_length=1024, truncation=True).input_ids.to(device)
output = model.generate(input_ids, max_length=1024, early_stopping=False)

summary = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Summary: {summary}")

Summary: What do you mean by fox running over white fox?


In [7]:
%cd /kaggle/working
from IPython.display import FileLink
FileLink('bart-baseline/model.safetensors')

/kaggle/working
