In [2]:
!pip install transformers[sentencepiece] sacrebleu py7zr -q
!pip install datasets
!pip install torch



In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import py7zr

In [4]:
dataset=load_dataset('samsum')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [5]:
tokenizer=AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
model=AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')

In [6]:
def convertTextToFeatures(example):
    dialogue_embeddings=tokenizer(example['dialogue'],max_length=1024,truncation=True)
    summary_embeddings=tokenizer(example['summary'],max_length=128,truncation=True)
    return {
        'input_ids':dialogue_embeddings['input_ids'],
        'attention_mask':dialogue_embeddings['attention_mask'],
        'labels':summary_embeddings['input_ids']
    }

In [7]:
dataset_tokenized=dataset.map(convertTextToFeatures)
dataset_tokenized

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [8]:
from transformers import DataCollatorForSeq2Seq
dataseq2seq=DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)

In [None]:
# use this function to calculate loss during training and testing
import evaluate

rouge = evaluate.load("rouge")
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    processing_class=tokenizer,
    data_collator=dataseq2seq,
)


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmuneebzain484[0m ([33mmuneebzain484-university-of-the-punjab[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
0,No log,1.395884
2,1.365400,1.376164


