# Import All Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate

sns.set()

  from .autonotebook import tqdm as notebook_tqdm


# Ignore All Warnings

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Load the DataSet

Note that the dataset has been found on Hugging Face at the link: https://huggingface.co/datasets/multi_news

In [12]:
from datasets import load_dataset

dataset = load_dataset("multi_news", split = "test")

# Analyze and Split the Data

In [13]:
temp = dataset.to_pandas()
temp.head(3)

Unnamed: 0,document,summary
0,GOP Eyes Gains As Voters In 11 States Pick Gov...,– It's a race for the governor's mansion in 11...
1,\n \n \n \n UPDATE: 4/19/2001 Read Richard Met...,– It turns out Facebook is only guilty of abou...
2,It's the Golden State's latest version of the ...,– Not a big fan of Southern California? Neithe...


In [18]:
data = dataset.train_test_split(test_size = 0.2)

# Load the t5 pretrained Model

This model has been extracted using the Hugging Face Library as well. Check it out on: https://huggingface.co/DunnBC22/flan-t5-base-text_summarization_data

In [14]:
tokenizer = AutoTokenizer.from_pretrained("DunnBC22/flan-t5-base-text_summarization_data")
model = AutoModelForSeq2SeqLM.from_pretrained("DunnBC22/flan-t5-base-text_summarization_data")

In [41]:
tokenizer.save_pretrained("./models/")

('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/tokenizer.json')

In [43]:
torch.save(model.state_dict(), './model.pth')

In [8]:
model = torch.load('./model.pth')
tokenizer = AutoTokenizer.from_pretrained("./models/")

In [15]:
def prepfunc(testval):

    inputs = ["Summ: " + doc for doc in testval["document"]]

    inputs = tokenizer(inputs, max_length=5024, truncation=True)
    labels = tokenizer(text=testval["summary"], max_length=500, truncation=True)
    inputs["labels"] = labels["input_ids"]

    return inputs

In [19]:
tokenized_data = data.map(prepfunc, batched=True)

Map:   0%|          | 0/4497 [00:00<?, ? examples/s]

Map: 100%|██████████| 4497/4497 [00:53<00:00, 84.17 examples/s] 
Map: 100%|██████████| 1125/1125 [01:02<00:00, 17.93 examples/s]


# Hyperparameter Tuning

In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./res",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    seed= 42,
    lr_scheduler_type= "linear",
)

# Training

In [27]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model='DunnBC22/flan-t5-base-text_summarization_data')

In [28]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [29]:
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 14.70 GiB (GPU 0; 8.00 GiB total capacity; 4.78 GiB already allocated; 1.29 GiB free; 4.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF