# Import All Libraries

In [1]:
!pip install datasets



In [2]:
!pip install transformers
!pip install accelerate



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os
from torch import nn
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import accelerate

sns.set()

In [4]:
print(torch.cuda.is_available())
torch.cuda.get_device_name(0)

True


'Tesla T4'

# Ignore All Warnings

In [5]:
import warnings
warnings.filterwarnings("ignore")

# Load the DataSet

Note that the dataset has been found on Hugging Face at the link: https://huggingface.co/datasets/multi_news

In [6]:
from datasets import load_dataset

dataset = load_dataset("multi_news", split = "test")

# Analyze and Split the Data

In [7]:
temp = dataset.to_pandas()
temp.head(3)

Unnamed: 0,document,summary
0,GOP Eyes Gains As Voters In 11 States Pick Gov...,– It's a race for the governor's mansion in 11...
1,\n \n \n \n UPDATE: 4/19/2001 Read Richard Met...,– It turns out Facebook is only guilty of abou...
2,It's the Golden State's latest version of the ...,– Not a big fan of Southern California? Neithe...


In [8]:
data = dataset.train_test_split(test_size = 0.2)

# Load the t5 pretrained Model

This model has been extracted using the Hugging Face Library as well. Check it out on: https://huggingface.co/DunnBC22/flan-t5-base-text_summarization_data

In [9]:
tokenizer = AutoTokenizer.from_pretrained("DunnBC22/flan-t5-base-text_summarization_data")

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model='DunnBC22/flan-t5-base-text_summarization_data')

In [11]:
def prepfunc(testval):

    inputs = ["Summ: " + doc for doc in testval["document"]]
    inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text=testval["summary"], max_length=124, truncation=True)
    inputs["labels"] = labels["input_ids"]

    return inputs

In [12]:
tokenized_data = data.map(prepfunc, batched=True)

Map:   0%|          | 0/4497 [00:00<?, ? examples/s]

Map:   0%|          | 0/1125 [00:00<?, ? examples/s]

In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained("DunnBC22/flan-t5-base-text_summarization_data")

# Hyperparameter Tuning

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./res",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=True,
    seed = 42,
    push_to_hub = True,
)

# Training

In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [16]:
#torch.cuda.empty_cache()

In [17]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,


TrainOutput(global_step=1800, training_loss=0.0, metrics={'train_runtime': 1955.9439, 'train_samples_per_second': 4.598, 'train_steps_per_second': 0.92, 'total_flos': 1.2316505812475904e+16, 'train_loss': 0.0, 'epoch': 2.0})

In [18]:
trainer.save_model("./model/")

In [19]:
document = "National Archives Yes, it's that time again, folks. It's the first Friday of the month, when for one ever-so-brief moment the interests of Wall Street, Washington and Main Street are all aligned on one thing: Jobs. A fresh update on the U.S. employment situation for January hits the wires at 8:30 a.m. New York time offering one of the most important snapshots on how the economy fared during the previous month. Expectations are for 203,000 new jobs to be created, according to economists polled by Dow Jones Newswires, compared to 227,000 jobs added in February. The unemployment rate is expected to hold steady at 8.3%. Here at MarketBeat HQ, we’ll be offering color commentary before and after the data crosses the wires. Feel free to weigh-in yourself, via the comments section. And while you're here, why don't you sign up to follow us on Twitter. Enjoy the show. ||||| Employers pulled back sharply on hiring last month, a reminder that the U.S. economy may not be growing fast enough to sustain robust job growth. The unemployment rate dipped, but mostly because more Americans stopped looking for work. The Labor Department says the economy added 120,000 jobs in March, down from more than 200,000 in each of the previous three months. The unemployment rate fell to 8.2 percent, the lowest since January 2009. The rate dropped because fewer people searched for jobs. The official unemployment tally only includes those seeking work. The economy has added 858,000 jobs since December _ the best four months of hiring in two years. But Federal Reserve Chairman Ben Bernanke has cautioned that the current hiring pace is unlikely to continue without more consumer spending."
human_summary = """– The unemployment rate dropped to 8.2% last month, but the economy only added 120,000 jobs, when 203,000 new jobs had been predicted, according to today's jobs report. Reaction on the Wall Street Journal's MarketBeat Blog was swift: "Woah!!! Bad number." The unemployment rate, however, is better news; it had been expected to hold steady at 8.3%. But the AP notes that the dip is mostly due to more Americans giving up on seeking employment."""

In [20]:
def predict_summary(document):
  device = model.device
  tokenized = tokenizer([document], truncation =True, padding ='longest',return_tensors='pt')
  tokenized = {k: v.to(device) for k, v in tokenized.items()}
  tokenized_result = model.generate(**tokenized, max_length=124)
  tokenized_result = tokenized_result.to('cpu')
  predicted_summary = tokenizer.decode(tokenized_result[0])
  return predicted_summary

In [21]:
human_summary

'– The unemployment rate dropped to 8.2% last month, but the economy only added 120,000 jobs, when 203,000 new jobs had been predicted, according to today\'s jobs report. Reaction on the Wall Street Journal\'s MarketBeat Blog was swift: "Woah!!! Bad number." The unemployment rate, however, is better news; it had been expected to hold steady at 8.3%. But the AP notes that the dip is mostly due to more Americans giving up on seeking employment.'

In [22]:
predicted_summary = predict_summary(document)
predicted_summary

"<pad> – The unemployment rate fell sharply in March, but the numbers aren't so bad: The unemployment rate fell to 8.2%, the lowest since January 2009. The unemployment rate is expected to hold steady at 8.3%, reports MarketBeat. The unemployment rate is expected to hold steady at 8.3%, reports the Wall Street Journal. The economy added 120,000 jobs in March, down from more than 200,000 in each of the previous three months. The unemployment rate dropped to 8.2%, the lowest since January 2009. The official unemployment tally only includes those seeking work. The economy has added 858,000"

In [24]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

!pip install huggingface_hub



In [43]:
from huggingface_hub import notebook_login

In [44]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
trainer.push_to_hub("Hari93/t5-data-on-multi-news")

Cloning https://huggingface.co/Hari93/res into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/945M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/4.00k [00:00<?, ?B/s]

To https://huggingface.co/Hari93/res
   f5c063e..c3b9390  main -> main

   f5c063e..c3b9390  main -> main

To https://huggingface.co/Hari93/res
   c3b9390..81ac2e0  main -> main

   c3b9390..81ac2e0  main -> main



'https://huggingface.co/Hari93/res/commit/c3b9390e035e45330cc4946dd99076976bb56094'