In [1]:
!pip install simplet5



##Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch


##Loading the dataset, keeping only the two columns (headlines and text) that will be used for text summarization.

In [3]:
# Read only the article body ('text') and its headline ('headlines');
# the file is decoded as latin1.
df = pd.read_csv(
    'news_summary.csv',
    usecols=['headlines', 'text'],
    encoding='latin1',
)


##Looking at the first few rows of the dataset

In [4]:
# Preview the first five rows to confirm that only the two requested columns were loaded.
df.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


##Renaming the columns

In [5]:
# The headline becomes the generation target and the article body the input.
# NOTE(review): these are presumably the column names SimpleT5 expects — confirm against its docs.
column_names = {"headlines": "target_text", "text": "source_text"}
df = df.rename(columns=column_names)

In [6]:
# Put the model input (source_text) first and the expected output (target_text) second.
df = df[['source_text','target_text']]

In [7]:
# Confirm the renamed columns now appear in source/target order.
df.head()

Unnamed: 0,source_text,target_text
0,The Administration of Union Territory Daman an...,Daman & Diu revokes mandatory Rakshabandhan in...
1,Malaika Arora slammed an Instagram user who tr...,Malaika slams user who trolled her for 'divorc...
2,The Indira Gandhi Institute of Medical Science...,'Virgin' now corrected to 'Unmarried' in IGIMS...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Aaj aapne pakad liya: LeT man Dujana before be...
4,Hotels in Maharashtra will train their staff t...,Hotel staff to get training to spot signs of s...


##T5 Data Preparation with Summarization Task Prefix

In [10]:
# Prepend the "summarize: " task prefix to every article.
# The prefix is added only to rows that do not already start with it, so
# re-running this cell cannot produce "summarize: summarize: ..." inputs.
TASK_PREFIX = "summarize: "
has_prefix = df['source_text'].str.startswith(TASK_PREFIX, na=False)
# Rows with a missing article stay missing, exactly as with plain concatenation.
df['source_text'] = df['source_text'].where(has_prefix, TASK_PREFIX + df['source_text'])
df

Unnamed: 0,source_text,target_text
0,summarize: The Administration of Union Territo...,Daman & Diu revokes mandatory Rakshabandhan in...
1,summarize: Malaika Arora slammed an Instagram ...,Malaika slams user who trolled her for 'divorc...
2,summarize: The Indira Gandhi Institute of Medi...,'Virgin' now corrected to 'Unmarried' in IGIMS...
3,summarize: Lashkar-e-Taiba's Kashmir commander...,Aaj aapne pakad liya: LeT man Dujana before be...
4,summarize: Hotels in Maharashtra will train th...,Hotel staff to get training to spot signs of s...
...,...,...
4509,summarize: Fruit juice concentrate maker Rasna...,Rasna seeking ?250 cr revenue from snack categ...
4510,summarize: Former Indian cricketer Sachin Tend...,Sachin attends Rajya Sabha after questions on ...
4511,"summarize: Aamir Khan, while talking about rea...",Shouldn't rob their childhood: Aamir on kids r...
4512,summarize: The Maharashtra government has init...,"Asha Bhosle gets ?53,000 power bill for unused..."


##Splitting the dataset into training and test sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the same rows land in each split on every run of the notebook.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df.shape, test_df.shape

((3159, 2), (1355, 2))

##Using SimpleT5 for Model Training

##Downloading the Pre-Trained Model


In [13]:
from simplet5 import SimpleT5

In [14]:
# Create the SimpleT5 wrapper, then load the pre-trained "t5-base" checkpoint into it.
model= SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

##Training the Model

In [15]:
# Fine-tune for 5 epochs, validating on the first 100 rows of the test split.
# train_df[:5000] caps the training rows at 5000; the split above has 3159 rows,
# so all of them are used.
# Inputs are limited to 128 tokens and generated headlines to 50 tokens.
# Use the GPU only when one is actually available, so the cell also runs on
# CPU-only machines instead of failing.
model.train(train_df=train_df[:5000],
            eval_df=test_df[:100],
            source_max_token_len=128,
            target_max_token_len=50,
            batch_size=8, max_epochs=5,
            use_gpu=torch.cuda.is_available())

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  self.pid = os.fork()
INFO:pytorch_lightning.utilities.seed:Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

##Output Folder Content

In [16]:
! ( cd outputs; ls )

simplet5-epoch-0-train-loss-1.5919-val-loss-1.2083
simplet5-epoch-1-train-loss-1.1623-val-loss-1.1656
simplet5-epoch-2-train-loss-0.9639-val-loss-1.2179
simplet5-epoch-3-train-loss-0.9787-val-loss-1.3298
simplet5-epoch-4-train-loss-0.7541-val-loss-1.3248


##Model Inference

In [18]:
# let's load the trained model from the local output folder for inferencing:
# Load the trained model from the local output folder for inference.
# Checkpoint folder names embed that run's losses, so a hard-coded path breaks as
# soon as training is re-run. Pick the checkpoint with the lowest validation loss
# (parsed from the folder name) instead.
# NOTE: "outputs" may still hold checkpoints from earlier runs; clear it before
# retraining if only the latest run should be considered.
import re
from pathlib import Path


def _val_loss(checkpoint_dir):
    """Return the validation loss encoded in a checkpoint folder name.

    Folders without a parsable ``val-loss-<number>`` part rank last (``inf``).
    """
    found = re.search(r"val-loss-(\d+(?:\.\d+)?)", checkpoint_dir.name)
    return float(found.group(1)) if found else float("inf")


checkpoints = [path for path in Path("outputs").glob("simplet5-epoch-*") if path.is_dir()]
if not checkpoints:
    raise FileNotFoundError("No SimpleT5 checkpoints found in 'outputs'; run the training cell first.")
best_checkpoint = min(checkpoints, key=_val_loss)

# Use the GPU only when one is available so inference also works on CPU-only machines.
model.load_model("t5", str(best_checkpoint), use_gpu=torch.cuda.is_available())

##Testing the model

In [19]:
# Article to summarize. The string must begin with the "summarize: " task prefix
# used for the training inputs; the original literal opened with four quotes,
# which put a stray '"' character in front of the prefix.
text_to_summarize="""summarize: Twitter’s interim resident grievance officer for India has stepped down, leaving the micro-blogging site without a grievance official as mandated by the new IT rules to address complaints from Indian subscribers, according to a source.

The source said that Dharmendra Chatur, who was recently appointed as interim resident grievance officer for India by Twitter, has quit from the post.

The social media company’s website no longer displays his name, as required under Information Technology (Intermediary Guidelines and Digital Media Ethics Code) Rules 2021.

Twitter declined to comment on the development.

The development comes at a time when the micro-blogging platform has been engaged in a tussle with the Indian government over the new social media rules. The government has slammed Twitter for deliberate defiance and failure to comply with the country’s new IT rules."""
model.predict(text_to_summarize)

['Twitter’s interim grievance officer for India steps down: Source']

In [20]:
# Summarize a second article; the input starts with the same "summarize: " prefix
# that was prepended to the training inputs.
text_to_summarize="""summarize: Travellers vaccinated with Covishield may not be eligible for the European Union’s ‘Green Pass’ that will be available for use from July 1. Many EU member states have started issuing the digital “vaccine passport” that will enable Europeans to move freely for work or tourism. The immunity passport will serve as proof that a person has been vaccinated against the coronavirus disease (Covid-19), or recently tested negative for the virus, or has the natural immunity built up from earlier infection.Covishield, a version of AstraZeneca Covid vaccine manufactured by Pune-based Serum Institute of India (SII), has not been approved by the EMA for the European market. The EU green pass will only recognise the Vaxzervria version of the AstraZeneca vaccine that is manufactured in the UK or other sites around Europe.
"""
model.predict(text_to_summarize)

['Travellers vaccinated with Covishield not eligible for EU green pass']