In [2]:
!pip install transformers --quiet
!pip install sacremoses --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
import transformers
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizerFast
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import RobertaTokenizer, RobertaForCausalLM
from transformers import XLMTokenizer, XLMWithLMHeadModel

In [4]:
# Load the news articles into a pandas dataframe
df = pd.read_csv('100articles.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  100 non-null    int64  
 1   Unnamed: 0    100 non-null    int64  
 2   id            100 non-null    int64  
 3   title         100 non-null    object 
 4   publication   100 non-null    object 
 5   author        98 non-null     object 
 6   date          100 non-null    object 
 7   year          100 non-null    float64
 8   month         100 non-null    float64
 9   url           0 non-null      float64
 10  content       100 non-null    object 
dtypes: float64(3), int64(3), object(5)
memory usage: 8.7+ KB


In [5]:
#we will use this article at index 0 to see the different summarization models
print('length of the article :',len(df['content'][0]))
print(df['content'][0])

length of the article : 5607
WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conser

# T-5 Pre-Trained Model

In [None]:
# Load the abstractive summarization model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [None]:
# Preprocess the news articles by encoding them as input_ids and attention_mask
inputs = tokenizer(df['content'][0], padding=True, truncation=True, max_length=512, return_tensors='pt')

# Generate summaries for each news article
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

print('Length of the summarized article:',len(summaries[0]))
print(summaries)

Length of the summarized article: 335
['a sudden loss of the disputed subsidies could conceivably cause the health care program to implode. that could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. a sudden loss of the disputed subsidies could conceivably cause the health care program to implode.']


# BART Pre-Trained Model

In [None]:
# Load the abstractive summarization model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') 
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') 

In [None]:

# Preprocess the news articles by encoding them as input_ids and attention_mask
input_ids = tokenizer(df['content'][0], padding=True, truncation=True, max_length=1024, return_tensors='pt').input_ids

# Generate summaries for each news article
outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

print('Length of the summarized article:',len(summaries[0]))
print(summaries)

Length of the summarized article: 379
['House Republicans have a new fear when it comes to their health care lawsuit against the Obama administration. The incoming Trump administration could choose to no longer defend the executive branch. A sudden loss of the disputed subsidies could conceivably cause the health care program to implode. That could lead to chaos in the insurance market and spur a political backlash.']
