In [2]:
!pip install transformers --quiet
!pip install sacremoses --quiet

In [3]:
import transformers
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizerFast
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import RobertaTokenizer, RobertaForCausalLM
from transformers import XLMTokenizer, XLMWithLMHeadModel

In [4]:
# Load the news articles into a pandas dataframe
df = pd.read_csv('100articles.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  100 non-null    int64  
 1   Unnamed: 0    100 non-null    int64  
 2   id            100 non-null    int64  
 3   title         100 non-null    object 
 4   publication   100 non-null    object 
 5   author        98 non-null     object 
 6   date          100 non-null    object 
 7   year          100 non-null    float64
 8   month         100 non-null    float64
 9   url           0 non-null      float64
 10  content       100 non-null    object 
dtypes: float64(3), int64(3), object(5)
memory usage: 8.7+ KB


In [10]:
#we will use this article at index 0 to see the different summarization models
df['content'][0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

# T-5 Pre-Trained Model

In [14]:
# Load the abstractive summarization model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [9]:
# Preprocess the news articles by encoding them as input_ids and attention_mask
inputs = tokenizer(df['content'][0], padding=True, truncation=True, max_length=512, return_tensors='pt')

# Generate summaries for each news article
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

summaries

['a sudden loss of the disputed subsidies could conceivably cause the health care program to implode. that could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. a sudden loss of the disputed subsidies could conceivably cause the health care program to implode.']

# BART Pre-Trained Model

In [15]:
# Load the abstractive summarization model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') 
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') 

In [12]:

# Preprocess the news articles by encoding them as input_ids and attention_mask
input_ids = tokenizer(df['content'][0], padding=True, truncation=True, max_length=1024, return_tensors='pt').input_ids

# Generate summaries for each news article
outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

summaries

['House Republicans have a new fear when it comes to their health care lawsuit against the Obama administration. The incoming Trump administration could choose to no longer defend the executive branch. A sudden loss of the disputed subsidies could conceivably cause the health care program to implode. That could lead to chaos in the insurance market and spur a political backlash.']

# XML Transformers

In [5]:
# Load the tokenizer and model
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-tlm-xnli15-1024')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-tlm-xnli15-1024')

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-tlm-xnli15-1024 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:

# Set max_length
max_length = 512

# Split text into chunks
text = df['content'][0]
text_chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]


# Summarize each chunk
summaries = []
for chunk in text_chunks:
    inputs = tokenizer.batch_encode_plus([chunk], return_tensors='pt', max_length=max_length)
    summary_ids = model.generate(inputs['input_ids'], early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summaries.append(summary)

# Combine summaries
XLM_summary = ' '.join(summaries)

print(XLM_summary)

Input length of input_ids is 102, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 112, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 117, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 121, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 113, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 119, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 118, but `max_length` is set to 20. This can lead to unexpe

washington - congressional republicans have a new fear when it comes to their health care lawsuit against the obama administration : they might win. the incoming trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration's authority to spend billions of dollars on health insurance subsidies for and americans, handing house republicans a big victory on issues. but a sudden loss of the disputed subsidies could conceivably cause obama the health care program to implode, leaving millions of people without access to health insurance before republicans have prepared a replacement. that could lead to chaos in the insurance market and spur a political backlash just as republicans gain full control of the government. to stave off that outcome, republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the obama health care law, angering conservative voters who have been demandi