In [1]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
import transformers
import pandas as pd


#Tokenizer
from transformers import RobertaTokenizerFast
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import RobertaTokenizer, RobertaForCausalLM


In [5]:
# Load the news articles into a pandas dataframe
df = pd.read_csv('100articles.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  100 non-null    int64  
 1   Unnamed: 0    100 non-null    int64  
 2   id            100 non-null    int64  
 3   title         100 non-null    object 
 4   publication   100 non-null    object 
 5   author        98 non-null     object 
 6   date          100 non-null    object 
 7   year          100 non-null    float64
 8   month         100 non-null    float64
 9   url           0 non-null      float64
 10  content       100 non-null    object 
dtypes: float64(3), int64(3), object(5)
memory usage: 8.7+ KB


# T-5 Pre-Trained Model

In [7]:
# Load the abstractive summarization model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
# Preprocess the news articles by encoding them as input_ids and attention_mask
inputs = tokenizer(df['content'][0], padding=True, truncation=True, max_length=512, return_tensors='pt')

# Generate summaries for each news article
outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Add the summaries to the dataframe
#df['summary'] = summaries

In [11]:
summaries

['a sudden loss of the disputed subsidies could conceivably cause the health care program to implode. that could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. a sudden loss of the disputed subsidies could conceivably cause the health care program to implode.']

In [12]:
df['content'][0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

# BART Pre-Trained Model

In [15]:
# Load the abstractive summarization model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') 
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') 

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [16]:

# Preprocess the news articles by encoding them as input_ids and attention_mask
input_ids = tokenizer(df['content'][0], padding=True, truncation=True, max_length=1024, return_tensors='pt').input_ids

# Generate summaries for each news article
outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Add the summaries to the dataframe
#df['summary'] = summaries

In [17]:
summaries

['House Republicans have a new fear when it comes to their health care lawsuit against the Obama administration. The incoming Trump administration could choose to no longer defend the executive branch. A sudden loss of the disputed subsidies could conceivably cause the health care program to implode. That could lead to chaos in the insurance market and spur a political backlash.']

# GPT-2 Pre-Trained Model

In [18]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the abstractive summarization model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [28]:
# Preprocess the news articles by encoding them as input_ids and attention_mask
inputs = tokenizer.encode_plus(df['content'][0], padding=True, truncation=True, max_length=150, return_tensors='pt')
input_ids = inputs['input_ids']

# Truncate the input_ids to match the target size
input_ids = input_ids[:,:150]

# Generate summaries for each news article
outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]



summaries

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


RuntimeError: ignored

In [30]:
# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForCausalLM.from_pretrained('roberta-base')

# Preprocess the news articles by encoding them as input_ids and attention_mask
input_ids = tokenizer.encode(df['content'][0], padding='max_length', truncation=True, max_length=1024, return_tensors='pt')

# Generate summaries for each news article
outputs = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0)

# Decode the summaries from the output_ids
summaries = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
summaries

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Input length of input_ids is 1024, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


RuntimeError: ignored