In [173]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [174]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# The CNN/DailyMail **Dataset**

In [175]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")

print(f"Features in cnn_dailymail : {dataset['train'].column_names}")

Features in cnn_dailymail : ['article', 'highlights', 'id']


In [176]:
sample = dataset["train"][1]
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Article (excerpt of 500 characters, total length: 4051):

Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


# **SAMPLE DATASET FOR 1000 ROWS**

# **Text Summarization Pipelines**

In [177]:
sample_text = dataset["train"][1]["article"][:1000]

# We'll collect the generated summaries of each model in a dictionary
summaries = {}

# **GPT 2**

In [178]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'gpt2-medium' )

gpt2_query = sample_text + "\nTL;DR:\n"

pipe_out = pipe(gpt2_query, max_length = 512, clean_up_tokenization_spaces = True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [179]:
pipe_out

[{'generated_text': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and

In [180]:
pipe_out[0]["generated_text"][len(gpt2_query) :]

'The Eighth Amendment of the U.S. Constitution protects citizens from unreasonable search & seizure. This is what Leifman said in a 2012 hearing before Florida\'s Florida Supreme Court when questioned by judge Patricia Smith to address the problem of mentally ill people being arrested by police. That hearing occurred three months after a local judge in Miami ordered the state of Florida to stop using police officers as stand-by for all mentally ill people, something the U.S. Supreme Court ruled as unconstitutional a month before. "I think I\'ve made it clear that the Fourth Amendment prohibits police officers from using non-law enforcement officers," Leifman said. "And I believe in common law -- and I\'m proud of this fact -- in common law, it is unlawful for police officers to serve as stand-bys for a mentally ill person." He noted that the Fourth Amendment was intended to stop a person suffering involuntary manslaughter when his or her own death threatened to occur as a result of the

In [181]:
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# **BART**

In [182]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)

In [183]:
pipe_out

[{'summary_text': 'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated. Most often, they face drug charges or charges of assaulting an officer. Judge Steven Leifman says the arrests often result from confrontations with police.'}]

In [184]:
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [185]:
summaries["bart"]

'Miami-Dade pretrial detention facility is dubbed the "forgotten floor" Here, inmates with the most severe mental illnesses are incarcerated.\nMost often, they face drug charges or charges of assaulting an officer.\nJudge Steven Leifman says the arrests often result from confrontations with police.'

# **PEGASUS**

In [186]:
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail"  )

pipe_out = pipe(sample_text)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [187]:
pipe_out

[{'summary_text': 'Mentally ill inmates are housed on the "forgotten floor" of a Miami jail .<n>Judge Steven Leifman says the charges are usually "avoidable felonies"<n>He says the arrests often result from confrontations with police .<n>Mentally ill people often won\'t do what they\'re told when police arrive on the scene .'}]

In [188]:
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

## **Comparing Different Summaries**

In [189]:
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])

GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .
GPT2
The Eighth Amendment of the U.S. Constitution protects citizens from unreasonable search & seizure.
This is what Leifman said in a 2012 hearing before Florida's Florida Supreme Court when questioned by judge Patricia Smith to address the problem of mentally ill people being arrested by police.
That hearing occurred three months after a local judge in Miami ordered the state of Florida to stop using police officers as stand-by for all mentally ill people, something the U.S. Supreme Court ruled as unconstitutional a month before.
"I think I've made it clear that the Fourth Amendment prohibits police officers from using non-law enforcement officers," Leifman said.
"And I believe in common law -

# **ROUGE**

In [190]:
from datasets import load_metric
rouge_metric = load_metric('rouge')

In [191]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference = dataset['train'][1]['highlights']

records = []

for model_name in summaries:
    rouge_metric.add(prediction = summaries[model_name], reference = reference )
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
    print('rouge_dict ', rouge_dict )
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index = summaries.keys() )

rouge_dict  {'rouge1': 0.16666666666666666, 'rouge2': 0.043795620437956206, 'rougeL': 0.11594202898550726, 'rougeLsum': 0.15217391304347824}
rouge_dict  {'rouge1': 0.3655913978494624, 'rouge2': 0.13186813186813184, 'rougeL': 0.2150537634408602, 'rougeLsum': 0.3225806451612903}
rouge_dict  {'rouge1': 0.5, 'rouge2': 0.24489795918367346, 'rougeL': 0.36000000000000004, 'rougeLsum': 0.46}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
gpt2,0.166667,0.043796,0.115942,0.152174
bart,0.365591,0.131868,0.215054,0.322581
pegasus,0.5,0.244898,0.36,0.46


# **SAMPLE DATASET FOR 2500 ROWS**

In [222]:
sample_text1 = dataset["train"][1]["article"][:2500]

# We'll collect the generated summaries of each model in a dictionary
summaries1 = {}

# **GPT-2**

In [223]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'gpt2-medium' )

gpt2_query = sample_text1 + "\nTL;DR:\n"

pipe_out = pipe(gpt2_query, max_length = 700, clean_up_tokenization_spaces = True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [224]:
pipe_out

[{'generated_text': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and

In [225]:
pipe_out[0]["generated_text"][len(gpt2_query) :]


"The Eighth Amendment of the Constitution does not offer any protection from being arrested and prosecuted on the ground of insanity. So, mentally ill inmates are placed into jails just because many lawyers advise them they're likely to be convicted on charges of violence and assault or, later, murder. There is no reason to believe these mentally ill people don't deserve treatment. (Also: if your mental illness was aggravated for religious reasons, you should be able to avoid having to be institutionalized as often as not.) The majority of mentally ill people in jail are housed at Miami-Dade County Jail. They often have severe illnesses that are often exacerbated by their illness. There's a belief in Washington, D.C. among mental health advocates that if you're accused of a crime,"

In [226]:
summaries1['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# **BART**

In [227]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text1)

In [228]:
pipe_out

[{'summary_text': 'Mentally ill inmates are housed on the "forgotten floor" of Miami-Dade jail. Judge Steven Leifman says many are charged with "avoidable felonies" He says the arrests often result from confrontations with police. "I am the son of the president," one inmate shouts at CNN.'}]

In [229]:
summaries1["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [230]:
summaries1["bart"]

'Mentally ill inmates are housed on the "forgotten floor" of Miami-Dade jail.\nJudge Steven Leifman says many are charged with "avoidable felonies" He says the arrests often result from confrontations with police.\n"I am the son of the president," one inmate shouts at CNN.'

# **PEGASUS**

In [231]:
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail"  )

pipe_out = pipe(sample_text1)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [232]:
pipe_out

[{'summary_text': 'Soledad O\'Brien takes users inside a jail where many inmates are mentally ill .<n>Inmates with the most severe mental illnesses are incarcerated until trial .<n>Most often, they face drug charges or charges of assaulting an officer .<n>"I am the son of the president. You need to get me out of here!" one man shouts .'}]

In [233]:
summaries1["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

# **Comparing Different Summaries**

In [234]:
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


for model_name in summaries1:
    print(model_name.upper())
    print(summaries1[model_name])

GROUND TRUTH
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .
GPT2
The Eighth Amendment of the Constitution does not offer any protection from being arrested and prosecuted on the ground of insanity.
So, mentally ill inmates are placed into jails just because many lawyers advise them they're likely to be convicted on charges of violence and assault or, later, murder.
There is no reason to believe these mentally ill people don't deserve treatment.
(Also: if your mental illness was aggravated for religious reasons, you should be able to avoid having to be institutionalized as often as not.)
The majority of mentally ill people in jail are housed at Miami-Dade County Jail.
They often have severe illnesses that are often exacerbated by their illness.
There's a b

In [235]:
from datasets import load_metric
rouge_metric1 = load_metric('rouge')

In [236]:
rouge_names1 = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference1 = dataset['train'][1]['highlights']

records = []

for model_name in summaries1:
    rouge_metric1.add(prediction = summaries1[model_name], reference = reference1 )
    score1 = rouge_metric1.compute()
    rouge_dict1 = dict((rn, score1[rn].mid.fmeasure ) for rn in rouge_names1 )
    print('rouge_dict1 ', rouge_dict1 )
    records.append(rouge_dict1)

pd.DataFrame.from_records(records, index = summaries1.keys() )




rouge_dict1  {'rouge1': 0.23913043478260868, 'rouge2': 0.05494505494505494, 'rougeL': 0.10869565217391304, 'rougeLsum': 0.2282608695652174}
rouge_dict1  {'rouge1': 0.6595744680851063, 'rouge2': 0.3913043478260869, 'rougeL': 0.48936170212765956, 'rougeLsum': 0.5531914893617021}
rouge_dict1  {'rouge1': 0.3269230769230769, 'rouge2': 0.15686274509803924, 'rougeL': 0.26923076923076916, 'rougeLsum': 0.28846153846153844}


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
gpt2,0.23913,0.054945,0.108696,0.228261
bart,0.659574,0.391304,0.489362,0.553191
pegasus,0.326923,0.156863,0.269231,0.288462
