In [1]:
import json
import pandas as pd
import torch
import numpy as np
from time import time
from transformers import BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

class BARTSummarizer():
    """
    BART summarizer model, fine-tuned on "El Universal" news articles.
    Reference ("true label") summaries were generated with the StableBeluga7B LLM.
    """
    # Longest tokenized article (in tokens) that is passed on to the model.
    MAX_INPUT_TOKENS = 1024

    def __init__(self, path_to_model = "./model_save/"):
        """
        Load the fine-tuned BART model, its tokenizer and the sentence-embedding model.
        Receives:
         - path_to_model: str: Directory prefix that contains the "bart_summarizer" folder.
        """
        # Use the first GPU when one is available, otherwise fall back to CPU
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model_name = "{0}bart_summarizer".format(path_to_model)
        self.model = BartForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        # Inference only
        self.model.eval()
        # Embedding model used to rank sentences when an article has to be shortened
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2").to(self.device)

    @staticmethod
    def get_most_relevant_sentences(article_text, embeddings_model, top_n_perc=.5):
        """
        Extract only the "top_n_perc" sentences of a text "article_text".
        Top n sentences are the sentences that are most similar to the whole text.
        It is a simple form of extractive summarization.
        Steps:
         1. The whole text is broken into individual sentences.
         2. The embedding of the whole text is computed.
         3. Embeddings of each individual sentence are computed.
         4. Cosine similarity of each sentence against the embedding of the whole text is computed.
         5. The top n sentences (the ones that most closely resemble the whole text) are kept.
        Receives:
         - article_text: str: The text of a news article.
         - embeddings_model: object: The model used for embeddings. It must expose an
           .encode method that computes the embeddings.
         - top_n_perc: float: Fraction of the sentences to keep.
        Returns:
         - Text containing only the top n % most representative sentences, in their
           original order. An empty string when the text has no sentences.
        """
        # break text in sentences
        sentences = nltk.sent_tokenize(article_text)
        if not sentences:
            # nothing to rank; avoids computing similarities against an empty set
            return ''
        # compute embedding of the whole text
        whole_text_embedding = embeddings_model.encode(article_text, show_progress_bar=False)
        # compute embeddings of each sentence individually
        sentences_embeddings = embeddings_model.encode(sentences, show_progress_bar=False)
        # compute cosine similarities of the whole text vs each individual sentence
        cosine_sims = cosine_similarity(
            whole_text_embedding.reshape(1, -1),
            sentences_embeddings
        )
        # one row per sentence; the default integer index records the original position
        sentences_df = pd.DataFrame({
            'sentence': sentences,
            'similarity': cosine_sims[0],
        })
        # n sentences tied to top_n_perc of the article
        top_n = round(len(sentences) * top_n_perc)
        most_relevant_sentences = (
            sentences_df
            # most similar sentences at the top
            .sort_values(by='similarity', ascending=False)
            # keep the top n
            .head(top_n)
            # sort them back by how they appear in the original text
            .sort_index()
        )['sentence'].tolist()
        return ' '.join(most_relevant_sentences)

    def _tokenize_article(self, article_text):
        """
        Tokenize an article text. If the tokenized article is longer than
        MAX_INPUT_TOKENS tokens, shorten the text by keeping only its most relevant
        sentences, and keep shortening (with a smaller fraction each round) until
        the text fits.
        If shortening would leave no text at all, the current text is instead
        truncated by the tokenizer to MAX_INPUT_TOKENS tokens.
        Returns:
         - The input ids tensor, moved to self.device.
        """
        perc_to_keep = 0.9
        while True:
            inputs = self.tokenizer(article_text, return_tensors='pt')
            if inputs['input_ids'].shape[1] <= self.MAX_INPUT_TOKENS:
                # text fits
                break
            # tokenized article exceeded the limit: shorten the text
            shortened_text = self.get_most_relevant_sentences(
                article_text,
                self.embedding_model,
                top_n_perc = perc_to_keep
            )
            if not shortened_text.strip():
                # Sentence selection dropped everything (e.g. one huge sentence):
                # fall back to plain truncation rather than summarizing nothing.
                inputs = self.tokenizer(
                    article_text,
                    return_tensors='pt',
                    truncation=True,
                    max_length=self.MAX_INPUT_TOKENS
                )
                break
            article_text = shortened_text
            perc_to_keep -= 0.1
        return inputs['input_ids'].to(self.device)

    def summarize(self, article_txt):
        """
        Perform the summarization task.
        Receives:
         - article_txt: str: The text of a news article.
        Returns:
         - str: The generated summary, decoded without special tokens.
        """
        inputs = self._tokenize_article(article_txt)
        # no gradients are needed for generation
        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs,
                num_beams=4,
                max_length=250,
                early_stopping=True
            )
        summary = self.tokenizer.decode(
            summary_ids[0],
            skip_special_tokens=True
        )
        return summary

In [2]:
# Read the validation set (articles paired with their StableBeluga summaries)
validation_path = 'datasets/BART_validation_data.json'
with open(validation_path, "r") as validation_file:
    validation_set = json.load(validation_file)

In [3]:
# Build the summarizer with its default model location ("./model_save/")
summarizer = BARTSummarizer()

In [4]:
# Summarize the first validation article and show it next to the reference summary
ix = 0
sample = validation_set[ix]
art, sb_summ = sample['article'], sample['summary']
summary = summarizer.summarize(art)

separator = '----------------'
print(
    "News Article:", art, separator,
    "BART Summary:", summary, separator,
    "StableBeluga Summary:", sb_summ,
    sep='\n',
)

News Article:
The divorce between Alfredo Adame and Mary Paz Banquells, more than bad things, seems to have brought prosperity to the actress's house and her three children, because after the separation and the arrival of the pandemic, they found themselves in a compromised economic situation, which led her and them to start businesses that started being home and today, almost three years later, have become a success. In their visit to “Winner”, Mary Paz Banquells and her eldest son, Diego Adame, spoke of the dessert business they have undertaken as a family, after the pandemic by Covid-19 forced them to put a pause in their business, in which they hosted puppies from families who went on a trip and left their pets to their care. Diego recalled that, from one day to the next, the profits generated by babysitting doggies were frozen, because people were confined, so they could not leave the house and, therefore, they did not need to take care of their doggies, on the contrary, they coul

In [5]:
# Summarize the second validation article and show it next to the reference summary
ix = 1
sample = validation_set[ix]
art, sb_summ = sample['article'], sample['summary']
summary = summarizer.summarize(art)

separator = '----------------'
print(
    "News Article:", art, separator,
    "BART Summary:", summary, separator,
    "StableBeluga Summary:", sb_summ,
    sep='\n',
)

News Article:
portfolio@eluniversal.com.mx The governor of the Banco de México (Banxico), Alejandro Díaz de León, said that the national economy has stagnated in the face of an environment of internal uncertainty and external factors that pressure the performance of the country. “The average of the last four to five quarters has a very low growth, we could talk about a certain stagnation in the economic activity,” he said. By participating in the Banorte Strategy Forum 2019, Díaz de León stressed that among the elements that have led to this condition of the Mexican economy is the contraction in the industrial production, as well as a slowdown in the services. “We are facing an environment of lower internal economic slowdown than what was anticipated and the external environment has not helped.” In the opinion of the official, weaknesses in governance prevail in the country that affect the growth, such is the use of insecurity and the strengthening of the rule of law. He added that the

In [6]:
# Number of entries in the validation set
len(validation_set)

44277

In [14]:
# Summarize a randomly chosen validation article and show it next to the reference summary
ix = np.random.randint(0, len(validation_set))
sample = validation_set[ix]
art, sb_summ = sample['article'], sample['summary']
summary = summarizer.summarize(art)

separator = '----------------'
print(
    "News Article:", art, separator,
    "BART Summary:", summary, separator,
    "StableBeluga Summary:", sb_summ,
    sep='\n',
)

News Article:
Zacatecas.- The state and federal electoral authorities in Zacatecas confirmed that the basic box 667 could not be installed in the community of Guadalupe Victoria in the municipality of Jerez, because the cell officials were threatened with death, therefore, the non-installation of that box was determined for the safety of the electors and officials. This confirmation was announced in the first incident court issued in the sessions of the electoral councils of the National Electoral Institute (INE) and the Electoral Institute of the State of Zacatecas (IEEZ), so it was determined that the electoral packages of this cell will be returned intact to the facilities of the district and municipal councils. During the session of the General Council of the IEEZ held at 11:00 a.m. for the first cut of incidents, it was announced that in the Incident System of the Election Day (SIJE) was confirmed the non-final installation of a “destination box” in the municipality of Jerez, whic

In [7]:
def print_progress_bar(iteration, total, bar_length=50):
    """
    Print a single-line text progress bar that overwrites itself.

    Receives:
     - iteration: number of items processed so far.
     - total: total number of items. A total of zero or less is treated as
       "nothing to do" and shown as complete.
     - bar_length: width of the bar, in characters.
    """
    if total <= 0:
        # avoid dividing by zero on an empty workload
        progress = 1.0
    else:
        # clamp so the bar never under- or overflows its fixed width
        progress = min(max(float(iteration) / float(total), 0.0), 1.0)
    # the filled part is one character shorter than the rounded progress
    arrow = '=' * (int(round(progress * bar_length)) - 1)
    spaces = ' ' * (bar_length - len(arrow))

    # '\r' returns the cursor to the line start so the next call redraws in place
    print(f'Progress: [{arrow + spaces}] {int(progress * 100)}%', end='\r')

In [8]:
# Tabular view of the validation set; BART summaries and similarity scores are added to it below
validation_df = pd.DataFrame.from_dict(validation_set)

In [9]:
# Summarize every validation article with BART, reporting progress as we go
n_articles = len(validation_set)
bart_summaries = []
for i, entry in enumerate(validation_set):
    print_progress_bar(iteration=i, total=n_articles)
    summary = summarizer.summarize(entry['article'])
    bart_summaries += [summary]



In [10]:
# Attach the generated BART summaries as a new column, aligned by position with the articles
validation_df['BART_summaries'] = bart_summaries

In [12]:
# Persist articles, reference summaries and BART summaries as a JSON list of records
validation_df.to_json('datasets/bart_summaries_val_set.json', orient='records')

## Compare summaries by ROUGE metric

In [15]:
import evaluate

In [18]:
# Load the ROUGE metric through the `evaluate` library
rouge = evaluate.load('rouge')

In [20]:
# Score the BART summaries against the StableBeluga reference summaries
rouge_options = dict(use_aggregator=True, use_stemmer=True)
instruct_model_results = rouge.compute(
    predictions=bart_summaries,
    references=validation_df['summary'],
    **rouge_options,
)

print('INSTRUCT MODEL ROGUE METRIC:', instruct_model_results, sep='\n')

INSTRUCT MODEL ROGUE METRIC:
{'rouge1': 0.6629992148125218, 'rouge2': 0.46701095620010635, 'rougeL': 0.5513249818595796, 'rougeLsum': 0.5515254233129148}


## Compare summaries using Cosine Similarity

In [22]:
# Reuse the sentence-embedding model already loaded by the summarizer
emb_model = summarizer.embedding_model

In [25]:
# Embed the reference (StableBeluga) summaries and the BART summaries
embs1 = emb_model.encode(validation_df['summary'])
embs2 = emb_model.encode(validation_df['BART_summaries'])

In [27]:
# Compute cosine similarity between every reference summary and every BART summary
# NOTE(review): this builds the full pairwise matrix (len(embs1) x len(embs2)), which is
# memory-heavy for tens of thousands of rows; only its diagonal is used afterwards, so a
# row-wise computation would be enough.
sims_mat = cosine_similarity(embs1, embs2)

In [28]:
# Keep the diagonal: similarity of each reference summary with the BART summary at the same position
similarities = np.diag(sims_mat)

In [30]:
# Summary statistics of the per-pair similarities
pd.Series(similarities).describe()

count    44277.000000
mean         0.889844
std          0.080382
min          0.022816
25%          0.858303
50%          0.906960
75%          0.942281
max          1.000000
dtype: float64

On average, the BART summaries reach a cosine similarity of about 0.89 with the StableBeluga summaries. The gap between the minimum (0.02) and the median (0.91) suggests that outliers are dragging the mean down.

In [31]:
# Store the per-pair similarity next to each article so rows can be filtered by it
validation_df['summary_similarity']=similarities

In [33]:
# Outliers: rows whose summary similarity is below 0.1
validation_df[validation_df['summary_similarity']<.1]

Unnamed: 0,article,summary,BART_summaries,summary_similarity
80,,I will summarize the text for you. Please prov...,,0.060218
691,,I will summarize the text for you. Please prov...,,0.060218
1928,,I will summarize the text for you. Please prov...,,0.060218
2786,cvtp,I cannot provide a summary without more contex...,,0.039395
3176,,I will summarize the text for you. Please prov...,,0.060218
...,...,...,...,...
42195,,I will summarize the text for you. Please prov...,,0.060218
43166,,I will summarize the text for you. Please prov...,,0.060218
43671,,I will summarize the text for you. Please prov...,,0.060218
43918,,I will summarize the text for you. Please prov...,,0.060218


Most outliers consist of empty article texts, which produce empty BART summaries.

In [41]:
# Outliers for which BART did produce a (non-empty) summary
has_bart_summary = validation_df['BART_summaries'] != ''
is_low_similarity = validation_df['summary_similarity'] < .1
validation_df[has_bart_summary & is_low_similarity]

Unnamed: 0,article,summary,BART_summaries,summary_similarity
19938,They have earned a place in the taste of the p...,.,Mexican celebrities have achieved success in v...,0.036312
26570,jlcg,I cannot summarize this as it appears to be a ...,Jjlcg:,0.065454
35191,The National Weather Service (SMN) pointed out...,.,The National Weather Service (SMN) predicts th...,0.022816


We can see that outliers are present because some article texts are empty or incoherent, and because some of the baseline summaries produced by the LLM are not good summaries. We can further refine the model by removing these noisy observations from the training data.

For the purpose of this exercise, we can use the median (the 50% statistic), which is roughly `.90`, as the similarity between the summaries that our lightweight BART model produces and the summaries produced by the `StableBeluga-7B` LLM.

# Compare the time it takes for each model to perform the Summarization task

In [4]:
# Wall-clock time, in seconds, of each BART summarization over the first 500 articles
times = []
for entry in validation_set[:500]:
    start = time()
    summary = summarizer.summarize(entry['article'])
    end = time()
    secs = end - start
    times += [secs]

In [8]:
# Average time per summarization is about 1.2 seconds. That is roughly 3x faster than
# the 3 to 4 seconds that StableBeluga takes to summarize.
np.mean(times)

1.2406021947860717

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [4]:
# Load the StableBeluga-7B tokenizer and model (bfloat16 weights) and move the model to the first GPU
tokenizer = AutoTokenizer.from_pretrained("stabilityai/StableBeluga-7B", use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/StableBeluga-7B",
    torch_dtype=torch.bfloat16,
).to('cuda:0')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# System section placed at the start of every StableBeluga prompt
SYS_PROMPT = "### System:\nYou are StableBeluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"

In [10]:
def summarize_article(article_text, model, tokenizer, top_n_perc = .2):
    """
    Ask the chat model to summarize an article and time the whole call.

    Receives:
     - article_text: str: The text of a news article.
     - model: causal language model exposing .generate.
     - tokenizer: tokenizer matching the model.
     - top_n_perc: unused; kept so existing callers keep working.
    Returns:
     - (response, seconds): the decoded model output (prompt included, special
       tokens kept) and the elapsed wall-clock time in seconds.
    """
    start = time()
    message = """
    
    Please summarize this:
    
    {0}
    """.format(article_text)
    prompt = f"{SYS_PROMPT}### User: {message}\n\n### Assistant:\n"
    # Send the inputs to wherever the model lives rather than assuming the first GPU;
    # "cuda:0" remains the fallback for objects that do not expose .device.
    target_device = getattr(model, "device", "cuda:0")
    inputs = tokenizer(prompt, return_tensors="pt").to(target_device)
    with torch.no_grad():
        # NOTE(review): with do_sample=False decoding is presumably greedy, which would make
        # top_p/top_k inert; left untouched to keep the call identical - confirm.
        output = model.generate(**inputs, do_sample=False, top_p=0.95, top_k=0, max_new_tokens=250)
    response = tokenizer.decode(output[0], skip_special_tokens=False)
    end = time()
    return response, end-start

In [11]:
# Time StableBeluga on the same first 500 articles used for the BART timing
# (this reuses, and therefore overwrites, the `times` list)
times = []
for entry in validation_set[:500]:
    summary, secs = summarize_article(entry['article'], model, tokenizer)
    times.append(secs)



In [12]:
# StableBeluga took 3.75 seconds on average to produce a summary
np.mean(times)

3.7542019882202147

The lightweight BART model performs the summarization task about 3x faster than the LLM.