In this script we prepare the data for the BART fine-tuning task by making sure that the tokenized articles and summaries don't exceed 1024 tokens.

In [1]:
import json
import pandas as pd
import torch
import numpy as np
import time
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, Dataset
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Name of the pretrained BART checkpoint whose tokenizer is loaded here.
model_name = "facebook/bart-large"  # or another BART variant
# Tokenizer matching the checkpoint above; later cells use it to count the
# tokens of every article and summary.
tokenizer = BartTokenizer.from_pretrained(model_name)

In [3]:
# Sentence-embedding model used to rank sentences when a text has to be shortened.
# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook does not crash on machines without CUDA.
embedding_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=embedding_device)

In [4]:
def print_progress_bar(iteration, total, bar_length=50):
    """
    Print a one-line text progress bar that overwrites itself in place.
    Receives:
     - iteration: number: Amount of work done so far (converted with float()).
     - total: number: Total amount of work (converted with float()).
     - bar_length: int: Width of the bar in characters.
    Returns:
     - None. The bar is written to stdout, terminated by a carriage return
       so that the next call redraws the same line.
    """
    total = float(total)
    # An empty workload has nothing left to do: show it as complete instead of dividing by zero.
    progress = float(iteration) / total if total > 0 else 1.0
    # Clamp so that out-of-range iterations cannot overflow or underflow the bar.
    progress = min(max(progress, 0.0), 1.0)
    # Number of filled cells; at 100% the whole bar is filled.
    filled = int(round(progress * bar_length))
    arrow = '=' * filled
    spaces = ' ' * (bar_length - filled)

    print(f'Progress: [{arrow + spaces}] {int(progress * 100)}%', end='\r')
    
def finish_summary(summary_str):
    """
    Trim a summary so that it ends on its last complete sentence.
    Everything after the last period is dropped, because it is assumed to be
    a sentence that the LLM did not get to finish.
    Receives:
     - summary_str: str: The summary text, possibly cut off mid-sentence.
    Returns:
     - The text up to and including its last period. If the text contains no
       period at all it is returned unchanged, since there is no complete
       sentence to fall back to and discarding everything would lose the summary.
    """
    complete_part, period, _unfinished = summary_str.rpartition('.')
    if not period:
        # no sentence boundary found: keep the text rather than reducing it to "."
        return summary_str
    return complete_part + period

def get_most_relevant_sentences(article_text, embeddings_model, top_n_perc = .2):
    """
    Extract only the "top_n_perc" sentences of a text "article_text".
    Top n sentences are considered the sentences that are most similar to the whole text.
    It is kind of a simple extractive summarization.
    Steps:
     1. The whole text is broken into individual sentences.
     2. The number of sentences to keep is derived from "top_n_perc".
     3. The embedding of the whole text is computed.
     4. Embeddings of each individual sentence are computed.
     5. Cosine similarity of each individual sentence against the embedding of the whole text is computed.
     6. The top n sentences (the ones that most closely resemble the idea of the whole text)
        are returned in the order in which they appear in the text.
    Receives:
     - article_text: str: The text of a news article.
     - embeddings_model: object: The model that will be used for embeddings. Model should have a
       .encode functionality to compute the embeddings.
     - top_n_perc: float: Fraction of the sentences to keep. The resulting sentence count is rounded
       and clamped to the range [0, number of sentences].
    Returns:
     - Text containing only the top n % most representative sentences of a text, joined by single
       spaces. An empty string is returned when the text has no sentences or when the fraction
       rounds down to zero sentences.
    """
    # break text in sentences
    sentences = nltk.sent_tokenize(article_text)
    # n sentences tied to top_n_perc of the article, clamped so that a negative or
    # larger-than-one fraction cannot select a nonsensical number of sentences
    top_n = min(len(sentences), max(0, int(round(len(sentences) * top_n_perc))))
    if top_n == 0:
        # nothing to keep (empty text or tiny fraction): skip the embedding work,
        # which would otherwise be asked to compare against zero sentences
        return ''

    # compute embedding of the whole text
    # NOTE(review): sentence-transformers models presumably truncate inputs longer than their
    # maximum sequence length, so for long articles this embedding may only reflect the
    # beginning of the text - confirm this is acceptable.
    whole_text_embedding = embeddings_model.encode(article_text, show_progress_bar=False)
    # compute embeddings of each sentence individually
    sentences_embeddings = embeddings_model.encode(sentences, show_progress_bar=False)

    # cosine similarity of the whole text vs each individual sentence (one value per sentence)
    similarities = cosine_similarity(
        whole_text_embedding.reshape(1, -1),
        sentences_embeddings
    )[0]

    # positions of the sentences from most to least similar; the stable sort makes
    # ties resolve deterministically in favour of the earlier sentence
    ranked_positions = np.argsort(-similarities, kind='stable')
    # keep the top n and put them back in the order they appear in the original text
    kept_positions = sorted(ranked_positions[:top_n])
    return ' '.join(sentences[position] for position in kept_positions)

# Load Data Set
The dataset consists of 295,174 news articles scraped from a Mexican newspaper, each paired with its summary. Summaries were created using `StableBeluga-7B` as the teacher. I left the LLM running for several days (weeks) in order to get all the summaries.

Relevant Columns:
- `h1`: Title of the news article. It is used as the key to merge with the news article information.
- `date`: Date and time when the news article was published.
- `author`: Name of the author who published the article.
- `content`: Article text in Spanish.
- `h1_en`: Article title translated to english.
- `content_en`: Article text translated to English.
- `summary`: Summary generated by `StableBeluga-7B`

In [5]:
# Load the StableBeluga summaries data set.
# The encoding is given explicitly: the file holds Spanish text with accented
# characters, and relying on the platform default encoding can fail or garble it.
with open('datasets/summarized_news.json', "r", encoding="utf-8") as f:
    summaries = json.load(f)

In [6]:
# Turn the loaded JSON object into a DataFrame so it can be inspected and
# processed column by column in the following cells.
summs_df = pd.DataFrame.from_dict(summaries)

In [7]:
# Display the (rows, columns) size of the loaded data set.
summs_df.shape

(295174, 12)

In [8]:
# Preview the first five rows of the data set.
summs_df.head(5)

Unnamed: 0,h1,h2,h3,date,author,content,h1_en,h2_en,content_en,content_len,summary,article_summary_similarity
0,Cancelan “Noche de Rábanos” en Oaxaca por ries...,"A través de un video, el gobernador Alejandro ...",Más Información,23/12/2021 09:09,Fernando Miranda / Corresponsal,.– Ante el aumento de casos de Covid-19 y en ...,“Night of Radishes” Canceled in Oaxaca by Risk...,"Through a video, Governor Alejandro Murat repo...",.– In view of the increase in Covid-19 cases a...,,The government of Oaxaca cancelled the traditi...,0.834634
1,"Pepenadores se casan en relleno sanitario, don...",Con apoyo de las autoridades de Ciudad Victori...,Más Información,17/12/2019 00:02,Redacción El Universal,Jesús Gallegos y Juana Martínez se conocieron ...,"Pepenadores get married in sanitary filling, w...",With the support of the authorities of Ciudad ...,Jesús Gallegos and Juana Martínez met five yea...,,Jesús Gallegos and Juana Martínez met five yea...,0.769818
2,Ladrón se mete a casa en Coyoacán de diputada ...,La legisladora Edna Laura Huerta Ruiz dijo a l...,Más Información,29/03/2021 14:41,Redacción,"La diputada federal por Morena, Edna Laura Hue...",Thief enters home in Coyoacán as federal deput...,Legislator Edna Laura Huerta Ruiz told the aut...,"The federal deputy for Morena, Edna Laura Huer...",,Federal deputy Edna Laura Huerta Ruiz was a vi...,0.718933
3,Automovilistas evitan atraco y tunden a golpes...,Los conductores que atestiguaron un robo sobre...,Más Información,15/09/2021 09:31,Redacción El Universal,Tras asaltar a punta de pistola a tres automo...,Motorists avoid robbery and beat up thief in I...,The drivers who testified to a robbery over th...,"After assaulting three motorists at gunpoint, ...",,A repeat thief who assaulted three motorists a...,0.819494
4,Mancera ve condiciones para que Senado apruebe...,El coordinador del PRD reconoció que la resolu...,Más Información,12/05/2022 16:41,Redacción,El senador Miguel Ángel Mancera aseguró que h...,Mancera sees conditions for Senate to approve ...,The PRD coordinator acknowledged that the Cour...,Senator Miguel Ángel Mancera said that there a...,,Senator Miguel Ángel Mancera has said that the...,0.85737


In [25]:
# Pick one random row to eyeball an article next to its summary.
# The upper bound is the actual number of rows, so every row can be drawn and
# the lookup cannot run past the end of a smaller data set.
rand_ix = np.random.randint(len(summs_df))
sample_row = summs_df.iloc[rand_ix]
cont = sample_row['content_en']
summ = sample_row['summary']

In [26]:
# Show the sampled article and then its summary, separated by a dashed rule.
print(
    f'Content: \n{cont}',
    '---------',
    f'Summary: \n{summ}',
    sep='\n',
)

Content: 
Today is July 21st and that means that Dog Day finally arrived. In Mexico, at least 43 million households have a dog as a pet, according to data from the National Institute of Statistics and Geography (INEGI) In addition, it is estimated that 9 out of 10 people make a trip with their dog instead of doing it alone. So, to enjoy this day walking with your best friend in the car, we leave you here 5 tips for your can enjoy the trip as much as you with your company: Feeding your dog minutes before leaving is not recommended, as it is likely that you can suffer from dizziness along the way. The best thing would be to feed it about 4 hours before leaving, so that it has enough energy and avoid problems in the car. Read also: How to remove the smell of cigarette from the car On trips of a considerable distance, it is advisable to make some stops every couple of hours to help your dog destress from the movement and hustle of the car. Give him a break, take him for a few laps to stret

In [27]:
# Some summaries end abruptly because of the `max_new_tokens` constraint of the LLM.
# Trim each one back to its last complete sentence and store it in a new column.
summs_df['complete_summary'] = summs_df['summary'].apply(finish_summary)

In [28]:
# Keep only the English article text and its trimmed summary, under the
# shorter column names used by the rest of the notebook.
column_renames = {'content_en': 'article', 'complete_summary': 'summary'}
summs_df = summs_df[list(column_renames)].rename(columns=column_renames).copy()

In [None]:
# Articles and summaries may exceed 1024 tokens (BART-large hard limit) after tokenization.
# In those cases the text is shortened by extracting its most relevant sentences; the
# shortened texts are collected so they can be used as the model input and the true label.
# See the function `get_most_relevant_sentences` defined above for more details on how
# the text shortening is performed.
MAX_BART_TOKENS = 1024


def _shorten_to_token_limit(text, max_tokens=MAX_BART_TOKENS):
    """
    Shorten "text" until its tokenized length no longer exceeds "max_tokens".
    The text is tokenized with the notebook's `tokenizer`; while it is too long, only its
    most relevant sentences are kept, using a fraction that starts at 90% and drops by
    10 percentage points on every further pass.
    Receives:
     - text: str: Article or summary text.
     - max_tokens: int: Maximum number of token ids allowed.
    Returns:
     - The (possibly shortened) text. A text that is still too long after the 10% pass
       goes through a 0% pass, which reduces it to an empty string.
    """
    # The fraction is tracked in whole tenths so that repeated subtraction cannot
    # accumulate floating point error (0.9 - 0.1 - 0.1 is not exactly 0.7).
    tenths_to_keep = 9
    while len(tokenizer(text)['input_ids']) > max_tokens:
        # NOTE(review): every pass ranks the already shortened text, so the kept share
        # compounds (0.9 x 0.8 x ...) rather than being a share of the original text -
        # confirm this is the intended behaviour.
        text = get_most_relevant_sentences(
            text,
            embedding_model,
            top_n_perc=max(tenths_to_keep, 0) / 10
        )
        tenths_to_keep -= 1
    return text


shortened_articles = []
shortened_summaries = []
n_rows = summs_df.shape[0]
# Iterate over the two columns directly (cheaper than building a Series per row) and
# count rows by position, so progress does not depend on the DataFrame's index labels.
for position, (article, summary) in enumerate(
    zip(summs_df['article'], summs_df['summary']), start=1
):
    # article text that does not exceed the token limit
    shortened_articles.append(_shorten_to_token_limit(article))
    # StableBeluga summary that does not exceed the token limit
    shortened_summaries.append(_shorten_to_token_limit(summary))
    # report the number of rows completed so far
    print_progress_bar(iteration=position, total=n_rows, bar_length=50)

In [None]:
# Attach the shortened articles and summaries as two new columns.
summs_df = summs_df.assign(
    short_art=shortened_articles,
    short_summ=shortened_summaries,
)

In [None]:
# Keep only the shortened texts and give them the final column names
# expected in the exported data set.
summs_df = summs_df[['short_art', 'short_summ']].copy()
summs_df.columns = ['article', 'summary']

In [None]:
# Write the prepared article/summary pairs to disk as JSON, one record per row.
summs_df.to_json('BART_data_set.json', orient='records')