In this script we feed the news articles to `StableBeluge-7B` LLM in order to get the teacher summary observations that will be used throught the training phase of the lightweight BART model.

In [29]:
# requirements

#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install transformers -q
#!pip install sentence_transformers -q
#!pip install nltk -q

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import json
from time import time
from sentence_transformers import SentenceTransformer
import nltk
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from copy import deepcopy

In [2]:
#nltk.download('punkt')

In [2]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
0
NVIDIA GeForce RTX 4090


In [3]:
print("Available devices ", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f'Device {i}:', torch.cuda.get_device_name(i))

Available devices  2
Device 0: NVIDIA GeForce RTX 4090
Device 1: NVIDIA GeForce RTX 3060


In [4]:
print(torch.__version__)

2.0.1+cu118


In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    "stabilityai/StableBeluga-7B", 
    use_fast=True
)
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/StableBeluga-7B", 
    torch_dtype=torch.bfloat16,
    #low_cpu_mem_usage=True, 
    #device_map="auto"
)#.to("cuda")
model = model.to('cuda:0')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2").to('cuda:0')

In [8]:
torch.backends.cudnn.benchmark = True

In [9]:
SYS_PROMPT = "### System:\nYou are StableBeluga, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal.\n\n"

In [10]:
def print_progress_bar(iteration, total, bar_length=50):
    progress = float(iteration) / float(total)
    arrow = '=' * int(round(progress * bar_length) - 1)
    spaces = ' ' * (bar_length - len(arrow))

    print(f'Progress: [{arrow + spaces}] {int(progress * 100)}%', end='\r')
    
def get_most_relevant_sentences(article_text, embeddings_model, top_n_perc=.5):
    """
    Extract only the "top_n_perc" sentences of a text "article_text".
    Top n sentences are considered the sentences that are most similar to the whole text.
    It is kind of a simple extractive summarization.
    Steps:
     1. The embedding of the whole text is computed. 
     2. The whole text is broken into individual sentences.
     3 Embeddings of each individual sentence are computed.
     4. Cosine similarity of each individual sentences against the embedding of the whole text is computed.
     5. Get top n sentences (sentences that more closely resembele the idea of the whole text).
    Receives:
     - article_text: str: The text of a news article.
     - embeddings_model: object: The model that will be used for embeddings. Model should have a .encode functionality to compute the embeddings.
    Returns:
     - Text containing only the top n % most representative sentences of a text.
    """
    # compute embedding of the whole text
    whole_text_embedding = embeddings_model.encode(article_text, show_progress_bar=False)
    # break text in sentences
    sentences = nltk.sent_tokenize(article_text)
    # store the sentences in a DataFrame
    sentences_df = pd.DataFrame(sentences, columns=['sentence'])
    # compute embeddings of each sentence individually
    sentences_embeddings = embeddings_model.encode(sentences, show_progress_bar=False)
     
    # compute cosine similarities of the whole text vs each individual sentence
    cosine_sims = cosine_similarity(
        whole_text_embedding.reshape(1, -1), 
        sentences_embeddings
    )
    # store cosine similarities on a column of the DataFrame
    sentences_df['similarity'] = cosine_sims[0]
    sentences_df.reset_index(inplace=True)
    # n sentences tied to top_n_perc of the article
    top_n = round(len(sentences)*top_n_perc)
    # Top n percent sentences that capture the main idea, sorted by how they appear in the text
    most_relevant_sentences = sentences_df.sort_values(
        # sort by similarity
        by='similarity', 
        # most similars at the top
        ascending=False
    ).head(
        # top 20%
        top_n
    ).sort_values(
        # sort them back by how they appear in the original text
        by='index'
    )[
        # get senteces
        'sentence'
    ].values.tolist() # to python list
    return ' '.join(most_relevant_sentences)

def summarize_article(article_text, model, tokenizer, top_n_perc = .2):
    """
    Summarize a single article text by using Stable Beluga
    """
    start = time()
    message = """
    
    Please summarize this:
    
    {0}
    """.format(article_text)
    prompt = f"{SYS_PROMPT}### User: {message}\n\n### Assistant:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
    token_length = inputs["input_ids"].shape[1]
    if token_length > 4096:
        #print("token lengths were greater than 4096, summarizing the most relevant sentences")
        # summarize only the most importante sentences of the article
        most_relevant_pieces = get_most_relevant_sentences(
            article_text, 
            embedding_model, 
            top_n_perc = top_n_perc
        )
        summary = summarize_article(most_relevant_pieces, model, tokenizer, top_n_perc = .1)
        return summary
    with torch.no_grad():
        output = model.generate(**inputs, do_sample=False, top_p=0.95, top_k=0, max_new_tokens=150)
    response = tokenizer.decode(output[0], skip_special_tokens=False)
    end = time()
    #print(end-start)
    return response, end-start

def summarize_news_articles(news_articles_to_summarize, already_summarized_news_articles, model, tokenizer):
    """
    Trigger the batch summarization process for all news articles
    """
    total_len = len(news_articles_to_summarize)
    for i, art in enumerate(news_articles_to_summarize):
        this_art = deepcopy(art)
        output, time_taken = summarize_article(
            this_art.get('content_en'), 
            model, 
            tokenizer
        )
        assitant_ix = output.find("### Assistant:")
        summary_ix = assitant_ix+len("### Assistant:\n ")
    
        summary = output[summary_ix:]
        this_art['summary'] = summary
        already_summarized_news_articles += [this_art]
        
        if i%100 == 0:
            with open('datasets/summarized_news.json', 'w') as f:
                json.dump(already_summarized_news_articles, f)

        if i%10 == 0:
            print_progress_bar(iteration=i, total=total_len)
            torch.cuda.empty_cache()
    with open('datasets/summarized_news.json', 'w') as f:
        json.dump(already_summarized_news_articles, f)

In [11]:
# load the translated news articles
path_file = 'datasets/translated_news.json'
with open(path_file, 'r') as jfile:
    news_articles = json.load(jfile)

In [12]:
# sofar summarized news
path_file = 'datasets/summarized_news.json'
with open(path_file, 'r') as jfile:
    summarized_news_articles = json.load(jfile)

In [13]:
summaries_df = pd.DataFrame.from_dict(summarized_news_articles)

already_summarized = set(summaries_df['h1'])

all_news_df = pd.DataFrame.from_dict(news_articles)

all_news_titles = set(all_news_df['h1'])

pending_summary = all_news_titles - already_summarized

articles_pending_summary_df = all_news_df[all_news_df['h1'].isin(pending_summary)]

articles_pending_summary = articles_pending_summary_df.to_dict(orient='records')

In [14]:
len(already_summarized)

286120

In [15]:
len(articles_pending_summary)

6305

In [16]:
len(already_summarized)/(len(already_summarized)+len(articles_pending_summary))

0.9784389159613576

In [17]:
summarize_news_articles(
    news_articles_to_summarize=articles_pending_summary, 
    already_summarized_news_articles=summarized_news_articles,
    model=model, 
    tokenizer=tokenizer
)

Progress: [===                                               ] 7%

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.




Token indices sequence length is longer than the specified maximum sequence length for this model (10985 > 4096). Running this sequence through the model will result in indexing errors




In [19]:
len(summarized_news_articles)

295174

In [20]:
summarized_news_articles[145326]

{'h1': 'Revocación de mandato es posible gracias al INE, presume Lorenzo Córdova\xa0',
 'h2': 'El consejero presidente del INE, también desmintió que el Instituto se niegue a promover la consulta así como debates sobre las distintas posturas sobre el tema\xa0\r\n',
 'h3': 'Más Información',
 'date': '27/02/2022 16:58',
 'author': 'Redacción ',
 'content': 'Tras rechazar que el Instituto Nacional Electoral ( INE ) esté en contra del proceso de revocación de mandato , su presidente, Lorenzo Córdova Vianello, aseguró que este inédito ejercicio de democracia participativa es posible gracias al órgano electoral. “Es falso que el INE pretenda obstaculizar la revocación de mandato; todo lo contrario: si la revocación de mandato va es gracias al INE y a las miles de personas que están siendo capacitadas para instalar las casillas, recibir y contar el voto de sus vecinos y vecinas el próximo 10 de abril”, subrayó en un video publicado en las redes sociales. Lorenzo Córdova recordó que es una fa