In [1]:
from pymongo import MongoClient
from tqdm.notebook import tqdm

## Connect to MongoDB

In [2]:
# Open a client against the local MongoDB server and select the database
# that holds the articles.
mongo_client = MongoClient(host='localhost', port=27017)
db = mongo_client["scientific_articles"]

## Get all the articles to summarize

In [3]:
# Select only the documents that have no "summary" field yet and
# materialize the cursor into a list.
missing_summary_filter = {"summary": {"$exists": False}}
articles_json = list(db["articles"].find(missing_summary_filter))

Load the articles into a pandas DataFrame

In [4]:
import pandas as pd

# One (link, body, summary) tuple per article; the summary starts out empty.
rows = [(paper["link"], paper["body"], "") for paper in tqdm(articles_json)]

df = pd.DataFrame(rows, columns=["link", "body", "summary"])

# Drop the reference to the raw MongoDB documents now that they are in `df`.
articles_json = []

  0%|          | 0/12769 [00:00<?, ?it/s]

## Instantiate BART

In [5]:
from transformers import BartForConditionalGeneration, AutoTokenizer

# Checkpoint shared by the model and its tokenizer.
model_ckpt = "sshleifer/distilbart-cnn-6-6"

# The two loads are independent of each other; both read from `model_ckpt`.
model = BartForConditionalGeneration.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

## Summarize Docs

Tokenize the docs

In [7]:
import torch
from tqdm import tqdm
import math

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize every article body into PyTorch tensors and move each encoding to
# `device`. Despite its name, `input_ids` holds one full tokenizer output per
# document (it is indexed with 'input_ids' and 'attention_mask' further down).
input_ids = []
for doc in tqdm(df["body"].to_list()):
    encoded_doc = tokenizer(doc, padding='max_length', return_tensors='pt')
    input_ids.append(encoded_doc.to(device))



Split the tokens of each document into pages of at most 1024 tokens

In [None]:
max_size = 1024


def _split_into_pages(encoding, page_size):
    """Cut one tokenized document into consecutive pages of at most `page_size` tokens.

    Args:
        encoding: tokenizer output for a single document; its "input_ids" and
            "attention_mask" entries are tensors whose first row is used.
        page_size: maximum number of tokens per page.

    Returns:
        A list of dicts with "input_ids" and "attention_mask" tensors of shape
        (1, n) where 1 <= n <= page_size. The list is empty when the document
        has no tokens.
    """
    token_ids = encoding["input_ids"][0]
    mask = encoding["attention_mask"][0]

    pages = []
    for start in range(0, len(token_ids), page_size):
        # Slicing clamps at the end of the tensor, so the last page is simply
        # whatever is left. The previous `len % max_size` end bound produced an
        # EMPTY last page whenever the length was an exact multiple of the page
        # size (e.g. every document padded to exactly 1024 tokens).
        stop = start + page_size
        pages.append(
            {
                # Plain slices are views on the original tensors; nothing below
                # mutates them, so no copy via torch.tensor(...) is needed.
                "input_ids": token_ids[start:stop].unsqueeze(0),
                "attention_mask": mask[start:stop].unsqueeze(0),
            }
        )
    return pages


# One list of pages per document, in the same order as `input_ids`.
documents_tokenized = [_split_into_pages(input_id, max_size) for input_id in input_ids]

Summarize each page of up to 1024 tokens into a summary of 16 to 64 tokens

In [None]:
# The tokenized inputs were moved to `device`, so the model has to live on the
# same device; otherwise `generate` fails with a device mismatch when CUDA is
# available.
model.to(device)

summaries = []

# A single progress bar over documents; a nested bar per document would flood
# the cell output.
for doc_tokenized in tqdm(documents_tokenized):
    doc_summary = []
    for page in doc_tokenized:
        # A page without tokens cannot be summarized; skip it instead of crashing.
        if page["input_ids"].shape[-1] == 0:
            continue
        generated = model.generate(
            input_ids=page["input_ids"].to(device),
            attention_mask=page["attention_mask"].to(device),
            min_length=16,
            max_length=64,
        )
        # Keep finished summaries on the CPU so they do not accumulate on the GPU.
        doc_summary.append(generated.cpu())
    # One list of generated token tensors per document, same order as the input.
    summaries.append(doc_summary)

Concatenate the summaries obtained

In [None]:
text_summaries = []

# `summaries` was built in the same order as the rows of `df`, so each summary
# can be paired with the link of the article it belongs to.
assert len(summaries) == len(df), "expected exactly one summary per article"

for link, summary in zip(df["link"], summaries):
    text_summary = ""

    for split in summary:
        extracted_summary = tokenizer.decode(split[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

        # Generation is capped at a fixed length and may stop mid-sentence:
        # keep everything up to and including the last period, or the whole
        # text when it contains no period at all.
        complete_part, period, _ = extracted_summary.rpartition(".")
        if period:
            text_summary += complete_part + period + "\n"
        else:
            text_summary += extracted_summary + "\n"

    text_summaries.append(text_summary)

    # Store the summary on the matching article. PyMongo's method is
    # `update_one(filter, update)`; `updateOne` is the mongo shell spelling.
    # NOTE(review): assumes "link" identifies a single article; confirm, or
    # carry "_id" through `df` and filter on that instead.
    db.articles.update_one({"link": link}, {"$set": {"summary": text_summary}})