In [1]:
import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_from_disk
dataset = load_from_disk("cord_19_dataset_for_train")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['filename', 'Article', 'Summary', '__index_level_0__'],
        num_rows: 81506
    })
    validation: Dataset({
        features: ['filename', 'Article', 'Summary', '__index_level_0__'],
        num_rows: 27169
    })
    test: Dataset({
        features: ['filename', 'Article', 'Summary', '__index_level_0__'],
        num_rows: 27169
    })
})


In [3]:
testdataset = dataset["test"]

In [4]:
testdataset.column_names

['filename', 'Article', 'Summary', '__index_level_0__']

In [5]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
model_name="./bertsum_cord19/bertsum_finetuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,  # Binary classification per sentence
    problem_type="multi_label_classification"
)

In [7]:
import torch

def generate_summary(model, tokenizer, text, device):
    model.eval()
    
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move inputs to the correct device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        output = model(**inputs)  # Forward pass on MPS
        logits = output.logits.squeeze(-1)  # Extract logits
        
        # Ensure logits are moved to CPU before processing
        logits = logits.cpu()

        # Select sentences using thresholding
        predicted_labels = (logits > 0.5).int()
        
        sentences = text.split(". ")  # Sentence tokenization
        min_length = min(len(sentences), len(predicted_labels))
        # print("Logits:", logits)
        # print("Predicted Labels:", predicted_labels)
        # print("Sentences:", sentences)
        selected_sentences = [sentences[i] for i in range(min_length) if predicted_labels[i] == 1]        
        summary = " ".join(selected_sentences)
        return summary

# Detect MPS device on Mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)  # Move model to MPS


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Example Usage:
text = """
Jamieson issues warning to bigots\n\nScotland's justice minister has warned bigoted soccer fans that she wants to hit them "where it hurts most" by banning them from matches.\n\nCathy Jamieson said exclusion orders are one of a series of measures being considered in the Scottish Executive campaign against sectarianism. She praised Celtic and Rangers for their work in tackling the problem. However, the minister said stopping sectarian abuse associated with Old Firm matches is a key objective. Ms Jamieson was speaking ahead of the third round Scottish Cup clash between the Glasgow clubs at Parkhead on Sunday. The sectarianism long associated with sections of the support from both clubs has become a significant target for the executive. Last week Ms Jamieson and First Minister Jack McConnell met supporters' representatives from both clubs to discuss the issue.\n\nThey plan to hold an anti-sectarian summit next month with officials from the clubs, church leaders, senior police officers and local authority chiefs among those to be invited. Speaking on BBC Radio Scotland's Sunday Live programme, Ms Jamieson described Friday's meeting as "very productive" and said putting the squeeze on the bigots would be a key aim. Ms Jamieson stressed that sectarianism has not been confined to football but it can act as a "trigger" for tensions and violence. Clubs have taken action in the past to ban troublesome fans and supporters' groups expressed their desire to ensure that the game is no longer tainted by the problem.\n\nMs Jamieson said the executive should have a role in tackling the soccer troublemakers. She said: "We can't get away from the fact that in some instances some of the religious hatred that some people try to associate with football boils over into violence. "That is the kind of thing we want to stop and that's the kind of thing supporters' groups are very clear they don't want to be part of either, and they will work with us to try and deal with that."\n\nMs Jamieson praised the police for their action and said: "The police do want to identify whether there are particular individuals who are going over the top and inciting hatred or violence - they will crack down very effectively on them. "We have of course already indicated that we will consider the introduction of banning orders to give additional powers to where there are people who are going over the top, who have made inappropriate behaviour at football matches, to be able to stop them attending the games. "That's the kind of thing that will hit those kind of people where it hurts the most in not allowing them to attend the games," she said. Praising Celtic and Rangers for their efforts, she said: "I don't think there is any doubt that we have seen some positive moves from the clubs. "Both Rangers and Celtic football clubs have been involved in working with the executive to produce, for example, an educational pack for  """
summary = generate_summary(model, tokenizer, text, device)
print("Generated Summary:", summary)


Generated Summary: 

Scotland's justice minister has warned bigoted soccer fans that she wants to hit them "where it hurts most" by banning them from matches.

Cathy Jamieson said exclusion orders are one of a series of measures being considered in the Scottish Executive campaign against sectarianism


In [9]:
testdf=testdataset.to_pandas()

In [10]:
testdf['generated_summary'] = testdf['Article'].apply(lambda x:str(generate_summary(model, tokenizer, x, device)))

In [11]:
testdf["generated_summary"].value_counts()

generated_summary
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   165
The online version contains supplementary material available at https:// doi                                                   

In [12]:
testdf.replace(r'^\s*$', np.nan, regex=True)
testdf.dropna(axis=0, how='any')
# Remove rows where "Title" is empty or null
testdf = testdf[testdf["generated_summary"].notna() & (testdf["generated_summary"] != "")]
len(testdf)

27004

In [13]:
testdf=testdf.drop_duplicates(subset=["generated_summary"])
testdf['generated_summary'].value_counts()

generated_summary
The identification of outbreaks caused by the SARS-CoV-2 BA.1 Omicron VOC demonstrated the high transmissibility and immune evasion of such a variant [9]                                                                                                                                                                                                                                           1
In this single-center analysis, we found a similar incidence of IPA as superinfection of influenza and COVID-19                                                                                                                                                                                                                                                                                     1
The presence of these divergent directions that the fertility behaviour of existing couples may take, as well as the previously mentioned birth effects of delayed marriages, will ultimately determine th

In [14]:
from rouge import Rouge

rouge = Rouge()

def truncate_text(text, max_words=100):
    return " ".join(text.split()[:max_words])

def compute_rouge(reference, generated):
    if not generated.strip():  # Handle empty strings
        return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    
    # Truncate long summaries
    reference = truncate_text(reference)
    generated = truncate_text(generated)
    
    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Apply function
testdf["rouge_scores"] = testdf.apply(lambda row: compute_rouge(row["Summary"], row["generated_summary"]), axis=1)



In [15]:
testdf.head()

Unnamed: 0,filename,Article,Summary,__index_level_0__,generated_summary,rouge_scores
0,d42ab1c58711535d40ca370ff9461c03bfd79f25.json,The identification of outbreaks caused by the ...,The identification of outbreaks caused by the ...,66286,The identification of outbreaks caused by the ...,"{'rouge-1': {'r': 0.2972972972972973, 'p': 1.0..."
1,12524fc4d3d52cb18c5bf886178b8b88d44e4bd0.json,Woohyun Jung https://orcid.org/0000-0002-4980-...,Woohyun Jung https://orcid.org/0000-0002-4980-...,162636,Woohyun Jung https://orcid.org/0000-0002-4980-...,"{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999..."
2,e519e407b1ce798d6110f679370afa416e569633.json,"That said, the present research allowed an ass...","That said, the present research allowed an ass...",35453,"That said, the present research allowed an ass...","{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999..."
3,ddf88f7f1f74746cc887ff14e002b8cded71551b.json,"Since for any constant c > 0, the infinite sum","Since for any constant c > 0, the infinite sum",39077,"Since for any constant c > 0, the infinite sum","{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999..."
4,532cdf96996c52c7eb5e83399591daf3dfe8db5c.json,The authors declare consent for publication of...,The authors declare consent for publication of...,5042,The authors declare consent for publication of...,"{'rouge-1': {'r': 1.0, 'p': 1.0, 'f': 0.999999..."


In [16]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-l"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.6048


In [17]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-2 Score: 0.5530


In [18]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-1 Score: 0.6063


In [21]:
 testdf["article_len"] = testdf.apply(lambda row: len(row["Article"]), axis=1)
 filtered_df = testdf[testdf["article_len"] >= 1500]

In [22]:
len(filtered_df)

2308

In [23]:
rouge_l_scores = [score["rouge-l"]["f"] for score in filtered_df["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.3186
