Load in the tokenized sentences

In [7]:
import pandas as pd
from tqdm import tqdm

# Load the original CSV file
df = pd.read_csv('../sent.csv')

Concatenate them based on the paper_id so that each entry represents a single document.

In [17]:
# Concatenate sentences based on 'paper_id'
concatenated_df = df.groupby('paper_id')['text'].apply(lambda x: ' '.join(x)).reset_index()

In [4]:
concatenated_df.to_csv('../concatenated.csv', index=False)

Load BERTopic model for topic extraction

In [8]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired




In [18]:
# Each entry in the concatenated df is a paper
documents = concatenated_df['text'].tolist()

In [9]:
model = BERTopic(representation_model=KeyBERTInspired())

Extract the topics from the documents (this takes about 10 min)

In [34]:
# Fit the model and transform your data into topics
topics, _ = model.fit_transform(documents)

In [11]:
# Get the most frequent topics
most_frequent_topics = model.get_topic_freq()

In [36]:
print(most_frequent_topics)

     Topic  Count
6       -1   3583
11       0    353
18       1    294
30       2    247
29       3    177
..     ...    ...
37     151     11
92     152     11
156    153     11
66     154     10
145    155     10

[157 rows x 2 columns]


In [37]:
model.visualize_topics()

In [38]:
# Save the model
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
model.save("../BERTopic_model", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

In [10]:
model = BERTopic.load("../BERTopic_model")

Topic labels

In [38]:
# Create a list of labels for each topic
labels = model.generate_topic_labels(topic_prefix=False)

In [30]:
for label in labels:
    print(label)

pandemic_covid19_infection
pandemic_coronavirus_covid19
coronavirus_coronaviruses_covid19
classroom_students_educational
aerosol_aerosols_respiratory
publication_disclosure_disclose
diagnostic_assays_pcr
influenza_flu_pandemic
oncology_cancer_covid19
pandemic_covid19_anxiety
antiviral_covid19_hiv
respiratory_ventilatory_ventilator
oder_welche_eines
vaccine_vaccination_vaccines
telemedicine_telehealth_teledermatology
ai_classification_recognition
cytokines_inflammation_cytokine
pneumonia_lung_pulmonary
encephalitis_encephalopathy_neurological
pandemic_markets_economic
thromboprophylaxis_covid19_anticoagulation
antimicrobial_antibiotics_pathogens
metabolites_medicinal_extracts
proteins_protein_enzymes
myocarditis_myocardial_cardiac
rhinovirus_influenza_asthma
epidemic_outbreak_infectious
tweets_twitter_disinformation
dental_dentistry_dentist
pandemic_hospital_hospitals
loneliness_aging_depression
sustainability_sustainable_emissions
biosensors_biosensor_nanoparticles
pregnancy_pregnant_i

In [37]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3583,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",
1,0,353,0_pandemic_coronavirus_covid19_epidemic,"[pandemic, coronavirus, covid19, epidemic, out...",
2,1,294,1_coronavirus_coronaviruses_covid19_sarscov2,"[coronavirus, coronaviruses, covid19, sarscov2...",
3,2,247,2_classroom_students_educational_teaching,"[classroom, students, educational, teaching, e...",
4,3,177,3_aerosol_aerosols_respiratory_ventilation,"[aerosol, aerosols, respiratory, ventilation, ...",
...,...,...,...,...,...
152,151,11,151_coronavirus_covid19_pneumonia_infection,"[coronavirus, covid19, pneumonia, infection, c...",
153,152,11,152_pneumonia_infections_pathogens_infection,"[pneumonia, infections, pathogens, infection, ...",
154,153,11,153_vaccines_vaccine_vaccination_vaccinated,"[vaccines, vaccine, vaccination, vaccinated, i...",
155,154,10,154_flaviviruses_flavivirus_zika_dengue,"[flaviviruses, flavivirus, zika, dengue, viral...",


In [54]:
type(model.get_topic_info())

pandas.core.frame.DataFrame

In [39]:
model.get_document_info(documents).sort_values(by='Topic')

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document
3484,severe acute respiratory syndrome SARS is an a...,-1,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",,pandemic - covid19 - infection - disease - vir...,False
2460,the COVID-19 pandemic has dramatically changed...,-1,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",,pandemic - covid19 - infection - disease - vir...,False
7952,what do you do when your friends are starving ...,-1,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",,pandemic - covid19 - infection - disease - vir...,False
8918,n urse leaders NLs play a pivotal role in shap...,-1,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",,pandemic - covid19 - infection - disease - vir...,False
4847,there are some theories such as the Health Bel...,-1,-1_pandemic_covid19_infection_disease,"[pandemic, covid19, infection, disease, virus,...",,pandemic - covid19 - infection - disease - vir...,False
...,...,...,...,...,...,...,...
7541,to the Editor: The clinical impact of COVID-19...,155,155_covid19_fever_pneumonia_infection,"[covid19, fever, pneumonia, infection, comorbi...",,covid19 - fever - pneumonia - infection - como...,False
3028,coronavirus disease 2019 COVID- 19 has had a c...,155,155_covid19_fever_pneumonia_infection,"[covid19, fever, pneumonia, infection, comorbi...",,covid19 - fever - pneumonia - infection - como...,False
7647,the novel coronavirus disease 2019 COVID-19 ca...,155,155_covid19_fever_pneumonia_infection,"[covid19, fever, pneumonia, infection, comorbi...",,covid19 - fever - pneumonia - infection - como...,False
6337,covid-19 infection virus mainly infects elderl...,155,155_covid19_fever_pneumonia_infection,"[covid19, fever, pneumonia, infection, comorbi...",,covid19 - fever - pneumonia - infection - como...,False


In [35]:
document_info = model.get_document_info(documents).sort_values(by='Topic')
document_info = document_info.reset_index()

In [36]:
print(document_info)

      index                                           Document  Topic  \
0      3484  severe acute respiratory syndrome SARS is an a...     -1   
1      2460  the COVID-19 pandemic has dramatically changed...     -1   
2      7952  what do you do when your friends are starving ...     -1   
3      8918  n urse leaders NLs play a pivotal role in shap...     -1   
4      4847  there are some theories such as the Health Bel...     -1   
...     ...                                                ...    ...   
9995   7541  to the Editor: The clinical impact of COVID-19...    155   
9996   3028  coronavirus disease 2019 COVID- 19 has had a c...    155   
9997   7647  the novel coronavirus disease 2019 COVID-19 ca...    155   
9998   6337  covid-19 infection virus mainly infects elderl...    155   
9999   5726  cOVID-19, caused by novel corona virus of seve...    155   

                                       Name  \
0     -1_pandemic_covid19_infection_disease   
1     -1_pandemic_covid19_inf

In [64]:
from transformers import pipeline
from bertopic import BERTopic
import pandas as pd
from tqdm import tqdm

In [96]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def extractive_summarization(text, num_sentences=3):
    """
    Summarize text using LexRank algorithm for extractive summarization.

    Parameters:
    - text: The input text to summarize.
    - num_sentences: The number of sentences to include in the summary.

    Returns:
    - The extractive summary of the text as a string.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()

    # Summarize the text with the specified number of sentences
    summary = summarizer(parser.document, num_sentences)

    # Convert the summary sentences back into a single string
    summary_text = " ".join([str(sentence) for sentence in summary])
    return summary_text


In [87]:
def get_relevant_sentences(model, documents):
    """
    Retrieve the relevant sentences for each topic in the documents.

    Parameters:
    - model: The BERTopic model.
    - documents: The list of documents.

    Returns:
    - The list of relevant sentences for each topic.
    """

    # Load the document-topic distribution and topic info
    document_info = model.get_document_info(documents)
    # Get topic information, including the 'representation' column for keywords
    topic_info = model.get_topic_info()

    topic_sentences = []

    for index, topic_row in tqdm(topic_info.iterrows(), total=topic_info.shape[0]):
        topic_id = topic_row['Topic']
        
        # Relevant keywords for the topic
        keywords = topic_row['Representation']

        # Identifying documents for the topic
        relevant_docs_indices = document_info[document_info['Topic'] == topic_id]
        relevant_docs = [documents[i] for i in relevant_docs_indices.index]

        # Extract sentences containing the keywords
        relevant_sentences = []
        for doc in relevant_docs:
            for sentence in doc.split('.'):
                if any(keyword.lower() in sentence.lower() for keyword in keywords):
                    relevant_sentences.append(sentence)

        concatenated_sentences = ' '.join(relevant_sentences)
        topic_sentences.append(concatenated_sentences)

    return topic_sentences


In [88]:
topic_sentences = get_relevant_sentences(model, documents)

  0%|          | 0/157 [00:00<?, ?it/s]

100%|██████████| 157/157 [00:02<00:00, 63.49it/s] 


In [97]:
def generate_extractive_summaries(topic_sentences):
    """
    Generate extractive summaries for each topic using the extractive_summarization function.

    Parameters:
    - topic_sentences: The list of relevant sentences for each topic.
    Returns:
    - The list of extractive summaries for each topic.
    """
    summaries = []

    for topic_sentence in tqdm(topic_sentences, desc="Generating Summaries"):
        summary = extractive_summarization(topic_sentence)
        summaries.append(summary)


    return summaries

In [99]:
# Generate extractive summaries
extractive_summaries = generate_extractive_summaries(topic_sentences)

Generating Summaries: 100%|██████████| 156/156 [07:34<00:00,  2.92s/it]


In [107]:
topic_info = model.get_topic_info()[1:157]
df_extractive_summaries = pd.DataFrame({'Topic': topic_info['Topic'], 'Extractive Summary': extractive_summaries})

In [108]:
print(df_extractive_summaries)

     Topic                                 Extractive Summary
1        0  since China imposed the coronavirus lockdown i...
2        1  this treatment prevents the virus attaching to...
3        2  adaptability is by all means the necessity ski...
4        3  we conducted a rapid systematic review in line...
5        4  yes No The purpose of this form is to provide ...
..     ...                                                ...
152    151  at the onset of the COVID-19 pandemic, it was ...
153    152  legionella, which is widely found in the natur...
154    153  solid organ transplant recipients are at an el...
155    154  just 4 years ago, a virus not named coronaviru...
156    155  the coronavirus disease pandemic has changed l...

[156 rows x 2 columns]


In [109]:
df_extractive_summaries.to_csv('../extractive_summaries.csv', index=False)

In [112]:
from transformers import pipeline
from tqdm import tqdm

def generate_abstractive_summaries(extractive_summaries):
    """
    Applies abstractive summarization on the provided extractive summaries.

    Parameters:
    - extractive_summaries: The list of extractive summaries.
    Returns:
    - The list of abstractive summaries.
    """
    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
    
    summaries = []

    # Wrap the loop with tqdm for progress tracking
    for summary in tqdm(extractive_summaries, desc="Generating Abstractive Summaries"):
        abstractive_summary = summarizer(summary, min_length=5, max_length=500, truncation=True)
        # The output of summarizer is a list of summaries; extract the summary text.
        summary_text = abstractive_summary[0]['summary_text']
        summaries.append(summary_text)

    return summaries


In [113]:
# Generate abstractive summaries
abstractive_summaries = generate_abstractive_summaries(topic_sentences)

Generating Abstractive Summaries: 100%|██████████| 156/156 [24:44<00:00,  9.52s/it]


In [121]:
df_abstractive_summaries = pd.DataFrame({'Topic': topic_info['Topic'], 'Abstractive Summary': abstractive_summaries})
print(df_abstractive_summaries)

     Topic                                Abstractive Summary
1        0  China imposed the coronavirus lockdown in the ...
2        1  The impact of Casirivimab-Imdevimab on SARS-Co...
3        2  CJSMTE's Volume 20 Issue 2, the Special Theme ...
4        3  In early 2020, the emerging respiratory virus ...
5        4  The purpose of this form is to provide readers...
..     ...                                                ...
152    151  At the onset of the COVID-19 pandemic, it was ...
153    152  MNGS technology based on high-throughput seque...
154    153  All kidney transplant recipients at our center...
155    154  ZIKV has not impacted as many lives as SARS-Co...
156    155  The first Covid-19 listed studies with pediatr...

[156 rows x 2 columns]


In [122]:
df_abstractive_summaries.to_csv('../abstractive_summaries.csv', index=False)

In [123]:
df_abstractive_summaries = pd.read_csv('../abstractive_summaries.csv')

Topic summarization

In [27]:
from summarizer import Summarizer

# Load the pre-trained BERT summarizer model
bert_summarizer = Summarizer()

In [44]:
from tqdm import tqdm

# Create a function to generate summaries for each topic
def generate_topic_summaries(topics, documents, num_sentences=5):
    topic_summaries = {}

    # Use tqdm to add a progress bar
    for topic_id in tqdm(set(topics), desc="Generating Summaries", unit="topic"):
        # Extract documents for the current topic
        topic_documents = [doc for doc, t in zip(documents, topics) if t == topic_id]

        print(len(topic_documents))

        # Concatenate the documents to form a single text
        topic_text = ' '.join(topic_documents)

        # Generate summary using the BERT summarizer
        summary = bert_summarizer(topic_text, num_sentences=num_sentences)

        # Store the summary for the current topic
        topic_summaries[topic_id] = summary

    return topic_summaries

In [25]:
topics = model.get_topics()
concatenated_df = pd.read_csv('../concatenated.csv')
documents = concatenated_df['text'].tolist()

In [45]:
# Generate summaries for each topic
topic_summaries = generate_topic_summaries(topics, documents)

Generating Summaries:   0%|          | 0/157 [00:00<?, ?topic/s]

1


Generating Summaries:   0%|          | 0/157 [00:05<?, ?topic/s]


KeyboardInterrupt: 

In [64]:
# Create the combined DataFrame in one line
df_summaries = pd.DataFrame(list(topic_summaries.items()), columns=['TopicID', 'Summary']).assign(Label=labels)[['TopicID', 'Label', 'Summary']]

In [33]:
print(df_summaries)

     TopicID                              Label  \
0          0         pandemic_covid19_infection   
1          1       pandemic_coronavirus_covid19   
2          2  coronavirus_coronaviruses_covid19   
3          3     classroom_students_educational   
4          4       aerosol_aerosols_respiratory   
..       ...                                ...   
152      152      coronavirus_covid19_pneumonia   
153      153     pneumonia_infections_pathogens   
154      154       vaccines_vaccine_vaccination   
155      155       flaviviruses_flavivirus_zika   
156       -1            covid19_fever_pneumonia   

                                               Summary  
0    singultus hiccups is a common anatomical and p...  
1    die Unterschiede zwischen den Regionen Deutsch...  
2    accepted 10 January 1992 is a neurotropic coro...  
3    natural selection has generated a finely tuned...  
4    for at least 25 years, veterinary practitioner...  
..                                           

In [66]:
# Save the DataFrame to a CSV file
df_summaries.to_csv('../topic_summaries.csv', index=False)

In [32]:
df_summaries = pd.read_csv('../topic_summaries.csv')

Attempt with BART (ignore)

In [16]:
from transformers import BartTokenizer, BartForConditionalGeneration

In [17]:
# Initialize BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [18]:
# Function for extractive summarization using BART
def generate_summary(document):
    inputs = tokenizer.encode("summarize: " + document, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [19]:
import pandas as pd

df = pd.read_csv('../concatenated.csv')

In [25]:
from tqdm import tqdm

In [26]:
# Apply the summarization function to each document with a progress bar
tqdm.pandas(desc="Summarizing")
df['summary'] = df['text'].progress_apply(generate_summary)

Summarizing:   0%|          | 8/10000 [01:09<24:10:11,  8.71s/it]


KeyboardInterrupt: 

Accidentally summarized all of the documents instead of just the documents pertaining to each of the topics

In [28]:
from transformers import BertTokenizer, BertModel
import pandas as pd

In [29]:
df = pd.read_csv('../concatenated.csv')

In [30]:
documents = df['text'].tolist()

In [40]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [43]:
import torch
from tqdm import tqdm

# Function for extractive summarization using BERT embeddings
def generate_summary(document):
    # Tokenize and get embeddings
    inputs = tokenizer(document, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Use the [CLS] embedding as the summary
    summary_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return summary_embedding

In [48]:
# Apply the summarization function to each document
tqdm.pandas()
df['summary_embedding'] = df['text'].progress_apply(generate_summary)

100%|██████████| 10000/10000 [1:50:33<00:00,  1.51it/s] 


In [49]:
# Save the summarized data to a new CSV file
df.to_csv('../summarized.csv', index=False)

In [23]:
for topic_id, summary in topic_summaries.items():
    print(f"Topic {topic_id} Summary:")
    print(summary)
    print("\n")

Topic TopicID Summary:
0        0
1        1
2        2
3        3
4        4
      ... 
152    152
153    153
154    154
155    155
156     -1
Name: TopicID, Length: 157, dtype: int64


Topic Label Summary:
0             pandemic_covid19_infection
1           pandemic_coronavirus_covid19
2      coronavirus_coronaviruses_covid19
3         classroom_students_educational
4           aerosol_aerosols_respiratory
                     ...                
152        coronavirus_covid19_pneumonia
153       pneumonia_infections_pathogens
154         vaccines_vaccine_vaccination
155         flaviviruses_flavivirus_zika
156              covid19_fever_pneumonia
Name: Label, Length: 157, dtype: object


Topic Summary Summary:
0      singultus hiccups is a common anatomical and p...
1      die Unterschiede zwischen den Regionen Deutsch...
2      accepted 10 January 1992 is a neurotropic coro...
3      natural selection has generated a finely tuned...
4      for at least 25 years, veterinary practit

Experimental

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_ctfidf_for_topics(model, documents):
    # Step 1: Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Step 2: Calculate c-TF-IDF for each topic
    ctfidf_per_topic = {}

    for topic_id in range(max(topics) + 1):
        # Get documents associated with the current topic
        topic_documents = [documents[i] for i, t in enumerate(topics) if t == topic_id]

        # Calculate TF-IDF values for the words in the topic
        tfidf_matrix = vectorizer.fit_transform(topic_documents)
        tfidf_values = tfidf_matrix.sum(axis=0)

        # Get the feature names (words)
        feature_names = vectorizer.get_feature_names_out()

        # Create a dictionary of word to c-TF-IDF values
        topic_ctfidf = {feature_names[i]: tfidf_values[0, i] for i in range(len(feature_names))}

        # Store the c-TF-IDF values for the current topic
        ctfidf_per_topic[topic_id] = topic_ctfidf
    return ctfidf_per_topic

In [46]:
ctfidf_results = calculate_ctfidf_for_topics(model, documents)

In [47]:
# c-TF-IDF values for each topic
for topic_id, ctfidf_values in ctfidf_results.items():
    print(f"Topic {topic_id} c-TF-IDF values:")
    print(ctfidf_values)

Topic 0 c-TF-IDF values:
{'04': 0.006722318814753355, '10': 0.01344463762950671, '11': 0.006722318814753355, '12': 0.01344463762950671, '13': 0.006722318814753355, '14': 0.020166956444260066, '15': 0.01344463762950671, '16': 0.006722318814753355, '161': 0.006722318814753355, '17': 0.006722318814753355, '18': 0.006722318814753355, '19': 0.09411246340654697, '20': 0.006722318814753355, '2012': 0.006722318814753355, '2017': 0.006722318814753355, '2019': 0.006722318814753355, '2020': 0.020166956444260066, '2021': 0.006722318814753355, '21': 0.01344463762950671, '40': 0.006722318814753355, '48': 0.01344463762950671, '500': 0.01344463762950671, '64': 0.006722318814753355, '94': 0.006722318814753355, 'abnormal': 0.006722318814753355, 'about': 0.01344463762950671, 'accessory': 0.006722318814753355, 'accordingly': 0.006722318814753355, 'aches': 0.006722318814753355, 'acid': 0.006722318814753355, 'activities': 0.006722318814753355, 'acute': 0.04033391288852013, 'addition': 0.01344463762950671, '

In [38]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def extract_representative_documents(model, documents):
    # Get the c-TF-IDF representation of topics
    c_tf_idf = model.transform(documents)

    # Get the topic assignments for each document
    topics = model.get_topics()

    # Initialize an empty dictionary to store representative documents for each topic
    representative_docs = {}

    # Use tqdm to add a progress bar
    for topic_id in tqdm(set(topics), desc="Extracting Representative Documents", unit="topic"):
        # Filter documents belonging to the current topic
        topic_documents = [doc for doc, t in zip(documents, topics) if t == topic_id]

        # Calculate the c-TF-IDF representation for the topic's documents
        topic_c_tf_idf = model.transform(topic_documents)

        # Calculate cosine similarity between the topic's c-TF-IDF representation and all documents
        similarity_matrix = cosine_similarity(topic_c_tf_idf, c_tf_idf)

        # Find the document with the highest similarity (representative document)
        representative_index = np.argmax(similarity_matrix.sum(axis=0))
        representative_doc = documents[representative_index]

        # Store the representative document for the current topic
        representative_docs[topic_id] = representative_doc

    return representative_docs

In [49]:
# Instantiate the BERT extractive summarizer
summarizer = Summarizer()