In [12]:
from transformers import pipeline
pipe = pipeline("text2text-generation", model="p208p2002/bart-squad-qg-hl")

In [3]:
import re
import pandas as pd
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
spacy.cli.download("en_core_web_md")
nlp = spacy.load("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
def get_entities(texts):

    entities = []
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts)):
        ent1 = ent2 = ""
        prv_tok_dep = prv_tok_text = ""
        compound_or_modifier = ""

        for tok in doc:
            if tok.dep_ == "punct":
                continue  # Skip punctuation tokens

            # Handle compound or modifier tokens
            if tok.dep_ == "compound" or tok.dep_.endswith("mod"):
                compound_or_modifier = f"{prv_tok_text + ' ' if prv_tok_dep == 'compound' else ''}{tok.text}"

            # Entity 1: subject
            if "subj" in tok.dep_:
                ent1 = f"{compound_or_modifier} {tok.text}".strip()
                compound_or_modifier = ""  # Reset after use

            # Entity 2: object
            if "obj" in tok.dep_:
                ent2 = f"{compound_or_modifier} {tok.text}".strip()

            # Update previous token variables
            prv_tok_dep, prv_tok_text = tok.dep_, tok.text

        entities.append([ent1, ent2])

    return entities

split summaries into sentences

In [4]:
df = pd.read_csv('../abstractive_summaries.csv')
df['index']=df.index
df

Unnamed: 0,0,index
0,The World Health Organization announced the No...,0
1,Cancer is frequent in dogs and is by far their...,1
2,15 ARTDs have been linked previously to restri...,2
3,We found that granule cell neurons GCN of the ...,3
4,CJPH social media aims to engage our followers...,4
5,"iBS etiology is still not well understood, how...",5
6,HF is an emerging epidemic with more than 26 m...,6
7,nitrogen dioxide NO2 is an important chemical ...,7
8,According to the importance of rapid tests in ...,8
9,professional burnout is an important issue for...,9


In [5]:
sentences = []
for index, row in df.iterrows():
    topic_sentences = sent_tokenize(row['0'])
    for sentence in topic_sentences:
        sentences.append({'index': row['index'], 'Sentence': sentence})

# Creating a new dataframe with topic IDs and individual sentences
df_sentences = pd.DataFrame(sentences)

# Example to display the first few rows of the new dataframe
df_sentences

Unnamed: 0,index,Sentence
0,0,The World Health Organization announced the No...
1,0,Since the WHO has warned about the beginning o...
2,0,healthcare workers are experiencing unpreceden...
3,0,The study followed a group of workers who were...
4,0,The main stressors were prolonged periods of w...
...,...,...
541,32,Semiconductor-induced photocatalysis has attra...
542,32,Leachate promotes surface reactions between th...
543,32,most studies have dealt with the degradation o...
544,32,Leachate is a form of organic acid.


extract entities and relationships from each sent

In [17]:
df_sentences.to_csv('../summaries_sent.csv', index=False)

In [6]:
df_sentences['Entities'] = get_entities(df_sentences['Sentence'].tolist())

100%|██████████| 546/546 [00:01<00:00, 345.17it/s]


we could do llm, this performs much better than automated extraction

In [31]:
# # Example: reuse your existing OpenAI setup
# from openai import OpenAI

# # Point to the local server
# client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# completion = client.chat.completions.create(
#   messages=[
#     {"role": "system", "content": "You are an expert in extracting key information from a sentence. You must extract all entities and relationship mentioned in a sentence. Your response must be the exact entities and relationship. Your response must only contain the entities and relationship, separated by a comma."},
#     {"role": "user", "content": "Extract all entities and relationships from this sentence: 'Influenza vaccination should be administered seasonally to all non-allergic women contemplating a new pregnancy.'"}
#   ],
#   model="mistral-7b-instruct-v0.1.Q5_0.gguf",
#   temperature=0.7,
# )

# print(completion.choices[0].message.content)


100%|██████████| 488/488 [00:00<00:00, 505.03it/s]


tag the extracted info with [HL]

In [7]:
df_sentences

Unnamed: 0,index,Sentence,Entities
0,0,The World Health Organization announced the No...,"[World Health Organization, pandemic March]"
1,0,Since the WHO has warned about the beginning o...,"[equipment protocols, medical institutions]"
2,0,healthcare workers are experiencing unpreceden...,"[healthcare workers, enormous workload]"
3,0,The study followed a group of workers who were...,"[who, Central Italy]"
4,0,The main stressors were prolonged periods of w...,"[main stressors, physical activity]"
...,...,...,...
541,32,Semiconductor-induced photocatalysis has attra...,"[significant it, organic pollutants]"
542,32,Leachate promotes surface reactions between th...,"[Leachate, adsorbed ozone]"
543,32,most studies have dealt with the degradation o...,"[most studies, single leachate]"
544,32,Leachate is a form of organic acid.,"[Leachate, organic acid]"


In [8]:
def tag_entity_or_relation(sentence, answer):
    tagged_sentence = sentence.replace(answer, f"[HL]{answer}[HL]")
    return tagged_sentence

In [9]:
question_data = []
for index, row in df_sentences.iterrows():
    topic_id = row['index']
    sentence = row['Sentence']
    entities = row['Entities']  # Assuming this is a list of entities

    # Tagging and adding entities
    for entity in entities:
        if entity:  # Check if entity is not empty
            tagged_sentence = tag_entity_or_relation(sentence, entity)
            question_data.append({'Topic': topic_id, 'TaggedSentence': tagged_sentence, 'Answer': entity})

# Convert the prepared data into a dataframe
df_questions = pd.DataFrame(question_data)


In [13]:
tqdm.pandas()
def apply_pipe(text):
    output = pipe(text)
    return output[0]['generated_text']

generate zero shot questions

In [14]:
df_questions['Question'] = df_questions['TaggedSentence'].progress_apply(apply_pipe)
df_questions

100%|██████████| 1044/1044 [06:06<00:00,  2.85it/s]


Unnamed: 0,Topic,TaggedSentence,Answer,Question
0,0,The [HL] World Health Organization [HL] announ...,World Health Organization,Who announced the Novel Coronavirus SARS-CoV-2...
1,0,The World Health Organization announced the No...,pandemic March,What did the World Health Organization announc...
2,0,Since the WHO has warned about the beginning o...,equipment protocols,"Since the beginning of the pandemic in March, ..."
3,0,Since the WHO has warned about the beginning o...,medical institutions,Where have basic protective equipment and safe...
4,0,[HL] healthcare workers [HL] are experiencing ...,healthcare workers,Who are experiencing pressure from stressors?
...,...,...,...,...
1039,32,most studies have dealt with the degradation o...,single leachate,What studies have dealt with degradation of si...
1040,32,[HL] Leachate [HL] is a form of organic acid.,Leachate,What is a form of organic acid?
1041,32,Leachate is a form of [HL] organic acid [HL].,organic acid,What is leachate a form of?
1042,32,[HL] It [HL] is produced by the decomposition ...,It,How is water produced?


In [15]:
df_questions.to_csv('../question_answer_pair_3.csv', index=False)