In [1]:
import sys
import os
from  pathlib import  Path
sys.path.append(str(Path.cwd().parent))
from  config import OPENAI_API_KEY,NCBI_API_KEY,EMAIL
from  src.clinfoai.pubmed_engine import PubMedNeuralRetriever

# Using Clinfo.AI 

In this tutorial, we will go through each step of the Clinfo.AI workflow. Before we start, we need to set up a few things. 


### 1.- Setting up the environment:
1.a.- Install the conda environment using the YAML file provided.

``` conda env create -f environment.yaml ```

1.b.- Select this environment as the kernel when running the notebook. I recommend using VS Code.



### 2.- Creating Accounts

You will need at least one account and at most two (depending on how many calls per hour you plan to make):
* OPENAI account: If you start a free account for the first time, you will get $5 in API credits.
* NCBI_API_KEY: This is only necessary if you plan to make more than 10 calls per hour.


Once you have created your account(s), go to the **src\config.py** file and:

* Set OPENAI_API_KEY to your OpenAI API key

If you created an NCBI API account, add your key and email in the following values:
* NCBI_API_KEY 
* EMAIL

Otherwise, leave them as `None`.

In [2]:
# Make sure you have completed at least steps 1-2 before running this cell.
# This cell exports the OpenAI key from the config module as an environment variable.
from  config import OPENAI_API_KEY, NCBI_API_KEY, EMAIL

# os.environ only accepts string values, so a key that was left as None would
# fail with an opaque "str expected, not NoneType". Raise the same exception
# type with a message that says what to fix.
if not isinstance(OPENAI_API_KEY, str):
    raise TypeError(
        "OPENAI_API_KEY must be a string: set it in your config file (see step 2) "
        "before running this cell."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


### 3.- Defining your own prompts:
We have designed prompts for each step of the Clinfo.AI workflow, leveraging the power of in-context learning. If you want to use your own prompts, you can edit them in **src\prompts**; otherwise we will use the default prompts:

In [3]:
# Default prompt architecture file, given relative to the current working directory.
PROMPS_PATH = str(
    Path("..", "src", "clinfoai", "prompts", "PubMed", "Architecture_1", "master.json")
)

# Print the working directory so the relative path above can be checked against it.
current_directory = os.getcwd()
print(current_directory)

/Users/nanh/Documents/RMIT/Capstone/Untitled/Clinfo.AI/notebooks


### 4.- Define Clinfo.AI LLM Backbone
Clinfo uses a chain of LLMs to summarize information, thus we need to define an LLM backbone. 

We will start with OpenAI models; however, if you have access to GPUs, it is possible to use Clinfo.AI with vLLM to run open-source LLMs as backbones (see tutorial 3).

In [4]:
# Name of the model handed to the retriever as its LLM backbone.
MODEL: str = "gpt-3.5-turbo"
# Alternative backbone (uncomment to use it instead of the one above):
# MODEL: str = "Qwen/Qwen2-beta-7B-Chat"

### 5.- Init Clinfo+PubMed Engine
We have all the necessary data to start our clinfo+pubmed instance:

In [5]:
## 5.- Build the neural retriever from the prompt architecture file.
# Leave PROMPS_PATH untouched to use the base prompts; point it at your own
# prompt architecture file to use custom prompts.

nrpm = PubMedNeuralRetriever(
    architecture_path=PROMPS_PATH,
    model=MODEL,
    verbose=False,
    debug=False,
    open_ai_key=OPENAI_API_KEY,
    email=EMAIL,
)


Task Name: pubmed_query_prompt
------------------------------------------------------------------------

Task Name: relevance_prompt
------------------------------------------------------------------------

Task Name: summarization_prompt
------------------------------------------------------------------------

Task Name: synthesize_prompt
------------------------------------------------------------------------


# Let's start!

In [6]:
### Step 0 : Ask a question ###
# Exactly one question is sent per run. The COVID-19 example is kept as a
# commented alternative: leaving it as a live assignment only produced a value
# that was overwritten on the very next line.
# QUESTION    = "What is the prevalence of COVID-19 in the United States?"
QUESTION    = "What tests are needed to diagnose Chronic Neutropenia?"


## STEP 1 (Search PubMed): Convert the question into a query using an LLM
# This returns a list of queries (containing MeSH terms).
# These queries are used to retrieve articles from NCBI.
# Once retrieved, we collect a list of article ids (PMIDs).
pubmed_queries, article_ids = nrpm.search_pubmed(
    question=QUESTION,
    num_results=10,
    num_query_attempts=1)

# Show how many ids came back, then the generated queries and the ids themselves.
print(f"Articles retrived: {len(article_ids)}")
print(pubmed_queries)
print(article_ids)

  warn_deprecated(
  warn_deprecated(


Articles retrived: 10
['("Chronic Neutropenia" AND "diagnosis" AND "tests")']
['30870474', '19305028', '27841775', '3534197', '10388004', '20301576', '11964321', '34303547', '24827415', '6602565']


In [7]:
## Step 2: Fetch article data
# Previously, we only extracted the PMIDs. Now we will use those PMIDs to retrieve the metadata:
articles = nrpm.fetch_article_data(article_ids)

# Print an example article.
# NOTE(review): article_num is used directly as an index into `articles`; presumably
# `articles` is a 0-indexed list, so 1 selects the second retrieved article, not the
# first -- confirm which one is intended.
article_num = 1
print(f"Article {article_num}:\n")

# Other parts of the record that can be inspected (uncomment to print them):
#print(articles[article_num].keys())
#print(articles[article_num]['PubmedData'])
# Abstract text of the selected article.
print(articles[article_num]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"])
#print(articles[article_num]["MedlineCitation"]["Article"])


Article 1:

['Chronic neutropenia is a decrease in circulating neutrophils in the peripheral blood lasting over 6 months. Values need to be refered with the age and race. In children aged 2 weeks to 12 months reffered values are above 1000/03BCL. There are congenital and aquired reasons of neutropenia in infancy. The most common type of chronic neutropenia in infants is chronic, benign neutropenia (AIN). Authors present ten infants between three and six months with chronic, benign neutropenia. The reason of ordering laboratory tests at outpatient clinic were benign upper respiratory tract infections (four cases), pallor (four cases) and on parental demand (one case). In one infant neutropenia was observed during treatment of pneumonia at a district hospital.']


In [8]:
# STEP 3: Summarize each article.
# Per the tutorial authors: although this looks like one single call, the step is
# parallelized and performs one LLM call per article to summarize it. A further
# LLM call then rates each article's relevancy to the original question.

article_summaries, irrelevant_articles = nrpm.summarize_each_article(articles, QUESTION)

In [9]:
# Display the summaries of the relevant articles
# (first value returned by nrpm.summarize_each_article).
article_summaries

[{'title': '[Neutropenia in infancy - sometimes chronic and benign - own experiences].',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/19305028/',
  'abstract': 'Chronic neutropenia is a decrease in circulating neutrophils in the peripheral blood lasting over 6 months. Values need to be refered with the age and race. In children aged 2 weeks to 12 months reffered values are above 1000/03BCL. There are congenital and aquired reasons of neutropenia in infancy. The most common type of chronic neutropenia in infants is chronic, benign neutropenia (AIN). Authors present ten infants between three and six months with chronic, benign neutropenia. The reason of ordering laboratory tests at outpatient clinic were benign upper respiratory tract infections (four cases), pallor (four cases) and on parental demand (one case). In one infant neutropenia was observed during treatment of pneumonia at a district hospital.',
  'citation': 'Kaczorowska-Hać B, Wierzba J, Stefanowicz J, Sielachowicz K, Wlazłowsk

In [10]:
# Display the articles deemed irrelevant
# (second value returned by nrpm.summarize_each_article).
irrelevant_articles

[{'title': 'Invasive aspergillosis and endocarditis.',
  'url': 'https://pubmed.ncbi.nlm.nih.gov/34303547/',
  'abstract': 'INTRODUCTION:\nAspergillusfumigatus can cause a systemic infection called invasive aspergillosis causing pulmonary and extra-pulmonary damage. Aspergillus endocarditis (AE) is a relatively rare disease but can be life-threatening.\n\nCASE REPORTS:\nWe report here on five cases of endocarditis due to invasive aspergillosis: a 58-year-old man receiving immunosuppressive medication following a kidney graft, a 58-year-old man undergoing chemotherapy for chronic lymphocytic leukaemia, a 55-year-old man receiving corticosteroids for IgA vasculitis, a 52-year-old HIV-infected woman under no specific treatment and a 17-year-old boy under immunosuppressive therapy for auto-immune chronic neutropenia.\n\nDISCUSSION:\nAspergillus accounts for 25-30% of fungal endocarditis and 0.25% to 8.5% of all cases of infectious endocarditis. Aspergillus endocarditis results from invasio

In [11]:
# STEP 4: Synthesize all the summaries into a single answer to the question.
synthesis = nrpm.synthesize_all_articles(article_summaries, QUESTION)

# Print a label line followed by the synthesis itself.
print("synthesis", synthesis, sep="\n")

synthesis
Literature Summary: Chronic neutropenia presents a diagnostic challenge, with studies highlighting various aspects of the condition. Retrospective analysis of 240 pediatric cases with chronic neutropenia positive for anti-neutrophil antibodies found that reduced colony-forming units-granulocyte-macrophage (CFU-GM) and hematopoietic progenitors were associated with severe infections and delayed remission. Another study of 41 children emphasized the association between antineutrophil antibody strength at diagnosis and the age of recovery. Autoimmune neutropenia of infancy was characterized by neutrophil antibodies, selective neutropenia, and myeloid hyperplasia, with recovery typically by age 5. A case series of infants with benign neutropenia highlighted various clinical presentations and the need for evaluation. While antineutrophil antibody testing is uncertain in chronic idiopathic neutropenia, treatment with granulocyte colony-stimulating factor (G-CSF) can effectively inc

# Great! We answered our first question using Clinfo.AI!
## Here are all the steps condensed:

In [13]:
# Configuration: prompt architecture file and LLM backbone.
PROMPS_PATH = os.path.join(
    "..", "src", "clinfoai", "prompts", "PubMed", "Architecture_1", "master.json"
)
MODEL: str = "gpt-3.5-turbo"
# MODEL: str = "Qwen/Qwen2-beta-7B-Chat"

# Build a fresh retriever so this cell does not rely on the step-by-step cells above
# (apart from the imports and the keys loaded at the top of the notebook).
nrpm = PubMedNeuralRetriever(
    architecture_path=PROMPS_PATH,
    model=MODEL,
    verbose=False,
    debug=False,
    open_ai_key=OPENAI_API_KEY,
    email=EMAIL,
)

### STEP 0: Ask a question ###
question = "What is the prevalence of COVID-19 in the United States?"

### STEP 1: Search PubMed ###
pubmed_queries, article_ids = nrpm.search_pubmed(
    question,
    num_results=10,
    num_query_attempts=1,
)

### STEP 2: Fetch article data ###
articles = nrpm.fetch_article_data(article_ids)

### STEP 3: Summarize each article and separate out the irrelevant ones ###
article_summaries, irrelevant_articles = nrpm.summarize_each_article(articles, question)

### STEP 4: Synthesize the results ###
synthesis = nrpm.synthesize_all_articles(article_summaries, question)
print("Result:", synthesis, sep="\n")


Task Name: pubmed_query_prompt
------------------------------------------------------------------------

Task Name: relevance_prompt
------------------------------------------------------------------------

Task Name: summarization_prompt
------------------------------------------------------------------------

Task Name: synthesize_prompt
------------------------------------------------------------------------
Result:
Literature Summary: The prevalence of COVID-19 in the United States has been reported in various studies. A cross-sectional nationwide survey involving 5,203 adults in the US found a prevalence of 1.7% [1]. Another study assessing COVID-19 testing and incidence among 6,342,455 veterans who utilized VA services reported a 20.4% positivity rate within this population [5]. Additionally, a modeling study across all 50 US states and the District of Columbia estimated the infection fatality ratio to be approximately 0.6-0.7% [3]. These findings suggest variations in COVID-19 