# CCI Literature Analysis

This notebook queries PubMed Central to extract full-text XML files of articles on a pre-defined list. After preprocessing the text and extracting paragraphs that reference one of ten Charlson Comorbidity Index versions we are interested in, each paragraph is analyzed with a Large Language Model using the Azure AI Foundry endpoint to extract the relevant references. 

Author: Josh Fuchs

Copyright 2025, The University of North Carolina at Chapel Hill. Permission is granted to use in accordance with the MIT license. The code is licensed under the open-source MIT license.

This software uses the Entrez Programming Utilities available from the National Center for Biotechnology Information (NCBI). Please carefully review the NCBI's Disclaimer and Copyright notice at https://www.ncbi.nlm.nih.gov/home/about/policies/ before use. 

In [0]:
from Bio import Entrez
from lxml import etree

import urllib 
import os
import sys
import re
import string
import configparser

import numpy as np
import pandas as pd
import random 
from copy import deepcopy
from collections import Counter

import xml.dom.minidom
from bs4 import BeautifulSoup, element

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.models import AssistantMessage, SystemMessage, UserMessage

## Define Variables

In [0]:
# Load the Azure AI Foundry endpoint/key and the NCBI Entrez API key from config.ini

# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the configuration file.
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means config.ini was not loaded.
# Fail here with a clear message instead of a confusing NoSectionError below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found (or could not be read) in the current working directory"
    )

# Access values from the configuration file
endpoint_url = config.get('azure_foundry', 'endpoint_url')
azure_key = config.get('azure_foundry', 'key')

entrez_key = config.get('entrez','entrez_api_key')

In [0]:
# Search criteria: one dictionary per Charlson Comorbidity Index (CCI)
# version paper we are looking for. Each paper is written below as a
# (title, year, first-author surname) row, newest first, and expanded into
# a dictionary with the keys 'title', 'year' and 'surname'.
search_criteria = [
    {'title': title, 'year': year, 'surname': surname}
    for title, year, surname in (
        (
            'Updating and validating the Charlson Comorbidity Index and Score for risk adjustment in hospital discharge abstracts using data from 6 countries',
            '2011',
            'Quan',
        ),
        (
            'Coding algorithms for defining comorbidities in ICD-9-CM and ICD-10 administrative data',
            '2005',
            'Quan',
        ),
        (
            'New ICD-10 version of the Charlson comorbidity index predicted in-hospital mortality',
            '2004',
            'Sundararajan',
        ),
        (
            'Improved Comorbidity Adjustment for Predicting Mortality in Medicare Populations',
            '2003',
            'Schneeweiss',
        ),
        (
            'Measuring potentially avoidable hospital readmissions',
            '2002',
            'Halfon',
        ),
        (
            'Development of a comorbidity index using physician claims data',
            '2000',
            'Klabunde',
        ),
        (
            'Validation of a combined comorbidity index',
            '1994',
            'Charlson',
        ),
        (
            'Adapting a clinical comorbidity index for use with ICD-9-CM administrative data: differing perspectives',
            '1993',
            'Romano',
        ),
        (
            'Adapting a clinical comorbidity index for use with ICD-9-CM administrative databases',
            '1992',
            'Deyo',
        ),
        (
            'A new method of classifying prognostic comorbidity in longitudinal studies: development and validation',
            '1987',
            'Charlson',
        ),
    )
]


In [0]:
def format_references_for_search(criteria: list[dict]) -> list[str]:
    '''
    Builds the in-text citation strings to search for, for every CCI version paper.

    Authors cite the same paper in many slightly different ways, so each paper is
    expanded into ten spellings. For Charlson 1987 these are, in order
    (NBSP marks a non-breaking space character):
        Charlson 1987
        Charlson et al. 1987
        Charlson et al. (1987)
        Charlson et[NBSP]al., 1987
        Charlson et[NBSP]al. 1987
        Charlson et al., 1987
        Charlson et[NBSP]al. (1987)
        Charlson et al.[NBSP](1987)
        Charlson et al, 1987
        Charlsonet al., 1987   (surname joined directly to "et al.")

    PARAMETERS
    criteria : list of dictionaries
        one dictionary per CCI version paper; the 'surname' and 'year'
        entries of each dictionary are used

    OUTPUT
    result : list
        ten strings per paper, papers in input order and spellings in the
        order shown above

    '''
    # One template per citation spelling; the order defines the output order.
    citation_templates = (
        "{surname} {year}",
        "{surname} et al. {year}",
        "{surname} et al. ({year})",
        "{surname} et\xa0al., {year}",
        "{surname} et\xa0al. {year}",
        "{surname} et al., {year}",
        "{surname} et\xa0al. ({year})",
        "{surname} et al.\xa0({year})",
        "{surname} et al, {year}",
        "{surname}et al., {year}",
    )

    formatted = []
    for paper in criteria:
        surname, year = paper['surname'], paper['year']
        formatted.extend(
            template.format(surname=surname, year=year)
            for template in citation_templates
        )
    return formatted

In [0]:
# Build the list of CCI version citations that every paragraph is searched for;
# paragraphs containing one of these terms are kept for analysis.
# Searching for the citation text itself also lets us identify and analyze papers
# that cite a CCI version in the body but omit it from the reference list.
# This is rare; see PMC8505350 for an example.

# Standard formatting variations, generated from the search criteria
charlson_ref_versions = format_references_for_search(search_criteria)

# A very unusual citation format, taken from PMCID: PMC7063690
charlson_unusual = ['Charlson, Pompei, Ales, & MacKenzie, 1987']

# Generated variations first, followed by the hand-written unusual ones
charlson_search_terms = [*charlson_ref_versions, *charlson_unusual]

## Extract article as XML From PMC

In [0]:
def fetchPMCxml(pmc_id: str,entrez_key:str) -> BeautifulSoup:
    '''
    Uses the Entrez module in BioPython to access the PMC database. Articles are returned as xml files, then
    converted into BeautifulSoup objects. If the query to PMC does not return an article, the function returns None.

    Email is hard-coded in. Email is required from the NCBI.
    API key allows up to 10 queries/sec, as opposed to 3 without.

    See https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ for details about
    getting an API key for the NCBI.

    PARAMETERS
    pmc_id : string
        string that is the PMC ID formatted like "9651183"

    entrez_key : string
        API key for NCBI to allow up to 10 queries/sec

    RETURNS
    soup : BeautifulSoup object
        object that contains the article, parsed with the "xml" parser

    Returns None if fetching or parsing raises an exception.
    '''
    Entrez.email = "YOUR_EMAIL@MAIL.EDU" # you must give NCBI an email address
    Entrez.api_key = entrez_key

    try:
        fetchHandle = Entrez.efetch(db="pmc", retmax=1, retmode="xml", id=pmc_id)
        try:
            data = fetchHandle.read()
        finally:
            # Always release the connection, even when reading fails part-way
            fetchHandle.close()
        return BeautifulSoup(data,"xml")
    except Exception:
        # Best effort: any fetch/parse failure is reported to the caller as None.
        # Catching Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch run can still be interrupted.
        return None


## Find CCI References

In [0]:
def compare_strings(article_title: str, criteria_title: str) -> bool:
    """
    Fuzzy title comparison: reports whether at least 80% of the words of article_title
    also occur in criteria_title.

    Both titles are lower-cased and split on whitespace, and every word has its leading and
    trailing punctuation removed before comparing. Requiring only 80% overlap tolerates typos
    or a missing word in the cited title, so an exact match is not needed.

    PARAMETERS
    article_title : string
        title of the reference in the paper being analyzed

    criteria_title : string
        title of the Charlson version paper being compared to

    RETURNS
        boolean
        True if at least 80% of the words of article_title are found in criteria_title
        False otherwise, including when article_title contains no words
    """
    def _title_words(title):
        # lower-case, split on whitespace, trim punctuation from both ends of each word
        return [token.strip(string.punctuation) for token in title.lower().split()]

    candidate_words = _title_words(article_title)
    known_words = set(_title_words(criteria_title))

    # An empty article title can never match
    if not candidate_words:
        return False

    hits = sum(1 for token in candidate_words if token in known_words)
    return hits / len(candidate_words) >= 0.8

In [0]:
def _reference_label_number(ref):
    """
    Derives the integer reference number of a single <ref> element.

    The number is built from the digits of the <label> tag when the element has one;
    otherwise it is derived from the id attribute by removing the known id prefixes.

    RETURNS
        int, or None when no integer can be derived (no label and no id, a label
        without digits, or an id in a format that is not recognized)
    """
    label_tag = ref.find('label')

    if label_tag is not None:
        # Keep only the digits of the label, e.g. "[12]" or "12." -> "12"
        label = ''.join(filter(str.isdigit, label_tag.text))
    else:
        # If label tag is not found, use the id attribute
        # to format the label based on how the id is formatted
        label = ref.get('id')
        if not isinstance(label, str) or not label:
            return None
        if 'bib-' in label:
            label = label.split('bib-', 1)[1]
        elif label.startswith('bibr'):
            # for references formatted as bibr1-23337214241284181
            id_match = re.search(r'bibr(\d+)-', label)
            if id_match is None:
                return None
            label = id_match.group(1)
        elif label.startswith(('bib','BIB','ref','REF','CIT','cit')):
            label = label[3:]
        elif label.startswith(('bb','CR')):
            label = label[2:]
        elif label.startswith(('B','C','b','c','R','r')):
            label = label[1:]

    try:
        return int(label)
    except (TypeError, ValueError):
        # e.g. an empty label, or an id with an unexpected non-numeric remainder
        return None


def find_charlson_references(soup: BeautifulSoup,search_criteria: list[dict]) -> dict[int, str]:
    """
    Parses an XML document represented by a BeautifulSoup object to find references
    and matches them against a list of search criteria. The matching references are returned as a dictionary
    with reference labels as keys and formatted reference strings as values.

    A reference matches a criterion when the years are equal and either
      - its <article-title> shares at least 80% of its words with the criterion title
        (see compare_strings), or
      - it has no separate <article-title>/<year> pair and the text of its <mixed-citation>
        contains the criterion title exactly.

    A matching reference whose number cannot be determined is skipped, so one unusually
    formatted reference does not abort the analysis of the whole article.

    PARAMETERS
    soup : BeautifulSoup object
        A BeautifulSoup object representing the parsed XML document.

    search_criteria : list of dict
        A list of dictionaries containing search criteria. Each dictionary should have the keys:
            - 'title' (str): The title of the reference to match.
            - 'year' (str): The year of the reference to match.
            - 'surname' (str): The surname to include in the formatted reference string.

    RETURNS
    charlson_references : dict
        A dictionary where the keys are reference labels (int) and the values are formatted
        reference strings (str) of the form "Surname Year".
    """

    # Initialize an empty dictionary to store the charlson_references
    charlson_references = {}

    # Iterate through each reference
    for ref in soup.find_all('ref'):
        article_title_tag = ref.find('article-title')
        ref_year_tag = ref.find('year')

        matched_criteria = []
        if article_title_tag and ref_year_tag:
            article_title = article_title_tag.text
            ref_year = ref_year_tag.text

            # At least 80% of the words in the title need to match;
            # this accounts for titles missing a word or two
            matched_criteria = [
                criteria for criteria in search_criteria
                if compare_strings(article_title, criteria['title']) and ref_year == criteria['year']
            ]
        else:
            # Some references combine authors and title into a single mixed-citation tag
            mixed_citation_tag = ref.find('mixed-citation')
            if mixed_citation_tag:
                xml_title = mixed_citation_tag.text
                year_tag = mixed_citation_tag.find('year')
                xml_year = year_tag.text if year_tag else None

                # The whole citation text is searched for the exact criterion title
                matched_criteria = [
                    criteria for criteria in search_criteria
                    if xml_title and xml_year and criteria['title'] in xml_title and criteria['year'] == xml_year
                ]

        if not matched_criteria:
            continue

        label = _reference_label_number(ref)
        if label is None:
            # No usable reference number; nothing in the body could be mapped to it
            continue

        # If several criteria match, the last one wins
        for criteria in matched_criteria:
            charlson_references[label] = f"{criteria['surname']} {criteria['year']}"

    # Return the dictionary of matching charlson_references
    return charlson_references

## Search text for references and replace

This function searches the text for each CCI version reference and replaces the XML reference with the text reference. This reformatting enables paragraph selection and data extraction by the LLM.

In [0]:
def _expand_xref_text(ref_text):
    """
    Splits the text of one xref tag into individual reference tokens.

    The text is split on commas; an en-dash range such as "5–7" is expanded to
    "5", "6", "7"; square brackets around a single token are removed. Anything
    that is not a numeric range is kept as-is.
    """
    tokens = []
    for part in ref_text.split(','):
        part = part.strip()
        # Check if the part contains a range
        if '–' in part:
            bounds = [bound.strip() for bound in part.split('–')]
            if len(bounds) == 2 and all(bound.isdecimal() for bound in bounds):
                start, end = int(bounds[0]), int(bounds[1])
                tokens.extend(str(number) for number in range(start, end + 1))
            else:
                # This is for rare cases where part = online supplementary file – database
                tokens.append(part)
        # Next check if the reference number is stored between square brackets
        elif '[' in part and ']' in part:
            tokens.append(part.strip('[]'))
        else:
            tokens.append(part)
    return tokens


def replace_references_in_body(soup: BeautifulSoup, references: dict[int,str]) -> BeautifulSoup:
    """
    This function processes an XML document represented by a BeautifulSoup object to find and replace
    reference tags (`xref`) and formatted references (e.g., [2]) in the body text with corresponding
    reference strings from a given dictionary. The updated XML document is returned.

    If the document contains any xref tags, every xref tag is replaced by plain text: reference
    numbers found in `references` become "(Surname Year)", other tokens are kept, and multiple
    tokens are joined with ", ". If the document contains no xref tags at all, bracketed numbers
    such as [5] are replaced in the serialized document instead.

    Numeric tokens are recognized with str.isdecimal(), which accepts exactly the digit characters
    that int() can convert; characters such as a superscript two are left untouched.

    PARAMETERS
    soup : BeautifulSoup object
        A BeautifulSoup object representing the parsed XML document. It is not modified.
    references : dict
        A dictionary where the keys are reference numbers (int) and the values are formatted reference strings (str).

    RETURNS
    updated_soup : BeautifulSoup object
        A BeautifulSoup object representing the updated XML document with replaced references.
    """

    # Create a copy of the soup so you don't modify the original
    soup_copy = deepcopy(soup)
    # Find all xref tags in the XML document
    xrefs = soup_copy.find_all('xref')

    # If xrefs is empty, that most likely means the references are embedded as [2] in the text.
    # In this case, we'll search for those and replace them as appropriate.
    if not xrefs:
        def _swap_bracketed(match):
            ref_number = int(match.group(1))
            if ref_number in references:
                return f"({references[ref_number]})"
            # Unknown reference number: leave the original "[n]" text in place
            return match.group(0)

        body_text = re.sub(r'\[(\d+)\]', _swap_bracketed, str(soup_copy))

        # Update the soup with the modified body text
        # NOTE(review): the rebuilt document is parsed with 'html.parser', unchanged from the
        # original behavior - confirm downstream XML parsing tolerates this.
        return BeautifulSoup(body_text, 'html.parser')

    # Iterate through each xref tag
    for xref in xrefs:
        replacement_texts = []
        for ref_number in _expand_xref_text(xref.text.strip()):
            if ref_number.isdecimal() and int(ref_number) in references:
                replacement_texts.append(f"({references[int(ref_number)]})")
            else:
                replacement_texts.append(ref_number)

        # Replace the whole xref tag with the expanded references
        xref.replace_with(', '.join(replacement_texts))

    return soup_copy

## Identify paragraphs with CCI version references

Parse the text into paragraphs, then identify which paragraphs include CCI version references.

In [0]:
# These functions are slightly modified from 
# pubmed_parser to work with XML data already
# loaded into the notebook from PMC

def stringify_children(node):
    """
    Returns all text found under node as one string.

    Every non-empty text fragment yielded by node.itertext() is stripped of
    surrounding whitespace and the fragments are concatenated with no separator,
    so text on either side of an inline tag is joined directly.
    """
    fragments = []
    for fragment in node.itertext():
        if fragment:
            fragments.append(fragment.strip())
    return "".join(fragments)

def parse_article_meta(tree):
    """
    Extracts the article identifiers from the first article-meta element of tree.

    Returns a dictionary with the keys "pmid", "pmc", "doi" and "publisher_id",
    holding the text of the matching article-id element. An identifier that is
    not present (or a document without article-meta) yields an empty string.
    """
    # output key -> path of the article-id element that carries it
    id_paths = (
        ("pmid", 'article-id[@pub-id-type="pmid"]'),
        ("pmc", 'article-id[@pub-id-type="pmc"]'),
        ("doi", 'article-id[@pub-id-type="doi"]'),
        ("publisher_id", 'article-id[@pub-id-type="publisher-id"]'),
    )

    article_meta = tree.find(".//article-meta")

    dict_article_meta = {}
    for key, path in id_paths:
        id_node = article_meta.find(path) if article_meta is not None else None
        dict_article_meta[key] = id_node.text if id_node is not None else ""

    return dict_article_meta

def parse_pubmed_paragraph(input_data):
    """
    Parse a PubMed Central article and return one dictionary per paragraph
    found in the article body.

    Parameters
    ----------
    input_data: str, bytes or bs4.element.Tag
        One of:
        - a string (or UTF-8 encoded bytes) containing the XML document itself,
          recognized by starting with '<' after leading whitespace,
        - a BeautifulSoup Tag object (which includes a whole BeautifulSoup document),
        - anything else is passed to etree.parse as a path to an XML file.

    Return
    ------
    dict_pars: list
        A list with one dictionary per <p> element under <body>, with the keys
        'pmc' and 'pmid' (article identifiers), 'section' (text of the title that is
        a sibling of the paragraph, or "" if there is none) and 'text' (paragraph text).
    """
    if isinstance(input_data, bytes):
        input_data = input_data.decode('utf-8')  # Decode bytes to string

    xml_str = None
    if isinstance(input_data, element.Tag):
        # input_data is a BeautifulSoup Tag object
        xml_str = str(input_data)
    elif isinstance(input_data, str) and input_data.strip().startswith('<'):
        # input_data is an XML string
        xml_str = input_data

    if xml_str is not None:
        # Remove the XML declaration: lxml refuses to parse a str (as opposed to
        # bytes) that still carries an encoding declaration.
        xml_str = re.sub(r'<\?xml.*?\?>', '', xml_str)
        tree = etree.fromstring(xml_str)
    else:
        # input_data is a path to an XML file
        tree = etree.parse(input_data)

    dict_article_meta = parse_article_meta(tree)
    pmid = dict_article_meta["pmid"]
    pmc = dict_article_meta["pmc"]
    paragraphs = tree.xpath("//body//p")
    dict_pars = list()

    for paragraph in paragraphs:
        paragraph_text = stringify_children(paragraph)

        # The section name is the <title> that shares the paragraph's parent
        section = paragraph.find("../title")
        if section is not None:
            section = stringify_children(section).strip()
        else:
            section = ""

        dict_par = {
            "pmc": pmc,
            "pmid": pmid,
            "section": section,
            "text": paragraph_text,
        }

        dict_pars.append(dict_par)

    return dict_pars

In [0]:
def filter_paragraphs(paragraph_dictionary: list[dict], charlson_terms: list[str]) -> list[dict]:
    """
    Keeps only the paragraphs whose text mentions at least one of the given terms.

    The comparison is case-insensitive and looks for each term as a substring of the
    paragraph's 'text' field. Matching paragraphs are returned unchanged and in their
    original order.

    PARAMETERS
    paragraph_dictionary : list of dict
        A list of dictionaries, each containing a 'text' field representing the paragraph text.
    charlson_terms : list of str
        A list of terms to search for within the paragraph text.

    RETURNS
    matching_paragraphs : list of dict
        The dictionaries from paragraph_dictionary whose text contains any of the terms.
    """

    def _mentions_any_term(entry):
        # Lower-case both sides so the substring test ignores case
        haystack = entry['text'].lower()
        return any(term.lower() in haystack for term in charlson_terms)

    return [entry for entry in paragraph_dictionary if _mentions_any_term(entry)]

## Define functions and variables for LLM data extraction

In [0]:
# Set the model name and create a ChatCompletionsClient object
# model_name is given to the client here and is also passed as an argument
# to the query/analysis functions defined in the following cells.
model_name = "Llama-3.3-70B-Instruct"

# endpoint_url and azure_key are the values read from config.ini
# (section 'azure_foundry') earlier in the notebook.
client = ChatCompletionsClient(
    endpoint=endpoint_url,
    credential=AzureKeyCredential(azure_key),
    model=model_name,
)

In [0]:
def query(question: dict[str,str],model_name: str) -> tuple[str, str, int]:
    """
    Sends one chat-completions request through the module-level `client` and
    returns the first answer together with the token usage.

    Every request uses the same fixed generation settings: max_tokens=512,
    temperature=0, top_p=0.1 and no presence or frequency penalty.

    Note that max_tokens has been deprecated and replaced with max_completion_tokens in
    the OpenAI Guide. This version from Azure is behind that version.

    PARAMETERS
    question : input for messages to send to the API.
        dictionary using the format: {'role': 'content', 'content': 'your message'}
        or list using AssistantMessage, SystemMessage, and/or UserMessage.
        It is passed unchanged as the `messages` argument.
    model_name : string
        model name of the model to use, from Azure AI Foundry

    RETURNS
    question : input for messages to send to the API.
        the input question, returned unchanged to assist with tracking
    response_content : string
        the message content of the first choice in the API response
    total_tokens : int
        the total number of tokens used (query + response)
    """
    completion = client.complete(
        messages=question,
        model=model_name,
        max_tokens=512,
        temperature=0,
        top_p=0.1,
        presence_penalty=0.0,
        frequency_penalty=0.0,
    )

    # Only the first returned choice is used
    first_choice = completion.choices[0]

    return question, first_choice.message.content, completion.usage.total_tokens

In [0]:
# Define the Content/System Messages and the series of questions
# NOTE: these strings are sent to the LLM, so their exact wording (including
# spacing) is part of the program's behavior. Each question ends with a space
# because the paragraph text is appended directly after it.

# System message used with the initial screening questions
system_message0 = """You are a literature review assistant. You will carefully review paragraphs and answer based on only what is present in the text. We are analyzing published literature to determine if it is clear which reference or references were used to calculate the Charlson Comorbidity Index (CCI) in this text. The text might include references that were not used to calculate the CCI, it might include just a single reference that was used, or it might contain multiple references that were used. """

# q0: does the paragraph state that the CCI was calculated or used? (Yes/No)
q0 = """You are an advanced language model tasked with analyzing a provided paragraph from a published research article. Your goal is to determine whether the text explicitly states that the Charlson Comorbidity Index (CCI) was calculated in the study. Please follow these guidelines: 1. Read the paragraph carefully and identify any mention of the Charlson Comorbidity Index. 2. Look for keywords and phrases that indicate calculation or use of the CCI, such as "calculated", "assessed", "used", or similar terms. 3. If the paragraph suggests that the CCI was calculated or used in the analysis, respond with "Yes" 4. If the paragraph does not mention calculation or use of the CCI respond with "No" 5. Your response should be limited to "Yes" or "No" only, without any additional commentary or explanation. Please provide your response based on the analysis of the text provided. Don't  be strict. """

# q1: does the text reference the paper or weights used for the CCI? (Yes/No)
# q2: does the paragraph contain more than one such reference? (Yes/No)
q1 = """Answer Yes or No only. Does the following text reference which paper or weights was used to calculate the Charlson Comorbidity Index (CCI)? """
q2 = """Answer Yes or No only. Does the following paragraph contain more than 1 reference to how the Charlson Comorbidity Index (CCI) was calculated? """

# The following is only asked if question 2 = NO
q3 = """Which reference was used to calculate the CCI? Return only the reference as Last Name Year. If it is not clear which reference was used to calculate the CCI, return NONE. """

# The following is only asked if question 2 = YES
# system_message1 contains four worked examples (few-shot) of the expected answer format
system_message1 = """You are a literature review assistant. You will carefully review paragraphs and answer based on only what is present in the text. We are analyzing sentences from published literature to determine if it is clear which reference paper was used to calculate the Charlson Comorbidity Index (CCI) in this text. The following texts contain multiple references that might indicate how the Charlson Comorbidity Index (CCI) was calculated. We want to identify which reference or references were used to calculate the CCI in this paper. Sometimes these prompts will reference papers, but those were not used to calculate the CCI. Be strict.

Here is an example: Which reference or references were used to calculate the CCI in this paragraph? Return only the reference as Last Name and Year. If it is not clear what was implemented, return None. "In addition to DM, comorbidities defined by the Deyo's Charlson Comorbidity Index [Deyo 1992] were examined using a revised mapping algorithm cited by Quan et al. [Quan 2005]." The correct response is Deyo 1992 and Quan 2005

Here is an example:  Which reference or references were used to calculate the CCI in this paragraph? Return only the reference as Last Name and Year. If it is not clear what was implemented, return None. "The Charlson Comorbidity Index is sometimes used to measure somatic comorbidity in schizophrenic patients. The index includes 19 severe chronic disorders that are assigned a weighted score according to severity. The index was originally constructed to quantify the impact of comorbidity on mortality in a hospital setting among breast cancer patients (Charlson 1987) and later was adapted to ICD-10 diagnoses (Sundararajan 2004)." The correct response is Sundararajan 2004

Here is an example: Which reference or references were used to calculate the CCI in this paragraph? Return only the reference as Last Name and Year. If it is not clear what was implemented, return None. "We calculated the Charlson Comorbidity Index (Charlson 1987) using the (Quan 2005) implementation of ICD codes." The correct response is Quan 2005

Here is an example: Which reference or references were used to calculate the CCI in this paragraph? Return only the reference as Last Name and Year. If it is not clear what was implemented, return None. "For most applications, the CCI score is calculated by manual record review or using claims data, typically coded using the International Classification of Diseases, 9th Version (ICD-9).(Deyo 1992) (Romano 1993) (Quan 2005) The former approach is costly and the latter introduces biases due to coding errors, heterogeneous coding conventions, and the granularity of the coding system.16 In addition, previous research has suggested that manually extracting comorbidity information from medical records is superior to the use of ICD-9 codes17 and claims data are not available until after discharge time." The correct response is None """

q4 = """Which reference or references were used to calculate or assess the CCI in this paragraph? Return only the reference as Last Name and Year. If it is not clear what was implemented, return None. """

# Create list for system messages and questions for easy reference
# The list position is the number in the variable name:
# system_message_list[0] is system_message0 and question_list[3] is q3.
system_message_list = [system_message0,system_message1]
question_list = [q0,q1,q2,q3,q4]

In [0]:
def _ask(messages: list, prompt: str, model_name: str) -> tuple[str, int]:
    '''
    Adds one user turn to a conversation, queries the LLM, and records its reply.

    PARAMETERS:
    messages : list
        The conversation so far. Modified in place: the user prompt is appended
        before the query and the model's reply is appended after it, so the same
        list can be reused for a follow-up question.
    prompt : str
        The full text of the user message (question followed by the paragraph).
    model_name : str
        The name of the model to be used for the query.

    RETURNS:
    tuple : A tuple containing:
        - response: The reply returned by query().
        - tokens_used: The token count returned by query().
    '''
    messages.append(UserMessage(content=prompt))
    _, response, tokens_used = query(messages, model_name)
    messages.append(AssistantMessage(content=response))
    return response, tokens_used


def analyze_paper(paper_text: str,system_message_list: list[str],question_list: list[str],model_name: str,verbose: bool = False) -> tuple[list,int]:
    '''
    Analyzes a research paper by sending a series of questions to an LLM and
    collecting the responses. Some questions are skipped based on the responses
    to earlier questions. In that case, NaN values are returned for the skipped
    questions, so the result always has one entry per question (five in total).

    Question flow:
        - Questions 0 and 1 are always asked, each in its own fresh conversation.
        - If either response contains "no", questions 2-4 are skipped.
        - Otherwise question 2 is asked as a follow-up to question 1.
        - If the response to question 2 contains "no", question 3 is asked and
          question 4 is skipped; otherwise question 3 is skipped, the second
          system message is added, and question 4 is asked.

    PARAMETERS:
    paper_text : str
        The text of the research paper to be analyzed.
    system_message_list : list
        A list of system messages to be used in the queries. Index 0 is used for
        every conversation; index 1 is added before question 4.
    question_list : list
        A list of questions to be asked about the paper (indices 0-4 are used).
    model_name : str
        The name of the model to be used for the queries, from Azure AI Foundry
    verbose : bool, optional
        If True, prints the number of tokens used. Defaults to False.

    RETURNS:
    tuple : A tuple containing:
        - result (list): Responses to questions 0-4, in order, with NaN for
          any question that was skipped.
        - count_tokens (int): The total number of tokens used in the queries.
    '''
    count_tokens = 0

    # Question 0 gets its own conversation; its history is not carried forward.
    q0_messages = [SystemMessage(content=system_message_list[0])]
    q0_response, tokens_used = _ask(q0_messages, question_list[0] + paper_text, model_name)
    count_tokens += tokens_used

    # Question 1 starts a new conversation that questions 2-4 build upon.
    messages = [SystemMessage(content=system_message_list[0])]
    q1_response, tokens_used = _ask(messages, question_list[1] + paper_text, model_name)
    count_tokens += tokens_used

    result = [q0_response, q1_response]

    # If the response to either the 0th or 1st question is No, meaning there
    # is not a reference, we don't need to ask the other questions.
    # NOTE(review): this is a substring test, so it also fires on words such as
    # "not", "none" or "diagnosis" - confirm the prompts constrain the answer
    # to a bare Yes/No before relying on it.
    first_two_response_negative = any('no' in answer.lower() for answer in result)

    if first_two_response_negative:
        # Questions 2, 3 and 4 are skipped
        result.extend([np.nan, np.nan, np.nan])
    else:
        q2_response, tokens_used = _ask(messages, question_list[2] + paper_text, model_name)
        count_tokens += tokens_used
        result.append(q2_response)

        # Q2 leads to a bifurcation
        if 'no' in q2_response.strip().lower():
            q3_response, tokens_used = _ask(messages, question_list[3] + paper_text, model_name)
            count_tokens += tokens_used
            # Question 3 answered, question 4 skipped
            result.extend([q3_response, np.nan])
        else:
            # Add a new system message for the multi-reference papers
            messages.append(SystemMessage(content=system_message_list[1]))
            q4_response, tokens_used = _ask(messages, question_list[4] + paper_text, model_name)
            count_tokens += tokens_used
            # Question 3 skipped, question 4 answered
            result.extend([np.nan, q4_response])

    if verbose:
        print(f"This paper required {count_tokens} tokens")
    return result, count_tokens

## Full pipeline to pull paper from PMC and analyze it with the LLM

In [0]:
# Read in file of PMC IDs, then save PMC ID values to a list
pmc_id_df = pd.read_csv("YOUR_PATH_HERE",header=0)
pmc_id_list = pmc_id_df['PMCID'].tolist()

# Column layout shared by every row of the results table
result_columns = ['pmc','paragraph','included_references','q0','q1','q2','q3','q4','tokens_used']

# Rows are collected in a plain list and converted to a DataFrame once, after
# the loop. This avoids re-copying the whole table on every pd.concat call.
# If the loop is interrupted, the rows gathered so far remain in result_rows.
result_rows = []

for query_id in pmc_id_list:
    try:
        soup = fetchPMCxml(query_id,entrez_key)

        # Find references
        references = find_charlson_references(soup,search_criteria)

        # Replace references in the body
        soup_up_ref = replace_references_in_body(soup, references)

        # Parse into paragraphs
        paragraph_dictionary = parse_pubmed_paragraph(soup_up_ref)

        # Get the list of dictionaries with matching text
        matching_paragraphs = filter_paragraphs(paragraph_dictionary, charlson_search_terms)

        # Some texts will not include references. If there are no matching
        # paragraphs, we do not need to analyze this article with the LLM,
        # but we still record one row for it so we know that is the case.
        # References found without any matching paragraph are saved too,
        # since that combination is worth checking as a possible error.
        if not matching_paragraphs:
            found_references = list(references.values())
            result_rows.append(
                [query_id, np.nan, found_references if found_references else np.nan]
                + [np.nan] * 6
            )
        else:
            # For each matching paragraph, analyze it with the LLM
            for y in matching_paragraphs:
                pmcid = y['pmc']
                current_text = y['text']
                current_references = list(references.values())
                result, tokens_for_paragraph = analyze_paper(current_text,system_message_list,question_list,model_name,verbose=False)

                # One row per analyzed paragraph
                result_rows.append([pmcid, current_text, current_references] + result + [tokens_for_paragraph])
    except Exception as e:
        # Best-effort batch run: report the problem, mark this article with an
        # ERROR row, and continue with the next PMC ID.
        print('Error:', e)
        result_rows.append([query_id, 'ERROR', 'ERROR'] + [np.nan] * 6)

# Build the DataFrame that stores the responses
result_df = pd.DataFrame(result_rows, columns=result_columns)

result_df.head()

In [0]:
# Save the results DataFrame to a CSV file.
# Replace 'YOUR_PATH_HERE' with the desired output path before running.
# No `index` argument is passed, so pandas' default handling of the row index applies.
result_df.to_csv('YOUR_PATH_HERE')
