In [None]:
!pip install --upgrade --quiet langchain langchain-openai

In [1]:
import os
import nest_asyncio
import asyncio
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader

# Apply nest_asyncio
nest_asyncio.apply()

directory = "texts"

# Function to load PDF files
async def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)
    return pages

# Load all files
texts = {}
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if filename.endswith(".docx"):
        loader = Docx2txtLoader(file_path)
        texts[filename] = loader.load()
    elif filename.endswith(".pdf"):
        texts[filename] = asyncio.run(load_pdf(file_path))

In [2]:
# Combine page content and clean up the texts dictionary
for filename, documents in texts.items():
    combined_text = " ".join([doc.page_content for doc in documents])
    texts[filename] = combined_text


for filename in texts.keys():
    print(filename)

antibiotics-08-00023.pdf
IMPART, plate validation, 2022.pdf
Irrgang, GES-5, 2020.pdf
Irrgang, VIM_2019.pdf
Pauly, 19-AB01133.pdf
Pauly, Carba plate validation, 2020.pdf
Pauly,method comparison, 2021.pdf
Roschanski, VIM-1 Stall N2, 2019.pdf


In [3]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [4]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)
llm_o3 = ChatOpenAI(model="o3-mini")

In [5]:
## Inclusion / Exclusion

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field

# Define your desired data structure.
class selection(BaseModel):
    isolation: str = Field(description="Answer the first question with either 'yes' or 'no'")
    method: str = Field(description="Answer the second question with either 'yes' or 'no'")
    study_type: str = Field(description="Answer the third question with either 'yes' or 'no'")

# Set up the parser
parser = JsonOutputParser(pydantic_object=selection)

# Create the prompt template
prompt = PromptTemplate(
    template="""
    Read the following study and answer these questions in JSON format:
    1. Does this study explicitly isolate strains of carbapenemase-producing bacteria?
    2. Does this study state the bacteriological method used for the isolation of the carbapenemase-producing bacteria?
    3. Does this study fall in one or multiple of the following categories: protocol validation, protocol development, method validation, method comparison, plate validation?
    {format_instructions}
    Text:{text}\n""",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Combine the prompt with the model and parser
chain = prompt | llm | parser

from langchain_community.callbacks import get_openai_callback

def process_texts(texts):
    results = {}
    with get_openai_callback() as cb:
        for filename, text in texts.items():
            result = chain.invoke({"text": text})
            results[filename] = result
        print(f"Total tokens used: {cb.total_tokens}")
        print(f"Prompt tokens: {cb.prompt_tokens}")
        print(f"Completion tokens: {cb.completion_tokens}")
        print(f"Total cost (USD): {cb.total_cost}")
    return results

# Run the function on the texts dictionary
extraction_selection = process_texts(texts)
extraction_selection

Total tokens used: 127613
Prompt tokens: 127372
Completion tokens: 241
Total cost (USD): 0.32084


{'antibiotics-08-00023.pdf': {'isolation': 'no',
  'method': 'yes',
  'study_type': 'no'},
 'IMPART, plate validation, 2022.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'yes'},
 'Irrgang, GES-5, 2020.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Irrgang, VIM_2019.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Pauly, 19-AB01133.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Pauly, Carba plate validation, 2020.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'yes'},
 'Pauly,method comparison, 2021.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'method comparison'},
 'Roschanski, VIM-1 Stall N2, 2019.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'}}

In [6]:
def categorize_files(extraction_selection):
    accepted = []
    rejected = []
    for filename, response in extraction_selection.items():
        if (response['isolation'] == 'yes' and response['method'] == 'yes') or response['study_type'] == 'yes':
            accepted.append(filename)
        else:
            rejected.append(filename)
    return accepted, rejected

accepted, rejected = categorize_files(extraction_selection)

print("Accepted files:", accepted)
print("Rejected files:", rejected)

Accepted files: ['IMPART, plate validation, 2022.pdf', 'Irrgang, GES-5, 2020.pdf', 'Irrgang, VIM_2019.pdf', 'Pauly, 19-AB01133.pdf', 'Pauly, Carba plate validation, 2020.pdf', 'Pauly,method comparison, 2021.pdf', 'Roschanski, VIM-1 Stall N2, 2019.pdf']
Rejected files: ['antibiotics-08-00023.pdf']


In [7]:
# Create a dictionary with filename and acceptance status
acceptance_status = {filename: 'accepted' for filename in accepted}
acceptance_status.update({filename: 'rejected' for filename in rejected})

# Print the dictionary
print(acceptance_status)

{'IMPART, plate validation, 2022.pdf': 'accepted', 'Irrgang, GES-5, 2020.pdf': 'accepted', 'Irrgang, VIM_2019.pdf': 'accepted', 'Pauly, 19-AB01133.pdf': 'accepted', 'Pauly, Carba plate validation, 2020.pdf': 'accepted', 'Pauly,method comparison, 2021.pdf': 'accepted', 'Roschanski, VIM-1 Stall N2, 2019.pdf': 'accepted', 'antibiotics-08-00023.pdf': 'rejected'}


In [8]:
# Compare ground_truth_rejected with rejected and count matches

ground_truth_rejected = ['antibiotics-08-00023.pdf']

matches = set(ground_truth_rejected) & set(rejected)
match_count = len(matches)
total_count = len(ground_truth_rejected)

print(f"Matches: {matches}")
print(f"Number of matches: {match_count} of {total_count}")

Matches: {'antibiotics-08-00023.pdf'}
Number of matches: 1 of 1


In [None]:
from typing import List, Optional
from pydantic import BaseModel, Field

class CPE(BaseModel):
    """Information about studies on carbapenemase-producing bacteria"""
    country: Optional[str] = Field(default=None, description="Origin of samples.")
    type_of_study: Optional[str] = Field(default=None, description="Classify the study into: sample investigation, protocol development, method comparison, method validation. Multiple labels possible separated by comma.")
    sample_type: Optional[str] = Field(default=None, description="Which sample types were used in the study? Pick from 'livestock' (eg. fecal samples, feces, caecum, manure, cattle, pigs, poultry, etc.), 'livestock environment' (eg. farm or barn environment, etc.), 'companion animal' (eg. dog, cat, etc.), 'food' (eg. meat), 'environment' (eg. water, soil, etc.) or 'aquaculture'. Multiple answers possible separated by comma.")
    matrix_description: Optional[str] = Field(default=None, description="Matrix description of study. Which animal and sample material was used?")
    bacterial_species: Optional[str] = Field(default=None, description="Species of bacteria. Multiple answers possible separated by comma.")
    genes: Optional[str] = Field(default=None, description="Genes or carbapenemase detected in this study.")
    protocol_reference: Optional[str] = Field(default=None, description="Is there a reference for the protocol used? Eg. EURL-AR, DIN EN ISO, etc. Yes or No. Provide reference, if possible.")
    isolation_procedure: Optional[str] = Field(default=None, description="Is the method for the isolation of carbapenemase-producing bacteria culture-based, non-culture-based or both?")
    method_validation: Optional[str] = Field(default=None, description="Was the method for the isolation of carbapenemase-producing bacteria validated? Yes or no. Provide values for sensitivity and specificity.")
    isolate_characterization: Optional[str] = Field(default=None, description="Which methods (eg. PFGE, NGS, etc.) were used for the characterization of carbapenemase-producing bacteria? Do not include information on species identification (MALDI-ToF). Multiple answers possible separated by comma.")
    MLST_isolates: Optional[str] = Field(default=None, description="What was the MLST (Multi Locus Sequence Typing) of the isolates?")
    gene_localization: Optional[str] = Field(default=None, description="Was the characterization of isolates for carbapenemase-producing bacteria based on a gene located on a plasmid and / or on the chromosome? Multiple answers possible separated by comma.")
    plasmid_type: Optional[str] = Field(default=None, description="If plasmid was detected, on which type of plasmid (Inc group) was the gene located?")   
    plasmid_size: Optional[str] = Field(default=None, description="If plasmid was detected, what was the size (bp or kbp) of the plasmid?")
    plasmid_transferable: Optional[str] = Field(default=None, description="If plasmid was detected, was the CPE-gene harbouring plasmid transferable by conjugation and / or transformation? Multiple answers possible separated by comma.")
    LOD_culture_based: Optional[str] = Field(default=None, description="Was there a LOD (limit of detection) for the isolation method provided?")
    confirmation_carbapenemase: Optional[str] = Field(default=None, description="How was carbapenemase-production confirmed (eg. PCR, real time PCR, MIC, etc.)? Do not include species confirmation (MALDI-ToF). Do not include characterization of the isolates.")  
    sample_dilution: Optional[str] = Field(default=None, description="Dilution of the sample for the isolation method (eg. 1:10). Do not include dilution of the enrichment.")
    sample_weight: Optional[str] = Field(default=None, description="Weight of samples. If multiple samples exist, separate weights by comma.")
    first_enrichtment: Optional[str] = Field(default=None, description="Was the first enrichment or preenrichment selective or non selective? If selective, which antimicrobial was used with which concentration?")
    first_enrichtment_condition: Optional[str] = Field(default=None, description="Which condition of temperature and time for first enrichment or preenrichment?")
    second_enrichment: Optional[str] = Field(default=None, description="Was an other enrichment carried out? Yes or No. If you can not find the information it is 'no'.")
    second_enrichtment_selective: Optional[str] = Field(default=None, description="Was the second enrichment selective or non selective? If selective: which antimicrobial was used with which concentration?")
    second_enrichtment_condition: Optional[str] = Field(default=None, description="Which condition of temperature and time for second enrichment?")
    selective_plate: Optional[str] = Field(default=None, description="Which selective agars where used? Provide name of antimicrobial used at which concentration for each agar plate. Multiple answers possible separated by comma.")
    ncb_method: Optional[str] = Field(default=None, description="Which non-culture-based method (eg. PCR, real time PCR, NGS, WGS etc.) for the isolation of carbapenemase-producing bacteria was used?") 
    which_step: Optional[str] = Field(default=None, description="At which step was the non-culture-based method applied? After first enrichment, after second enrichment, post-isolation.")


In [10]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Set up the parser
parser = JsonOutputParser(pydantic_object=CPE)

prompt = PromptTemplate(
    template=(
        "You are an expert extraction algorithm. "
        "Only extract relevant information from the text. "
        "If you do not know the value of an attribute asked to extract, "
        "return null for the attribute's value."
        "{format_instructions}\n\n"
        "Text: {text}\n"
    ),
    input_variables=["text"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    },
)

# Combine the prompt with the model and parser
chain = prompt | llm_o3 | parser

from langchain_community.callbacks import get_openai_callback

def process_texts(texts):
    results = {}
    with get_openai_callback() as cb:
        for filename, text in texts.items():
            result = chain.invoke({"text": text})
            results[filename] = result
        print(f"Total tokens used: {cb.total_tokens}")
        print(f"Prompt tokens: {cb.prompt_tokens}")
        print(f"Completion tokens: {cb.completion_tokens}")
        print(f"Total cost (USD): {cb.total_cost}")
    return results

# Run the function on the texts dictionary
extraction_cpe = process_texts(texts)

Total tokens used: 166028
Prompt tokens: 140230
Completion tokens: 25798
Total cost (USD): 0.0


In [11]:
import pandas as pd

# Convert the extraction_cpe_o1 dictionary to a list of dictionaries suitable for DataFrame creation
data = [item for item in extraction_cpe.values()]
index = list(extraction_cpe.keys())

# Create the DataFrame
df_extraction = pd.DataFrame(data, index=index)

# Add filename and acceptance_status as the first and second columns
df_extraction.insert(0, 'filename', df_extraction.index)
df_extraction.insert(1, 'acceptance_status', df_extraction['filename'].map(acceptance_status))


df_extraction

Unnamed: 0,filename,acceptance_status,country,type_of_study,sample_type,matrix_description,bacterial_species,genes,protocol_reference,isolation_procedure,...,sample_dilution,sample_weight,first_enrichtment,first_enrichtment_condition,second_enrichment,second_enrichtment_selective,second_enrichtment_condition,selective_plate,ncb_method,which_step
antibiotics-08-00023.pdf,antibiotics-08-00023.pdf,rejected,Portugal,Prevalence study of multidrug‐resistant Entero...,"Livestock manure, slurry, and slaughterhouse w...","Samples collected from dairy cattle, pig, and ...","Enterobacteriaceae including Escherichia coli,...",Multiple resistance genes were screened includ...,Protocols referenced include those described i...,Decimal dilutions prepared in sterile 0.9% NaC...,...,Decimal dilutions in sterile 0.9% NaCl,,,,no,,,VRBG (Violet Red Bile Glucose) selective agar,Multiplex PCR assays for screening of resistan...,Non‐culture based detection applied post-isola...
"IMPART, plate validation, 2022.pdf","IMPART, plate validation, 2022.pdf",accepted,"France, Norway, Sweden, Denmark, Germany, Neth...","multicenter evaluation study, method comparison",livestock (pig caecal content) and food (SPF t...,Pig caecal samples from pigs and meat samples ...,"Escherichia coli, Klebsiella pneumoniae, Salmo...","blaIMP, blaOXA-48, blaVIM-1, blaNDM-1, blaKPC-...",EURL-AR protocol (EU Decision 2013/652/EU),culture-based isolation after non‐selective pr...,...,1:10 dilution in buffered peptone water,,non-selective pre-enrichment in buffered pepto...,overnight incubation at 37°C,no,,,"Brilliance™ CRE Agar, CHROMID® CARBA Agar, CHR...",PCR and real-time PCR,post-isolation
"Irrgang, GES-5, 2020.pdf","Irrgang, GES-5, 2020.pdf",accepted,Germany,sample investigation,livestock,fecal sample from a fattening pig,Escherichia coli,"blaGES-1, blaGES-5, blaGES-5B",EURL-AR,culture-based,...,,,,,no,,,,,
"Irrgang, VIM_2019.pdf","Irrgang, VIM_2019.pdf",accepted,Germany,"sample investigation, protocol development","livestock, livestock environment","Pig fecal samples (composite fecal material), ...",Escherichia coli,blaVIM-1,EURL-AR,Modified culture-dependent method combining un...,...,1:10,10 g,non-selective pre-enrichment,16-20 h at 37°C,yes,selective,24 h at 44°C,"chromID R-Carba, MacConkey agar with CTX",real-time PCR,applied on the enrichment broth
"Pauly, 19-AB01133.pdf","Pauly, 19-AB01133.pdf",accepted,Germany,sample investigation,food,Pork shoulder meat sample from a fattened pig,Escherichia coli,"blaVIM-1, blaSHV-5, blaCMY-13",European Union Reference Laboratory for Antimi...,culture-based,...,,,,,yes,selective with CTX and MEM,16–18 h at 37±2°C (plate incubation),"McConkey agar with 0.125 mg/L meropenem, McCon...",,
"Pauly, Carba plate validation, 2020.pdf","Pauly, Carba plate validation, 2020.pdf",accepted,Germany,"method comparison, method validation","livestock, food",cecum content and meat,"Escherichia coli, Salmonella, Vibrio, Klebsiella","blaVIM-1, blaNDM-1, blaKPC-2",EURL-AR protocol,culture-based,...,"1:10,000",,,,,,,"ChromID R⃝CARBA agar, MacConkey agar with 1 mg...",,
"Pauly,method comparison, 2021.pdf","Pauly,method comparison, 2021.pdf",accepted,Germany,"protocol development, method comparison",livestock,Pig caeca,Escherichia coli,"blaVIM-1, blaGES-5, blaKPC-2, blaNDM-1, blaOXA-48","EURL-AR protocol for isolation of ESBL, AmpC a...",culture-based,...,1:10,9 g,non-selective,37°C for 16–18 hours,yes,selective using LB+CTX and LB+MEM,37±2°C for 16–18 hours under microaerophilic c...,"ChromID® CARBA SMART agar, McC + CTX+MEM, McC ...",Real-time PCR,After enrichment steps (both first and second ...
"Roschanski, VIM-1 Stall N2, 2019.pdf","Roschanski, VIM-1 Stall N2, 2019.pdf",accepted,Germany,sample investigation,"fecal, environmental, boot swabs, manure, dust","Pig farm; samples from feces, boot swabs, manu...","Enterobacter cloacae, Salmonella enterica sero...",blaVIM-1,Modified DIN EN ISO 6579 protocol; EURL-AR gui...,Culture-based isolation using selective preenr...,...,,"20 g (feces), 5 g (manure)",non selective preenrichment,overnight incubation at 37°C,no,,,"ChromID Carba, MacConkey agar with cefotaxime",Real-time PCR,Applied to preenrichment cultures
