In [14]:
import os
import nest_asyncio
import asyncio
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader

# Apply nest_asyncio
nest_asyncio.apply()

directory = "texts"

# Function to load PDF files
async def load_pdf(file_path):
    loader = PyPDFLoader(file_path)
    pages = []
    async for page in loader.alazy_load():
        pages.append(page)
    return pages

# Load all files
texts = {}
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if filename.endswith(".docx"):
        loader = Docx2txtLoader(file_path)
        texts[filename] = loader.load()
    elif filename.endswith(".pdf"):
        texts[filename] = asyncio.run(load_pdf(file_path))

# Print loaded texts
for filename, text in texts.items():
    print(f"Content of {filename}:")
    print(text[:500])  # Print first 500 characters of each file
    print("\n" + "-"*80 + "\n")

Content of antibiotics-08-00023.pdf:
[Document(metadata={'source': 'texts\\antibiotics-08-00023.pdf', 'page': 0}, page_content='antibiotics \nArticle\nPrevalence of Antibiotic Resistance Genes in\nMultidrug-Resistant Enterobacteriaceae on\nPortuguese Livestock Manure\nPaula Amador 1, *\n , Ruben Fernandes 2, Cristina Prudêncio 2 and Isabel Duarte 1\n1 Environment Department, Research Centre for Natural Resources, Environment and Society (CERNAS),\nCollege of Agriculture, Polytechnic of Coimbra, 3045-601 Coimbra, Portugal; iduarte@esac.pt\n2 Department Chemical Sciences and Biomolecules, School Allied Health Sciences, Polytechnic of Porto,\n4200-072 Porto, Portugal; rfernandes@ess.ipp.pt (R.F.); cprudencio@estsp.ipp.pt (C.P .)\n* Correspondence: paula_amador@esac.pt; Tel.: +351-129-802-940\nReceived: 8 January 2019; Accepted: 23 February 2019; Published: 13 March 2019\n/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045/gid00001\n/gid00048/gid00043/gid00031

In [15]:
# Combine page content and clean up the texts dictionary
for filename, documents in texts.items():
    combined_text = " ".join([doc.page_content for doc in documents])
    texts[filename] = combined_text


for filename in texts.keys():
    print(filename)

antibiotics-08-00023.pdf
IMPART, plate validation, 2022.pdf
Irrgang, GES-5, 2020.pdf
Irrgang, VIM_2019.pdf
Pauly, 19-AB01133.pdf
Pauly, Carba plate validation, 2020.pdf
Pauly,method comparison, 2021.pdf
Roschanski, VIM-1 Stall N2, 2019.pdf


In [2]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [34]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)

In [16]:
## Inclusion / Exclusion

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field

# Define your desired data structure.
class selection(BaseModel):
    isolation: str = Field(description="Answer the first question with either 'yes' or 'no'")
    method: str = Field(description="Answer the second question with either 'yes' or 'no'")
    study_type: str = Field(description="Answer the third question with either 'yes' or 'no'")

# Set up the parser
parser = JsonOutputParser(pydantic_object=selection)

# Create the prompt template
prompt = PromptTemplate(
    template="""
    Read the following study and answer these questions in JSON format:
    1. Does this study explicitly isolate strains of carbapenemase-producing bacteria?
    2. Does this study state the bacteriological method used for the isolation of the carbapenemase-producing bacteria?
    3. Does this study fall in one or multiple of the following categories: protocol validation, protocol development, method validation, method comparison, plate validation?
    {format_instructions}
    Text:{text}\n""",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Combine the prompt with the model and parser
chain = prompt | llm | parser

from langchain_community.callbacks import get_openai_callback

def process_texts(texts):
    results = {}
    with get_openai_callback() as cb:
        for filename, text in texts.items():
            result = chain.invoke({"text": text})
            results[filename] = result
        print(f"Total tokens used: {cb.total_tokens}")
        print(f"Prompt tokens: {cb.prompt_tokens}")
        print(f"Completion tokens: {cb.completion_tokens}")
        print(f"Total cost (USD): {cb.total_cost}")
    return results

# Run the function on the texts dictionary
extraction_selection = process_texts(texts)
extraction_selection

Total tokens used: 127612
Prompt tokens: 127372
Completion tokens: 240
Total cost (USD): 0.32083


{'antibiotics-08-00023.pdf': {'isolation': 'no',
  'method': 'yes',
  'study_type': 'no'},
 'IMPART, plate validation, 2022.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'yes'},
 'Irrgang, GES-5, 2020.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Irrgang, VIM_2019.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Pauly, 19-AB01133.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'},
 'Pauly, Carba plate validation, 2020.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'yes'},
 'Pauly,method comparison, 2021.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'yes'},
 'Roschanski, VIM-1 Stall N2, 2019.pdf': {'isolation': 'yes',
  'method': 'yes',
  'study_type': 'no'}}

In [None]:
def categorize_files(extraction_selection):
    accepted = []
    rejected = []
    for filename, response in extraction_selection.items():
        if (response['isolation'] == 'yes' and response['method'] == 'yes') or response['study_type'] == 'yes':
            accepted.append(filename)
        else:
            rejected.append(filename)
    return accepted, rejected

accepted, rejected = categorize_files(extraction_selection)

print("Accepted files:", accepted)
print("Rejected files:", rejected)

Accepted files: ['IMPART, plate validation, 2022.pdf', 'Irrgang, GES-5, 2020.pdf', 'Irrgang, VIM_2019.pdf', 'Pauly, 19-AB01133.pdf', 'Pauly, Carba plate validation, 2020.pdf', 'Pauly,method comparison, 2021.pdf', 'Roschanski, VIM-1 Stall N2, 2019.pdf']
Rejected files: ['antibiotics-08-00023.pdf']


In [29]:
# Create a dictionary with filename and acceptance status
acceptance_status = {filename: 'accepted' for filename in accepted}
acceptance_status.update({filename: 'rejected' for filename in rejected})

# Print the dictionary
print(acceptance_status)

{'IMPART, plate validation, 2022.pdf': 'accepted', 'Irrgang, GES-5, 2020.pdf': 'accepted', 'Irrgang, VIM_2019.pdf': 'accepted', 'Pauly, 19-AB01133.pdf': 'accepted', 'Pauly, Carba plate validation, 2020.pdf': 'accepted', 'Pauly,method comparison, 2021.pdf': 'accepted', 'Roschanski, VIM-1 Stall N2, 2019.pdf': 'accepted', 'antibiotics-08-00023.pdf': 'rejected'}


In [30]:
# Compare ground_truth_rejected with rejected and count matches

ground_truth_rejected = ['antibiotics-08-00023.pdf']

matches = set(ground_truth_rejected) & set(rejected)
match_count = len(matches)
total_count = len(ground_truth_rejected)

print(f"Matches: {matches}")
print(f"Number of matches: {match_count} of {total_count}")

Matches: {'antibiotics-08-00023.pdf'}
Number of matches: 1 of 1


In [21]:
from typing import List, Optional
from pydantic import BaseModel, Field

class CPE(BaseModel):
    """Information about studies on carbapenemase-producing bacteria"""
    country: Optional[str] = Field(default=None, description="Origin of samples.")
    type_of_study: Optional[str] = Field(default=None, description="Classify the study into: sample investigation, protocol development, method comparison, method validation. Multiple labels possible separated by comma.") #o3
    sample_type: Optional[str] = Field(default=None, description="Which sample types were used in the study? Pick from 'livestock' (eg. fecal samples, feces, caecum, manure, cattle, pigs, poultry, etc.), 'livestock environment' (eg. farm or barn environment, etc.), 'companion animal' (eg. dog, cat, etc.), 'food' (eg. meat), 'environment' (eg. water, soil, etc.) or 'aquaculture'. Multiple answers possible separated by comma.") #ok, 1 Fehler bei Pauly
    matrix_description: Optional[str] = Field(default=None, description="Matrix description of study. Which animal and sample material was used?") #ok, 1 Fehler bei Pauly
    bacterial_species: Optional[str] = Field(default=None, description="Species of bacteria. Multiple answers possible separated by comma.") #fertig, o1
    genes: Optional[str] = Field(default=None, description="Genes or carbapenemase detected in this study.") # potentially picklist, otherwise o3
    protocol_reference: Optional[str] = Field(default=None, description="Is there a reference for the protocol used? Eg. EURL-AR, DIN EN ISO, etc. Yes or No. Provide reference, if possible.") #fertig, o1
    isolation_procedure: Optional[str] = Field(default=None, description="Is the method for the isolation of carbapenemase-producing bacteria culture-based, non-culture-based or both?") #fertig, o1
    method_validation: Optional[str] = Field(default=None, description="Was the method for the isolation of carbapenemase-producing bacteria validated? Yes or no. Provide values for sensitivity and specificity.") #neu, testen
    isolate_characterization: Optional[str] = Field(default=None, description="Which methods (eg. PFGE, NGS, etc.) were used for the characterization of carbapenemase-producing bacteria? Do not include information on species identification (MALDI-ToF). Multiple answers possible separated by comma.") #fertig, o1
    MLST_isolates: Optional[str] = Field(default=None, description="What was the MLST (Multi Locus Sequence Typing) of the isolates?") #fertig, 4o
    gene_localization: Optional[str] = Field(default=None, description="Was the characterization of isolates for carbapenemase-producing bacteria based on a gene located on a plasmid and / or on the chromosome? Multiple answers possible separated by comma.") # 2 Fehler.: schauen on es nach beschränkung des textes besser wird.
    plasmid_type: Optional[str] = Field(default=None, description="If plasmid was detected, on which type of plasmid (Inc group) was the gene located?") #fertig, o1    
    plasmid_size: Optional[str] = Field(default=None, description="If plasmid was detected, what was the size (bp or kbp) of the plasmid?") #fertig, o1
    plasmid_transferable: Optional[str] = Field(default=None, description="If plasmid was detected, was the CPE-gene harbouring plasmid transferable by conjugation and / or transformation? Multiple answers possible separated by comma.") #fertig, o1
    LOD_culture_based: Optional[str] = Field(default=None, description="Was there a LOD (limit of detection) for the isolation method provided?") #fertig, 4o
    confirmation_carbapenemase: Optional[str] = Field(default=None, description="How was carbapenemase-production confirmed (eg. PCR, real time PCR, MIC, etc.)? Do not include species confirmation (MALDI-ToF). Do not include characterization of the isolates.") #müssen wir auf ergebnis warten  
    sample_dilution: Optional[str] = Field(default=None, description="Dilution of the sample for the isolation method (eg. 1:10). Do not include dilution of the enrichment.") #fertig, o1
    sample_weight: Optional[str] = Field(default=None, description="Weight of samples. If multiple samples exist, separate weights by comma.") #fertig, o1
    first_enrichtment: Optional[str] = Field(default=None, description="Was the first enrichment or preenrichment selective or non selective? If selective, which antimicrobial was used with which concentration?") #fertig, 4o
    first_enrichtment_condition: Optional[str] = Field(default=None, description="Which condition of temperature and time for first enrichment or preenrichment?") #fertig, 4o
    second_enrichment: Optional[str] = Field(default=None, description="Was an other enrichment carried out? Yes or No. If you can not find the information it is 'no'.") #fertig, 4o
    second_enrichtment_selective: Optional[str] = Field(default=None, description="Was the second enrichment selective or non selective? If selective: which antimicrobial was used with which concentration?") #fertig, 4o
    second_enrichtment_condition: Optional[str] = Field(default=None, description="Which condition of temperature and time for second enrichment?") #fertig, 4o
    selective_plate: Optional[str] = Field(default=None, description="Which selective agars where used? Provide name of antimicrobial used at which concentration for each agar plate. Multiple answers possible separated by comma.") #fertig, 4o; concentration fehlt zum teil; separieren in name of agar plate; name of antimicrobial & concentration
    ncb_method: Optional[str] = Field(default=None, description="Which non-culture-based method (eg. PCR, real time PCR, NGS, WGS etc.) for the isolation of carbapenemase-producing bacteria was used?") # Testen! Evtl mit regel arbeiten; o1 war besser
    which_step: Optional[str] = Field(default=None, description="At which step was the non-culture-based method applied? After first enrichment, after second enrichment, post-isolation.") #Neu, testen, o1


In [32]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Set up the parser
parser = JsonOutputParser(pydantic_object=CPE)

prompt = PromptTemplate(
    template=(
        "You are an expert extraction algorithm. "
        "Only extract relevant information from the text. "
        "If you do not know the value of an attribute asked to extract, "
        "return null for the attribute's value."
        "{format_instructions}\n\n"
        "Text: {text}\n"
    ),
    input_variables=["text"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    },
)

# Combine the prompt with the model and parser
chain = prompt | llm | parser

from langchain_community.callbacks import get_openai_callback

def process_texts(texts):
    results = {}
    with get_openai_callback() as cb:
        for filename, text in texts.items():
            result = chain.invoke({"text": text})
            results[filename] = result
        print(f"Total tokens used: {cb.total_tokens}")
        print(f"Prompt tokens: {cb.prompt_tokens}")
        print(f"Completion tokens: {cb.completion_tokens}")
        print(f"Total cost (USD): {cb.total_cost}")
    return results

# Run the function on the texts dictionary
extraction_cpe = process_texts(texts)

Total tokens used: 143119
Prompt tokens: 140238
Completion tokens: 2881
Total cost (USD): 0.379405


In [33]:
import pandas as pd

# Convert the extraction_cpe_o1 dictionary to a list of dictionaries suitable for DataFrame creation
data = [item for item in extraction_cpe.values()]
index = list(extraction_cpe.keys())

# Create the DataFrame
df_extraction = pd.DataFrame(data, index=index)

# Add filename and acceptance_status as the first and second columns
df_extraction.insert(0, 'filename', df_extraction.index)
df_extraction.insert(1, 'acceptance_status', df_extraction['filename'].map(acceptance_status))


df_extraction

Unnamed: 0,filename,acceptance_status,country,type_of_study,sample_type,matrix_description,bacterial_species,genes,protocol_reference,isolation_procedure,...,sample_dilution,sample_weight,first_enrichtment,first_enrichtment_condition,second_enrichment,second_enrichtment_selective,second_enrichtment_condition,selective_plate,ncb_method,which_step
antibiotics-08-00023.pdf,antibiotics-08-00023.pdf,rejected,Portugal,sample investigation,livestock,"livestock manure from poultry, pig, dairy farm...","Enterobacteriaceae, E. coli, Citrobacter freun...","catI, catII, qnrS, qnrB, oqxB, tet(A), tet(M),...",,,...,,,,,,,,,,
"IMPART, plate validation, 2022.pdf","IMPART, plate validation, 2022.pdf",accepted,Europe,method comparison,"livestock, food",spiked pig caecal and turkey meat samples,"Enterobacteriaceae, Salmonella, Escherichia co...","blaIMP, blaOXA-48, blaVIM-1, blaKPC-2, blaNDM-1",EURL-AR,culture-based,...,1:10,,non selective,37°C overnight,No,,,"Brilliance™ CRE Agar, CHROMID® CARBA, CHROMaga...",,
"Irrgang, GES-5, 2020.pdf","Irrgang, GES-5, 2020.pdf",accepted,Germany,sample investigation,livestock,fecal sample of a fattening pig,Escherichia coli,"blaGES-1, blaGES-5, blaGES-5B",EURL-AR,culture-based,...,,,selective,,no,,,,,
"Irrgang, VIM_2019.pdf","Irrgang, VIM_2019.pdf",accepted,Germany,"sample investigation, method comparison","livestock, livestock environment","fecal samples from pigs, barn surrounding envi...",Escherichia coli,blaVIM−1,EURL-AR,culture-based,...,1:10,10 g,non selective,16–20 h at 37°C,Yes,selective with 1 mg/L cefotaxime,"37°C, 44°C","chromID Carba, MacConkey agar with 1 mg/L CTX",real-time PCR,post-isolation
"Pauly, 19-AB01133.pdf","Pauly, 19-AB01133.pdf",accepted,Germany,sample investigation,"food, livestock",pork shoulder from a pig raised in Germany,Escherichia coli,"blaVIM-1, blaSHV-5, blaCMY-13",EURL-AR,culture-based,...,,,selective,37 ±2 °C for 16–18 h,Yes,selective,,McConkey agar supplemented with 0.125 mg/L MEM...,,
"Pauly, Carba plate validation, 2020.pdf","Pauly, Carba plate validation, 2020.pdf",accepted,Germany,method validation,"livestock, food","cecum content, meat","Enterobacteriaceae, Escherichia coli, Salmonel...","blaKPC, blaVIM, blaNDM, blaIMP",EURL-AR,culture-based,...,,,,,No,,,"ChromID CARBA, MacConkey agar with cefotaxime ...",,
"Pauly,method comparison, 2021.pdf","Pauly,method comparison, 2021.pdf",accepted,Germany,"method comparison, method validation",livestock,pig caecum,"E. coli, Salmonella enterica subsp. enterica","blaVIM-1, blaGES-5, blaKPC-2, blaNDM-1, blaOXA-48",EURL-AR,culture-based,...,1:10,9 g,non selective,37 ± 2 °C for 16–18 h,Yes,"selective, LB + CTX, LB + MEM",37 ± 2 °C for 16–18 h under microaerophilic co...,MacConkey Agar supplemented with 0.125 mg/L me...,real time PCR,post-isolation
"Roschanski, VIM-1 Stall N2, 2019.pdf","Roschanski, VIM-1 Stall N2, 2019.pdf",accepted,Germany,sample investigation,"livestock, livestock environment","Breeding pig farm, fecal and environmental sam...","Enterobacter cloacae, Salmonella enterica Sero...",blaVIM-1,DIN EN ISO 6579,culture-based,...,,,non selective,37°C overnight,No,,,"ChromID Carba, MacConkey with cefotaxime",,
