In [None]:
import os
import logging
from tqdm import tqdm
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer, util

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def ollama_llm(prompt):
    """Send ``prompt`` to the module-level ``llm`` and return the stripped reply.

    Args:
        prompt: Text passed unchanged to ``llm.invoke``.

    Returns:
        The model response with surrounding whitespace removed, or an empty
        string if invoking the model (or stripping its reply) raised.
    """
    try:
        return llm.invoke(prompt).strip()
    except Exception as err:
        # Best effort: callers treat "" as "no answer", so log and carry on.
        logging.error(f"Error in ollama_llm: {err}")
        return ""

def load_and_parse_pdf(pdf_path):
    """Parse a PDF into a list of elements with ``UnstructuredPDFLoader``.

    The loader is run with ``mode="elements"`` and ``strategy="hi_res"``.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        The list returned by the loader, or an empty list if loading or
        parsing raised (the error is logged).
    """
    try:
        logging.info(f"Processing PDF: {pdf_path}")
        parsed = UnstructuredPDFLoader(
            pdf_path,
            mode="elements",
            strategy="hi_res",
        ).load()
        logging.info(f"Parsed {len(parsed)} elements from the PDF.")
        return parsed
    except Exception as err:
        logging.error(f"Error loading and parsing PDF: {err}")
        return []

def clean_text(text):
    """Replace every newline in ``text`` with a space and trim both ends."""
    return " ".join(text.split("\n")).strip()


def extract_content(elements):
    """Split parsed PDF elements into narrative text and table text.

    Elements whose ``metadata['category']`` is ``'NarrativeText'`` or
    ``'Title'`` contribute to the text content; ``'Table'`` elements
    contribute to the table content. Elements with any other category, or
    with no category at all, are ignored rather than aborting the whole
    extraction.

    Args:
        elements: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (a string).

    Returns:
        A ``(text_content, table_content)`` tuple. Each part is the cleaned
        element contents joined by blank lines. ``("", "")`` is returned if
        extraction raised (the error is logged).
    """
    text_categories = ('NarrativeText', 'Title')
    try:
        text_parts = []
        table_parts = []
        for el in elements:
            # A missing category must not discard the rest of the document,
            # so look it up tolerantly instead of indexing the mapping.
            metadata = getattr(el, 'metadata', None) or {}
            category = metadata.get('category')
            if category in text_categories:
                text_parts.append(clean_text(el.page_content))
            elif category == 'Table':
                table_parts.append(clean_text(el.page_content))
        logging.info(f"Extracted {len(text_parts)} text elements and {len(table_parts)} table elements.")
        return "\n\n".join(text_parts), "\n\n".join(table_parts)
    except Exception as e:
        logging.error(f"Error extracting content: {e}")
        return "", ""

def create_vectorstore(context):
    """Build a FAISS vector store holding ``context`` as one document.

    Embeddings come from ``OllamaEmbeddings`` with the ``mxbai-embed-large``
    model. The whole ``context`` string is wrapped in a single ``Document``.

    Args:
        context: Text to embed and index.

    Returns:
        The FAISS vector store, or ``None`` if building it raised (the error
        is logged).
    """
    try:
        logging.info("Creating embeddings and vector store...")
        embedder = OllamaEmbeddings(model='mxbai-embed-large')
        store = FAISS.from_documents([Document(page_content=context)], embedder)
        logging.info("Vector store created.")
        return store
    except Exception as err:
        logging.error(f"Error creating vector store: {err}")
        return None

# Loaded SentenceTransformer instances keyed by model name, so the model is
# loaded once per session instead of once per rerank call.
_RERANK_MODEL_CACHE = {}


def _get_rerank_model(model_name='paraphrase-MiniLM-L6-v2'):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    model = _RERANK_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _RERANK_MODEL_CACHE[model_name] = model
    return model


def rerank_documents(query, documents, top_k=4):
    """Order ``documents`` by cosine similarity to ``query`` and keep the best.

    Both the query and each document's ``page_content`` are encoded with the
    ``paraphrase-MiniLM-L6-v2`` sentence-transformer model.

    Args:
        query: Query text.
        documents: Objects exposing a ``page_content`` string.
        top_k: Maximum number of documents to return.

    Returns:
        Up to ``top_k`` documents, most similar first. An empty list is
        returned when ``documents`` is empty or when reranking raised (the
        error is logged).
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; avoid loading the model or encoding an empty batch.
        return []
    try:
        model = _get_rerank_model()
        query_embedding = model.encode(query, convert_to_tensor=True)
        doc_embeddings = model.encode(
            [doc.page_content for doc in documents],
            convert_to_tensor=True,
        )
        # Row 0 holds the similarity of the single query to every document.
        scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
        ranked_docs = sorted(
            zip(documents, scores),
            key=lambda pair: float(pair[1]),
            reverse=True,
        )
        return [doc for doc, _ in ranked_docs[:top_k]]
    except Exception as e:
        logging.error(f"Error reranking documents: {e}")
        return []


## Prompt Setting

Adjust the prompt template below to suit the model you choose.


In [None]:
def generate_final_answer(context):
    """Ask the LLM to extract enzyme kinetics data from ``context``.

    The context is embedded in a fixed prompt that lists the required output
    fields, and the prompt is sent through ``ollama_llm``.

    Args:
        context: Scientific text (and table text) to extract data from.

    Returns:
        Whatever ``ollama_llm`` returns for the prompt, or an empty string if
        building or sending the prompt raised (the error is logged).
    """
    try:
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""
        You are an expert in enzyme kinetics. Based on the provided scientific context, extract detailed enzyme kinetics data.
        Ensure the data is accurate and strictly follows the specified format.
        Do not include any extraneous information. Provide the information in a clear and structured format as specified below.

        Context: {context}

        Required Format:
        Enzyme name: [Enzyme name]
        EC Number: [EC Number] OR N/A
        Organism: [Organism Name] OR N/A
        Substrate: [Substrate Name] OR N/A
        Type: [Wild-type OR Mutant (Specify Mutation)]
        Protein Identifier: [UniProt ID OR NCBI ID]
        Specific Activity: [Value] OR N/A
        KM Value: [Value in mM] OR N/A
        Kcat Value: [Value per second] OR N/A
        kcat/KM: [Value in mM^-1s^-1] OR N/A
        pI Value: [Value]
        pH Optimum: [Value]
        Temperature Optimum: [Value in Celsius]
        Molecular Weight: [Value in kDa]
        Reaction pH: [Value] OR N/A
        Reaction Temperature: [Value in Celsius] OR N/A
        Buffer Solution: [Buffer used in the assay]
        Activators: [substances that increase activity, or 'N/A']
        Inhibitors: [substances that decrease activity, or 'N/A']

        Provide only the requested information in the specified format. No additional text or explanations.
        """
        return ollama_llm(prompt)
    except Exception as err:
        logging.error(f"Error generating final answer: {err}")
        return ""

def generate_query():
    """Return the fixed retrieval query describing the kinetics fields to find.

    The returned string keeps its leading newline and four-space indentation,
    and ends with a newline followed by four spaces.
    """
    return (
        "\n"
        "    Extract detailed enzyme kinetics data including enzyme name, EC number, organism, substrate, type (wild-type or mutant), protein identifier, specific activity, KM value, Kcat value, kcat/KM, pI value, pH optimum, temperature optimum, molecular weight, reaction pH, reaction temperature, buffer solution, activators, and inhibitors.\n"
        "    "
    )


In [None]:
def generate_answer(pdf_path):
    """Run the full extraction pipeline for one PDF and return the LLM answer.

    Steps: parse the PDF, split it into text and table content, index the
    combined content in a vector store, retrieve and rerank the documents
    matching the fixed query, then ask the LLM to fill in the kinetics
    template from the retrieved context plus the table content.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The LLM answer, or an empty string if any stage produced nothing
        (no elements, no content, no vector store, no reranked documents).
    """
    elements = load_and_parse_pdf(pdf_path)
    if not elements:
        return ""

    text_content, table_content = extract_content(elements)
    if not text_content and not table_content:
        return ""

    vectorstore = create_vectorstore(text_content + "\n\n" + table_content)
    # Compare against None explicitly: the helper signals failure with None,
    # and a store object's truthiness is not something to rely on.
    if vectorstore is None:
        return ""

    query = generate_query()
    docs = vectorstore.similarity_search(query, k=4)
    reranked_docs = rerank_documents(query, docs, top_k=3)
    if not reranked_docs:
        return ""

    combined_context = "\n\n".join(doc.page_content for doc in reranked_docs)
    # The indexed content already includes the tables, so a retrieved document
    # may contain them verbatim. Append the tables only when they are missing,
    # to avoid sending the same table text to the LLM twice.
    if table_content and table_content not in combined_context:
        combined_context += "\n\n" + table_content

    logging.info("Generating final answer using LLM...")
    final_answer = generate_final_answer(combined_context)
    logging.info("Final answer generated.")

    return final_answer

def _move_to_problematic(pdf_path, problematic_folder):
    """Move ``pdf_path`` into ``problematic_folder`` without overwriting existing files."""
    import shutil  # local import: only needed on the failure path

    stem, ext = os.path.splitext(os.path.basename(pdf_path))
    destination = os.path.join(problematic_folder, stem + ext)
    if os.path.abspath(destination) == os.path.abspath(pdf_path):
        # Already sitting in the problematic folder; nothing to move.
        return

    # Two failing PDFs from different sub-folders can share a basename; pick
    # a free name instead of clobbering the earlier file.
    counter = 1
    while os.path.exists(destination):
        destination = os.path.join(problematic_folder, f"{stem}_{counter}{ext}")
        counter += 1

    try:
        # shutil.move also works when source and destination are on different
        # filesystems, where os.rename fails.
        shutil.move(pdf_path, destination)
    except OSError as move_error:
        # A file that cannot be moved must not abort the rest of the batch.
        logging.error(f"Could not move {pdf_path} to {destination}: {move_error}")


def process_folder(folder_path, output_file, log_file, problematic_folder,
                   error_log_file="error_log.txt"):
    """Process every PDF under ``folder_path`` and append the answers to a file.

    PDFs already listed in ``log_file`` are skipped, so an interrupted run can
    be resumed. For each remaining PDF the answer from ``generate_answer`` is
    appended to ``output_file`` under a ``PMID:`` header taken from the part of
    the file name before the first underscore, and the path is appended to
    ``log_file``. A PDF that yields no answer or raises is recorded in
    ``error_log_file`` and moved into ``problematic_folder``.

    Args:
        folder_path: Root folder searched recursively for ``.pdf`` files
            (extension matched case-insensitively).
        output_file: Text file the answers are appended to.
        log_file: Text file listing one processed PDF path per line.
        problematic_folder: Folder that receives PDFs that failed; created if
            missing. If it lies inside ``folder_path`` it is not searched.
        error_log_file: Text file that failure messages are appended to.

    Returns:
        None.
    """
    os.makedirs(problematic_folder, exist_ok=True)

    # A set gives O(1) membership checks for the resume logic below.
    processed_files = set()
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="utf-8") as f:
            processed_files = set(f.read().splitlines())

    quarantine_root = os.path.abspath(problematic_folder)
    pdf_paths = []
    for root, dirs, files in os.walk(folder_path):
        # Prune the problematic folder so previously failed PDFs are not
        # picked up again when it is nested inside folder_path.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != quarantine_root
        ]
        pdf_paths.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(".pdf")
        )

    with tqdm(total=len(pdf_paths), unit="file") as pbar:
        for pdf_path in pdf_paths:
            if pdf_path not in processed_files:
                try:
                    logging.info(f"Processing file: {pdf_path}")
                    answer_text = generate_answer(pdf_path)
                    if not answer_text:
                        raise ValueError("No answer generated")
                    # Drop the extension first so "12345.pdf" yields "12345",
                    # the same as "12345_title.pdf".
                    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
                    pmid = file_stem.split('_')[0]
                    # The answer may contain non-ASCII symbols, so the
                    # encoding is fixed instead of platform-dependent.
                    with open(output_file, "a", encoding="utf-8") as f:
                        f.write(f"PMID: {pmid}\n{answer_text}\n\n")
                    with open(log_file, "a", encoding="utf-8") as f:
                        f.write(f"{pdf_path}\n")
                    processed_files.add(pdf_path)
                except Exception as e:
                    logging.error(f"Error processing {pdf_path}: {e}")
                    with open(error_log_file, "a", encoding="utf-8") as error_log:
                        error_log.write(f"Error processing {pdf_path}: {e}\n")
                    _move_to_problematic(pdf_path, problematic_folder)
            # Advance for skipped files too, otherwise the bar never reaches
            # its total on a resumed run.
            pbar.update(1)

## Set the model, folders, and output files

In [None]:
# Input and output locations for the batch run.
folder_path = "path/to/pdf/folder/"
problematic_folder = "problematic_files/"
output_file = "output.txt"
log_file = "processed_log.txt"

# ollama_llm() reads this module-level name, so it must exist before the run.
llm = Ollama(model="llama3")

process_folder(
    folder_path=folder_path,
    output_file=output_file,
    log_file=log_file,
    problematic_folder=problematic_folder,
)