## Failed Attempt

I tried several approaches in this notebook, but unfortunately none of them worked as expected. If you want to run this notebook, you may need to install additional packages such as pydantic, NLTK, and TensorRT, depending on your device.


In [None]:
from tqdm import tqdm
import os
import pdfplumber
import gc
import unstructured
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from typing import Any, List
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
from langchain_community.llms import Ollama
from langchain.retrievers import MultiVectorRetriever
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.schema.runnable import RunnablePassthrough
import uuid
import logging
from contextlib import closing
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import Runnable
import chromadb
from chromadb.config import Settings
class DictRunnable(Runnable):
    """Runnable that ignores its input and always yields a fixed dictionary.

    Args:
        input_dict: The object returned by every ``invoke`` call.
    """

    def __init__(self, input_dict):
        self.input_dict = input_dict

    def invoke(self, input=None, config=None, **kwargs):
        """Return the stored dictionary, disregarding all arguments.

        The parameter names mirror ``Runnable.invoke(input, config)`` so the
        object can be called positionally by a chain; both are optional so
        the previous ``invoke()`` / ``invoke(**kwargs)`` call forms keep
        working.
        """
        return self.input_dict

In [None]:
# Module-level LLM handle; summarize_element and ollama_llm both read this
# global name rather than receiving the model as an argument.
llm = Ollama(model="llama2:latest")

In [None]:
class Element(BaseModel):
    """One piece of content pulled from a PDF, tagged with its kind."""

    # Kind label; this file assigns "text", "table" or "image".
    type: str
    # Payload: a string for text/table entries, the raw partition element
    # for image entries.
    content: Any

def extract_elements_from_pdf(pdf_path):
    """Partition a PDF and sort the resulting chunks into text/table/image.

    Args:
        pdf_path: Path of the PDF file handed to ``partition_pdf``.

    Returns:
        A ``(text_elements, table_elements, image_elements)`` tuple of
        ``Element`` lists. Text and table entries carry ``str(element)`` as
        content; image entries carry the partition element object itself.
    """
    raw_pdf_elements = partition_pdf(
        filename=pdf_path,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=1000,
        new_after_n_chars=900,
        combine_text_under_n_chars=50,
        image_output_dir_path="image/",
    )

    text_elements = []
    table_elements = []
    image_elements = []

    element_types = unstructured.documents.elements
    for element in raw_pdf_elements:
        # Table and Image derive from Text in unstructured, so the specific
        # types must be tested first; checking Text first would route every
        # table and image into the text bucket.
        if isinstance(element, element_types.Table):
            table_elements.append(Element(type="table", content=str(element)))
        elif isinstance(element, element_types.Image):
            image_elements.append(Element(type="image", content=element))
        elif isinstance(element, element_types.Text):
            text_elements.append(Element(type="text", content=str(element)))

    return text_elements, table_elements, image_elements

def summarize_element(element):
    """Ask the module-level ``llm`` for a concise summary of one element.

    Args:
        element: Object exposing ``type`` and ``content`` attributes.

    Returns:
        The model's reply as a string (via ``StrOutputParser``).
    """
    # The element data is supplied as template *variables* instead of being
    # spliced into the template source: PDF text is arbitrary and may contain
    # brace sequences that a template engine would try to interpret.
    prompt = ChatPromptTemplate.from_template(
        """
    Be concise and contain all the information of {element_type}: {content}
    """
    )
    summarize_chain = prompt | llm | StrOutputParser()
    response = summarize_chain.invoke(
        {"element_type": element.type, "content": element.content}
    )
    return response
    
def _index_summaries(retriever, elements, summarize, id_key):
    """Summarise *elements* and store summaries and originals under shared ids."""
    if not elements:
        # Nothing to add; also avoids handing an empty batch to the stores.
        return
    summaries = [summarize(element) for element in elements]
    doc_ids = [str(uuid.uuid4()) for _ in elements]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for summary, doc_id in zip(summaries, doc_ids)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(
        [(doc_id, element.content) for doc_id, element in zip(doc_ids, elements)]
    )


def build_retriever_with_summaries(text_elements, table_elements, image_elements , pdf_path):
    """Build a MultiVectorRetriever indexed by LLM summaries of the elements.

    Each element's summary goes into the Chroma vector store and its original
    content into the in-memory docstore; the two are linked by a generated
    UUID stored under the ``doc_id`` metadata key.

    Args:
        text_elements: Elements of type "text"; may be empty.
        table_elements: Elements of type "table"; may be empty.
        image_elements: Elements of type "image"; may be empty.
        pdf_path: Source PDF path. The part of its file name before the first
            underscore is used as the Chroma collection name.

    Returns:
        The populated ``MultiVectorRetriever``.
    """
    id_key = "doc_id"
    pdf_filename = os.path.basename(pdf_path)
    collection_name = pdf_filename.split("_")[0]
    retriever = MultiVectorRetriever(
        vectorstore=Chroma(collection_name=collection_name, embedding_function=OllamaEmbeddings(model='nomic-embed-text')),
        docstore=InMemoryStore(),
        id_key=id_key,
    )

    _index_summaries(retriever, text_elements, summarize_element, id_key)
    _index_summaries(retriever, table_elements, summarize_element, id_key)
    # No summarize_image function is defined in this notebook, so images go
    # through the same text summariser as the other element kinds.
    _index_summaries(retriever, image_elements, summarize_element, id_key)

    logging.info("检索器构建完成")
    return retriever

In [None]:
# Show INFO-level records so the progress messages logged by the processing
# functions are visible.
logging.basicConfig(level=logging.INFO)

def process_file(pdf_path, question, output_file, log_file):
    """Run the full extract → index → query pipeline for one PDF.

    Args:
        pdf_path: Path of the PDF to process.
        question: Question forwarded to ``ollama_llm``.
        output_file: File the model's answer is appended to.
        log_file: File the PDF path is appended to once processing succeeds,
            so that later runs can skip it.
    """
    logging.info(f"正在处理文件: {pdf_path}")
    text_elements, table_elements, image_elements = extract_elements_from_pdf(pdf_path)
    logging.info(f"已提取 {len(text_elements)} 个文本元素, {len(table_elements)} 个表格元素, 和 {len(image_elements)} 个图片元素")
    retriever = build_retriever_with_summaries(text_elements, table_elements, image_elements, pdf_path)
    logging.info("已构建带有摘要的检索器")

    response = ollama_llm(question, retriever)

    # The answer format asks for symbols such as "⁻¹" and "°", which many
    # locale-default encodings cannot represent; write UTF-8 explicitly.
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(response + "\n\n")

    logging.info("文件处理完成")
    # The log keeps the default encoding so it stays readable by the code
    # that loads it back with a plain open(log_file, "r").
    with open(log_file, "a") as f:
        f.write(f"{pdf_path}\n")
    gc.collect()

def process_folder(folder_path, question, output_file, log_file):
    """Process every ``.pdf`` under *folder_path* that is not yet logged.

    Args:
        folder_path: Directory walked recursively for files ending in ".pdf".
        question: Question forwarded to ``process_file``.
        output_file: Answer file forwarded to ``process_file``.
        log_file: File listing one already-processed PDF path per line; it is
            read here if it exists and forwarded to ``process_file``.
    """
    # A set gives constant-time membership tests while iterating the PDFs.
    processed_files = set()
    if os.path.exists(log_file):
        with open(log_file, "r") as f:
            processed_files = set(f.read().splitlines())

    pdf_paths = [
        os.path.join(root, file)
        for root, dirs, files in os.walk(folder_path)
        for file in files
        if file.endswith(".pdf")
    ]
    pbar = tqdm(pdf_paths, desc="总体进度")

    for pdf_path in pbar:
        if pdf_path not in processed_files:
            process_file(pdf_path, question, output_file, log_file)
        pbar.refresh()

In [None]:
def ollama_llm(question, retriever):
    """Answer *question* from retrieved context using the module-level ``llm``.

    Args:
        question: The user's question. A literal ``{context}`` marker inside
            it (as older callers supply) is replaced by a reference to the
            context section of the prompt.
        retriever: Object exposing ``get_relevant_documents(question)``.

    Returns:
        The model's answer as returned by ``LLMChain.run``.
    """
    # The template is static and receives the question and context as
    # variables. Interpolating the question into the template source would
    # make any brace in it a template field, and the retrieved context would
    # reach the model only if the caller happened to embed "{context}".
    template = """
    Question: {question}

    Context:
    {context}

    Omit information not present in the context.
    Convert units for consistency. Separate entries with a clear delineation.
    Each datablock seprated by enzyme-substrate ，organism and type .
    Give the source of each data entry .
    Extract and format information about all enzyme-substrate pair mentioned in the context, following this structure:
    
    Enzyme name: 
    EC number:( or'N/A')
    Organism:( or'N/A')
    Substrate:( or'N/A')
    Type: (wild-type or mutant, specify mutation)
    Protein Identifier: (UniProt Accession Number or NCBI ID)
    Specific activity: (or Vmax)
    KM Value: (in mM, or 'N/A')
    Kcat Value: (per second, or 'N/A')
    kcat/KM: (in mM⁻¹s⁻¹, or 'N/A')
    pI Value: 
    pH Optimum:
    Temperature Optimum: (in °C)
    Molecular Weight: (in kDa)
    Reaction pH: ( or'N/A')
    Reaction Temperature: (in °C, or 'N/A')
    Buffer Solution: ( or'N/A')
    """
    prompt = PromptTemplate(template=template, input_variables=["question", "context"])
    chain = LLMChain(llm=llm, prompt=prompt)

    relevant_docs = retriever.get_relevant_documents(question)
    # Retrieved items may be plain strings or objects; use page_content when
    # an item has it, otherwise fall back to its string form.
    context = "\n\n".join(
        str(getattr(doc, "page_content", doc)) for doc in relevant_docs
    )

    # Keep callers that embed a "{context}" marker in the question working.
    question_text = question.replace("{context}", "the context below")
    return chain.run(question=question_text, context=context)

In [None]:
# Directory that process_folder walks recursively for ".pdf" files.
folder_path ="/path/to/pdf/folder/"
# ollama_llm interpolates this text into its prompt template source, so the
# literal "{context}" below ends up as a placeholder in that template.
question = "Extract detailed information about all enzymes mentioned and experimental conditions from {context} correctly. "
# Relative paths: both files are created in the current working directory.
output_file = "output.txt"
log_file = "processed_files.log"
# Process every PDF not already listed in log_file, appending answers to output_file.
process_folder(folder_path, question, output_file, log_file)

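## After running the code, you should see output.txt in the current working directory (it is written to the relative path set in `output_file`), alongside processed_files.log, which lists the PDFs that have already been handled.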
