# Rag Literature

This Jupyter notebook contains the code and documentation for the Rag Literature project, which uses retrieval-augmented generation (RAG) to extract enzyme data from PDF papers.
To start, import the necessary libraries:


In [None]:
import os
import pdfplumber
import shutil
import gc
from tqdm import tqdm
from datetime import datetime
from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Each page's text is followed by a newline. Pages for which
    ``extract_text()`` returns a falsy value (``None`` or ``""``) are skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        # Consume the generators while the PDF is still open.
        page_texts = (page.extract_text() for page in document.pages)
        return ''.join(f"{content}\n" for content in page_texts if content)

class SimpleDocument:
    """Minimal document container with ``page_content`` and ``metadata``.

    Args:
        page_content: The document text.
        metadata: Optional mapping of extra information about the document
            (for example its source path). It is copied, so the instance
            never shares a dict with the caller. Defaults to an empty dict.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Copy so that mutating one instance's metadata cannot leak into the
        # caller's dict or into another instance.
        self.metadata = dict(metadata) if metadata is not None else {}

def integrate_self_query_retriever(question, context):
    """Return *context* prefixed with the fixed "self-querying" banner.

    *question* is accepted for interface compatibility but is not used by
    the current implementation.
    """
    return "Enhanced context based on self-querying logic: {}".format(context)



## Prompt Setting

Adjust the prompt according to the model you choose.


In [None]:

def ollama_llm(question, enhanced_context):
    """Build the extraction prompt and return the model's reply.

    The prompt embeds *question* and *enhanced_context*, followed by the
    list of per-enzyme fields the model is asked to fill in. It is sent via
    the module-level ``llm`` object, which must exist before this is called.
    """
    prompt = f"""
    Question: {question}
    Context: {enhanced_context}
    Omit information not present in the context.
    Convert units for consistency. Separate entries with a clear delineation.
    Each datablock seprated by enzyme-substrate ，organism and type .
    Give the source of each data entry .
    Extract and format information about all enzyme-substrate pair mentioned in the context, following this structure:
    
    Enzyme name: 
    EC number:( or'N/A')
    Organism:( or'N/A')
    Substrate:( or'N/A')
    Type: (wild-type or mutant, specify mutation)
    Protein Identifier: (UniProt Accession Number or NCBI ID)
    Specific activity: (or Vmax)
    KM Value: (in mM, or 'N/A')
    Kcat Value: (per second, or 'N/A')
    kcat/KM: (in mM⁻¹s⁻¹, or 'N/A')
    pI Value: 
    pH Optimum:
    Temperature Optimum: (in °C)
    Molecular Weight: (in kDa)
    Reaction pH: ( or'N/A')
    Reaction Temperature: (in °C, or 'N/A')
    Buffer Solution: ( or'N/A')
    
    """
    return llm.invoke(prompt)


In [None]:

def rag_chain(question, pdf_text, pdf_path, *, chunk_size=2500,
              chunk_overlap=500, top_k=4, embedding_model='nomic-embed-text'):
    """Answer *question* from *pdf_text* using retrieval plus the LLM prompt.

    The text is split into overlapping chunks, embedded into an in-memory
    FAISS index, and the ``top_k`` chunks most similar to *question* are
    joined into the context handed to ``ollama_llm``.

    Args:
        question: The query used both for retrieval and in the prompt.
        pdf_text: Full text of the PDF.
        pdf_path: Path of the PDF; used only to identify the file in errors.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        top_k: Number of chunks retrieved from the index.
        embedding_model: Name of the Ollama embedding model.

    Returns:
        Whatever ``ollama_llm`` returns for the assembled context.

    Raises:
        ValueError: If splitting *pdf_text* yields no chunks (e.g. a scanned
            PDF with no extractable text).
    """
    source_doc = SimpleDocument(pdf_text)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    splits = text_splitter.split_documents([source_doc])
    if not splits:
        # NOTE(review): an empty chunk list is expected to fail inside FAISS
        # index construction with an unhelpful error; fail early with a
        # message that names the file instead. Confirm against the installed
        # langchain_community version.
        raise ValueError(
            f"No text chunks could be produced for {pdf_path!r}; "
            "the PDF may contain no extractable text."
        )

    embeddings = OllamaEmbeddings(model=embedding_model)
    vectorstore = FAISS.from_documents(splits, embeddings)
    docs = vectorstore.similarity_search(question, k=top_k)
    context = "\n\n".join(chunk.page_content for chunk in docs)

    enhanced_context = integrate_self_query_retriever(question, context)
    return ollama_llm(question, enhanced_context)

def process_folder(folder_path, question, output_file, log_file, problematic_folder):
    """Run the RAG extraction over every ``.pdf`` file under *folder_path*.

    Results are appended to *output_file*; each successfully handled path is
    appended to *log_file* so that a later run skips it. A PDF that raises
    during text extraction or the RAG call is reported, copied into
    *problematic_folder*, and left out of the log so it is retried next run.

    Args:
        folder_path: Root directory that is walked recursively for PDFs.
        question: Query forwarded to ``rag_chain`` for every file.
        output_file: Text file the answers are appended to (UTF-8).
        log_file: Text file listing already-processed paths, one per line
            (UTF-8).
        problematic_folder: Directory that receives copies of failing PDFs;
            created if missing.
    """
    os.makedirs(problematic_folder, exist_ok=True)

    # A set gives O(1) "already processed?" checks in the loop below.
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="utf-8") as f:
            processed_files = set(f.read().splitlines())
    else:
        processed_files = set()

    pdf_paths = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.endswith(".pdf")
    ]

    with tqdm(total=len(pdf_paths), unit="file") as pbar:
        for pdf_path in pdf_paths:
            if pdf_path not in processed_files:
                try:
                    pdf_text = extract_text_from_pdf(pdf_path)
                    answer_text = rag_chain(question, pdf_text, pdf_path).strip()
                except Exception as exc:
                    # Per-file boundary: one bad PDF must not abort the batch.
                    tqdm.write(f"Failed to process {pdf_path}: {exc}")
                    try:
                        shutil.copy2(pdf_path, problematic_folder)
                    except shutil.SameFileError:
                        # The file already lives in the problematic folder.
                        pass
                else:
                    # "pmid" identifies the paper: the part of the file name
                    # before the first underscore. Adjust if your files are
                    # named differently.
                    pmid = os.path.basename(pdf_path).split('_')[0]
                    # Explicit UTF-8: model output may contain characters
                    # (e.g. superscripts) that a platform default cannot encode.
                    with open(output_file, "a", encoding="utf-8") as f:
                        f.write(f"PMID: {pmid}\n{answer_text}\n\n")

                    with open(log_file, "a", encoding="utf-8") as f:
                        f.write(f"{pdf_path}\n")
                    processed_files.add(pdf_path)

            # Advance for skipped and failed files too, so the bar reaches
            # its total on resumed runs.
            pbar.update(1)

## Set the model and folder


In [None]:
# Model used by ollama_llm() through the module-level name `llm`.
llm = Ollama(model="llama2")

# Input folder and the question sent with every PDF.
folder_path = "/path/to/pdf/folder/"
question = "Extract detailed information about all enzymes mentioned and experimental conditions from context correctly."

# Output and bookkeeping locations.
output_file = "output.txt"
log_file = "processed_log.txt"
problematic_folder = "problematic_files/"

process_folder(
    folder_path=folder_path,
    question=question,
    output_file=output_file,
    log_file=log_file,
    problematic_folder=problematic_folder,
)