### **Data ingestion using langchain and chomeDB**

In [1]:
#!pip install langchain
#!pip install chromadb
#!pip install transformers
#!pip install -qU "langchain-chroma>=0.1.2"
#!pip install scholarly
#!pip install pdfkit
#!pip install langchain pymupdf
#!pip install -U langchain-community
#!pip install --upgrade langchain chromadb
#!pip install -qU langchain-huggingface

### **Import dependencies**

In [2]:
import os, requests, pdfkit, torch, chromadb

from chromadb import Client
from transformers import AutoTokenizer, AutoModel
from langchain_chroma import Chroma
from scholarly import scholarly
from uuid import uuid4
from tqdm import tqdm
from langchain_core.documents import Document
from pathlib import Path
from langchain.document_loaders import PyMuPDFLoader

### **Innital embedding and vectorDB**

- Init chomeDB

In [3]:
chroma_client = Client()

- Initialize embedding model

In [4]:
# Load a pre-trained transformer model for embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [16]:
# Function to create embeddings
from langchain.embeddings import HuggingFaceEmbeddings

class EmbeddingFunction():
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings.numpy()
    
    def embed_query(self, query):
        inputs = self.tokenizer([query], padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            embedding = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embedding.numpy().squeeze()

embedding_function = EmbeddingFunction(model=model, tokenizer=tokenizer)

- Integrate chomaDB with langchain

In [17]:
vector_store = Chroma(
    collection_name="rag_4_researcher_collection",
    embedding_function=embedding_function,
    persist_directory="../database/vector-db/",
)

### **Fetch data**

In [7]:
def download_pdf(url, title, output_dir="../database/document"):
    try:
        os.makedirs(output_dir, exist_ok=True)
        safe_title = "".join(c for c in title if c.isalnum() or c in (" ", "_")).rstrip()
        file_path = os.path.join(output_dir, f"{safe_title}.pdf")

        response = requests.get(url, stream=True)
        if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
            with open(file_path, 'wb') as pdf_file:
                for chunk in response.iter_content(chunk_size=1024):
                    pdf_file.write(chunk)
            print(f"Downloaded: {file_path}")
            return True
        else:
            print(f"Failed to download PDF from {url}")
            return False
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

def convert_webpage_to_pdf(url, output_dir="../database/documents", filename="converted_page"):
    filename += str(uuid4()) + ".pdf"
    try:
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, filename)

        # Convert webpage to PDF
        pdfkit.from_url(url, file_path)
        print(f"PDF successfully saved at: {file_path}")
        return file_path
    except Exception as e:
        print(f"Error converting URL to PDF: {e}")
        return str(e)


def get_scholar_urls(query, max_results=10):
    try:
        # Search for the query
        search_results = scholarly.search_pubs(query)
        urls = []
        count = 0

        for result in search_results:
            if count >= max_results:
                break
            count += 1

            # Get the title and URL
            title = result.get('bib', {}).get('title', 'No title available')
            url = result.get('eprint_url') or result.get('pub_url') or 'No URL available'

            if url != 'No URL available':
                urls.append({"title": title, "url": url})

        return urls
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [21]:
search_term = "LLM in genomics"
max_results = 10

# Get search results
results = get_scholar_urls(search_term, max_results=max_results)

# Process each result
for idx, article in tqdm(enumerate(results), desc="Scrapping ... "):
    title = article["title"]
    url = article["url"]
    #print(f"{idx+1}. {title}\nURL: {url}\n")

    # Check if the URL points to a PDF
    if "/pdf/" in url or url.endswith(".pdf"):
        download_pdf(url, title)
    # elif "sciencedirect.com" in url:
    #     print(f"Skipping ScienceDirect PDF download for now: {url}")
    #else:
    #    convert_webpage_to_pdf(url, output_dir="../database/document", filename="")

Scrapping ... : 1it [00:00,  2.26it/s]

Downloaded: ../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf


Scrapping ... : 2it [00:02,  1.65s/it]

Downloaded: ../database/document/Phenomics Assistant An Interface for LLMbased Biomedical Knowledge Graph Exploration.pdf


Scrapping ... : 3it [00:03,  1.34s/it]

Downloaded: ../database/document/GPGPT Large Language Model for GenePhenotype Mapping.pdf


Scrapping ... : 4it [00:07,  2.20s/it]

Downloaded: ../database/document/BioinfoBench A Simple Benchmark Framework for LLM Bioinformatics Skills Evaluation.pdf


Scrapping ... : 5it [00:11,  2.86s/it]

Downloaded: ../database/document/Leveraging genomic large language models to enhance causal genotypebrainclinical pathways in Alzheimers disease.pdf


Scrapping ... : 7it [00:16,  2.72s/it]

Downloaded: ../database/document/Genetic Transformer An Innovative Large Language Model Driven Approach for Rapid and Accurate Identification of Causative Variants in Rare Genetic Diseases.pdf


Scrapping ... : 8it [00:17,  2.16s/it]

Downloaded: ../database/document/An llmbased knowledge synthesis and scientific reasoning framework for biomedical discovery.pdf


Scrapping ... : 10it [00:20,  2.05s/it]

Downloaded: ../database/document/Geneverse A collection of Opensource Multimodal Large Language Models for Genomic and Proteomic Research.pdf





### **Extract Text from pdf**

In [8]:
directory_path = Path("../database/document")
pdf_files = directory_path.glob("*.pdf")
documents = []

for file_path in pdf_files:
    try:
        loader = PyMuPDFLoader(file_path=str(file_path))
        loaded_docs = loader.load()
        documents.extend(loaded_docs)
        print(f"Processed file: {file_path}")
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")

print(f"Processed {len(documents)} documents.")

Processed file: ../database/document/Genetic Transformer An Innovative Large Language Model Driven Approach for Rapid and Accurate Identification of Causative Variants in Rare Genetic Diseases.pdf
Processed file: ../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf
Processed file: ../database/document/Geneverse A collection of Opensource Multimodal Large Language Models for Genomic and Proteomic Research.pdf
Processed file: ../database/document/Leveraging genomic large language models to enhance causal genotypebrainclinical pathways in Alzheimers disease.pdf
Processed file: ../database/document/Phenomics Assistant An Interface for LLMbased Biomedical Knowledge Graph Exploration.pdf
Processed file: ../database/document/GPGPT Large Language Model for GenePhenotype Mapping.pdf
Processed file: ../database/document/An llmbased knowledge synthesis and scientific reasoning framework for biomedical discover

In [9]:
# for chunking we will see latter

In [10]:
documents[0].model_dump().keys()

dict_keys(['id', 'metadata', 'page_content', 'type'])

In [14]:
documents[50].metadata

{'source': '../database/document/Geneverse A collection of Opensource Multimodal Large Language Models for Genomic and Proteomic Research.pdf',
 'file_path': '../database/document/Geneverse A collection of Opensource Multimodal Large Language Models for Genomic and Proteomic Research.pdf',
 'page': 7,
 'total_pages': 17,
 'format': 'PDF 1.5',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'LaTeX with hyperref',
 'producer': 'pdfTeX-1.40.25',
 'creationDate': 'D:20240625000512Z',
 'modDate': 'D:20240625000512Z',
 'trapped': ''}

In [12]:
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

['c5b7db8b-8da2-485c-af22-a2e0851c4d15',
 '48d49738-80c1-445a-9d73-774ab8da9e1c',
 'c5cb35fc-45a9-49eb-8f8a-0e325bb30d13',
 '78341470-3704-484d-8ce3-bf73c144afed',
 'ed08836f-41ac-437a-9fea-cbda3b7c103b',
 'a79e0eee-eee4-478f-bf32-d51c4a781460',
 '2c743f6f-d61b-4984-ab0c-9c71f044a7f8',
 'd2faf862-0bec-4690-a6f5-8a932743b261',
 '020252a7-9e0d-4934-87ab-9bdb5ef83389',
 '88628658-112f-494f-8196-bc458a6594e3',
 '59e8eb3f-963f-4f09-be29-1656a6051f1c',
 '80133f0d-a6b5-441b-add5-8320a559558b',
 '2ae000d7-b49a-4694-b405-cfc37ae8b377',
 '153a886f-971f-49d0-a4e5-df45e2eae966',
 '7db99cca-9691-42a2-aeaa-ef6a32f339fa',
 '53a3cd0d-30c1-4790-940c-96fdd54136a4',
 '6533931a-dc1f-44bb-afc3-038b55b3e337',
 'a38c65fe-0db2-4e57-a898-41cadf876891',
 '23823771-0503-48ea-a841-6f08af7f58fd',
 '474a92cc-a407-4c76-8b44-d82655950d4f',
 'ca3d4a52-2628-4efb-a58d-6630bbfab748',
 'da400d26-85d5-4b74-9e34-3dbbfbb4a846',
 'd9312c01-b8c1-470a-a6b7-5bddf205bdb5',
 '8edb2088-0424-40a4-8dd7-02510eb92a9e',
 '6efc0424-182c-

- similarity search

In [19]:
results = vector_store.similarity_search(
    "LLM for genes prediction",
    k=2,
    #filter={"source": "tweet"},
)
for res in results:
    #print(f"* {res.page_content} [{res.metadata}]")
    print(f"* [{res.metadata}]")

* [{'author': '', 'creationDate': 'D:20240624005248Z', 'creator': 'LaTeX with hyperref', 'file_path': '../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': 'D:20240624005248Z', 'page': 7, 'producer': 'pdfTeX-1.40.25', 'source': '../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf', 'subject': '', 'title': '', 'total_pages': 25, 'trapped': ''}]
* [{'author': '', 'creationDate': 'D:20240624005248Z', 'creator': 'LaTeX with hyperref', 'file_path': '../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': 'D:20240624005248Z', 'page': 8, 'producer': 'pdfTeX-1.40.25', 'source': '../database/document/GenoTEX A Benchmark for Evaluating LLMB

In [20]:
!pip install ollama

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ollama
  Downloading ollama-0.4.0-py3-none-any.whl.metadata (4.8 kB)
Downloading ollama-0.4.0-py3-none-any.whl (12 kB)
Installing collected packages: ollama
Successfully installed ollama-0.4.0


In [None]:
# END