### **Data ingestion using langchain and chomeDB**

In [1]:
#!pip install langchain
#!pip install chromadb
#!pip install transformers
#!pip install -qU "langchain-chroma>=0.1.2"
#!pip install scholarly
#!pip install pdfkit
#!pip install    langchain pymupdf
#!pip install -U langchain-community
#!pip install --upgrade langchain chromadb
#!pip install -qU langchain-huggingface

### **Import dependencies**

In [2]:
import os, requests, pdfkit, torch, chromadb

from chromadb import Client
from transformers import AutoTokenizer, AutoModel
from langchain_chroma import Chroma
from scholarly import scholarly
from uuid import uuid4
from tqdm import tqdm
from langchain_core.documents import Document
from pathlib import Path
from langchain.document_loaders import PyMuPDFLoader

### **Innital embedding and vectorDB**

- Init chomeDB

In [3]:
chroma_client = Client()

- Initialize embedding model

In [4]:
# Load a pre-trained transformer model for embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [5]:
# Function to create embeddings
from langchain.embeddings import HuggingFaceEmbeddings

class EmbeddingFunction():
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def embed_documents(self, texts):
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embeddings.numpy()
    
    def embed_query(self, query):
        inputs = self.tokenizer([query], padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            embedding = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embedding.numpy().squeeze()

embedding_function = EmbeddingFunction(model=model, tokenizer=tokenizer)

- Integrate chomaDB with langchain

In [6]:
vector_store = Chroma(
    collection_name="rag_4_researcher_collection",
    embedding_function=embedding_function,
    persist_directory="../database/vector-db/",
)

### **Fetch data**

In [7]:
def download_pdf(url, title, output_dir="../database/document"):
    try:
        os.makedirs(output_dir, exist_ok=True)
        safe_title = "".join(c for c in title if c.isalnum() or c in (" ", "_")).rstrip()
        file_path = os.path.join(output_dir, f"{safe_title}.pdf")

        response = requests.get(url, stream=True)
        if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
            with open(file_path, 'wb') as pdf_file:
                for chunk in response.iter_content(chunk_size=1024):
                    pdf_file.write(chunk)
            print(f"Downloaded: {file_path}")
            return True
        else:
            print(f"Failed to download PDF from {url}")
            return False
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

def convert_webpage_to_pdf(url, output_dir="../database/documents", filename="converted_page"):
    filename += str(uuid4()) + ".pdf"
    try:
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, filename)

        # Convert webpage to PDF
        pdfkit.from_url(url, file_path)
        print(f"PDF successfully saved at: {file_path}")
        return file_path
    except Exception as e:
        print(f"Error converting URL to PDF: {e}")
        return str(e)


def get_scholar_urls(query, max_results=10):
    try:
        # Search for the query
        search_results = scholarly.search_pubs(query)
        urls = []
        count = 0

        for result in search_results:
            if count >= max_results:
                break
            count += 1

            # Get the title and URL
            title = result.get('bib', {}).get('title', 'No title available')
            url = result.get('eprint_url') or result.get('pub_url') or 'No URL available'

            if url != 'No URL available':
                urls.append({"title": title, "url": url})

        return urls
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [21]:
search_term = "LLM in genomics"
max_results = 10

# Get search results
results = get_scholar_urls(search_term, max_results=max_results)

# Process each result
for idx, article in tqdm(enumerate(results), desc="Scrapping ... "):
    title = article["title"]
    url = article["url"]
    #print(f"{idx+1}. {title}\nURL: {url}\n")

    # Check if the URL points to a PDF
    if "/pdf/" in url or url.endswith(".pdf"):
        download_pdf(url, title)
    # elif "sciencedirect.com" in url:
    #     print(f"Skipping ScienceDirect PDF download for now: {url}")
    #else:
    #    convert_webpage_to_pdf(url, output_dir="../database/document", filename="")

Scrapping ... : 1it [00:00,  2.26it/s]

Downloaded: ../database/document/GenoTEX A Benchmark for Evaluating LLMBased Exploration of Gene Expression Data in Alignment with Bioinformaticians.pdf


Scrapping ... : 2it [00:02,  1.65s/it]

Downloaded: ../database/document/Phenomics Assistant An Interface for LLMbased Biomedical Knowledge Graph Exploration.pdf


Scrapping ... : 3it [00:03,  1.34s/it]

Downloaded: ../database/document/GPGPT Large Language Model for GenePhenotype Mapping.pdf


Scrapping ... : 4it [00:07,  2.20s/it]

Downloaded: ../database/document/BioinfoBench A Simple Benchmark Framework for LLM Bioinformatics Skills Evaluation.pdf


Scrapping ... : 5it [00:11,  2.86s/it]

Downloaded: ../database/document/Leveraging genomic large language models to enhance causal genotypebrainclinical pathways in Alzheimers disease.pdf


Scrapping ... : 7it [00:16,  2.72s/it]

Downloaded: ../database/document/Genetic Transformer An Innovative Large Language Model Driven Approach for Rapid and Accurate Identification of Causative Variants in Rare Genetic Diseases.pdf


Scrapping ... : 8it [00:17,  2.16s/it]

Downloaded: ../database/document/An llmbased knowledge synthesis and scientific reasoning framework for biomedical discovery.pdf


Scrapping ... : 10it [00:20,  2.05s/it]

Downloaded: ../database/document/Geneverse A collection of Opensource Multimodal Large Language Models for Genomic and Proteomic Research.pdf





### **Extract Text from pdf**

In [8]:
directory_path = Path("../database/document")
pdf_files = directory_path.glob("*.pdf")
documents = []

for file_path in pdf_files:
    try:
        loader = PyMuPDFLoader(file_path=str(file_path))
        loaded_docs = loader.load()
        documents.extend(loaded_docs)
        print(f"Processed file: {file_path}")
    except Exception as e:
        print(f"Failed to process {file_path}: {e}")

print(f"Processed {len(documents)} documents.")

Processed file: ../database/document/2407.11435v1.pdf
Processed 25 documents.


In [9]:
# for chunking we will see latter

In [10]:
documents[0].model_dump().keys()

dict_keys(['id', 'metadata', 'page_content', 'type'])

In [11]:
documents[0].metadata

{'source': '../database/document/2407.11435v1.pdf',
 'file_path': '../database/document/2407.11435v1.pdf',
 'page': 0,
 'total_pages': 25,
 'format': 'PDF 1.5',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'LaTeX with hyperref',
 'producer': 'pdfTeX-1.40.25',
 'creationDate': 'D:20240717002846Z',
 'modDate': 'D:20240717002846Z',
 'trapped': ''}

In [12]:
uuids = [str(uuid4()) for _ in range(len(documents))]
vector_store.add_documents(documents=documents, ids=uuids)

['87c71949-68f2-4b70-97aa-652669ee92be',
 '0495e4c0-6cad-408e-84bc-63fbf2b83be5',
 '5e3c8eaf-3766-418f-846f-3e7554ffb8dc',
 'd747312a-3240-4041-a088-a813f416d8ec',
 '5c96395c-7c5c-4734-b826-4273c7e68aad',
 'fe3e580c-542e-4d6c-aa29-932c4f9174e8',
 'a1a04faa-4bbb-40d3-b91a-5d832a61d5ec',
 'cbd1d0d7-5ef3-42ec-8f21-8a57c98bbe4b',
 'b2390c6a-5823-4caa-a448-8f95786c6dce',
 '5599221b-d0d8-4bf0-96b5-4cfd95465bc2',
 'f7ec33bb-3cba-4864-ae65-ab00ef122f72',
 '129f9678-d321-4130-8646-cbece1fef10c',
 '7bd4957f-f6a0-4415-8f4e-10d33da7c993',
 'b9b05df1-6057-405f-9570-89a56e29aedc',
 '0e3c868b-3072-4a0b-9f09-ec477f27552b',
 'c3db0347-7471-4dce-a4fe-dcb5e84c2530',
 '333a4ce5-e9dc-4985-93e8-4131b50fe044',
 'ec73c1de-8125-47f0-a0bd-fc7b6b3f33c3',
 '47646f1f-46e6-429f-b51d-5a86a2fab1f4',
 '21a1cdba-e9f7-41ae-8594-f8dd060a9fdb',
 'a5cc8a19-b1dc-4d25-a2c1-814a7562c262',
 '8f25fcdd-bb26-400b-aa54-9293752d3e5c',
 '21178bfa-b952-493a-b79c-26f35a7a04d9',
 '739a55c8-eab0-4699-803e-190b0d50f74b',
 '91659de5-a28c-

- similarity search

In [13]:
results = vector_store.similarity_search(
    "LLM for genes prediction",
    k=2,
    #filter={"source": "tweet"},
)
for res in results:
    #print(f"* {res.page_content} [{res.metadata}]")
    print(f"* [{res.metadata}]")

* [{'author': '', 'creationDate': 'D:20240717002846Z', 'creator': 'LaTeX with hyperref', 'file_path': '../database/document/2407.11435v1.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': 'D:20240717002846Z', 'page': 4, 'producer': 'pdfTeX-1.40.25', 'source': '../database/document/2407.11435v1.pdf', 'subject': '', 'title': '', 'total_pages': 25, 'trapped': ''}]
* [{'author': '', 'creationDate': 'D:20240717002846Z', 'creator': 'LaTeX with hyperref', 'file_path': '../database/document/2407.11435v1.pdf', 'format': 'PDF 1.5', 'keywords': '', 'modDate': 'D:20240717002846Z', 'page': 10, 'producer': 'pdfTeX-1.40.25', 'source': '../database/document/2407.11435v1.pdf', 'subject': '', 'title': '', 'total_pages': 25, 'trapped': ''}]


In [14]:
# END