# Web Scraping and RAG Together

In [1]:
from BRAD import brad
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb
import subprocess
import logging
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Initialize the Database

In [2]:
docsPath='papers/'
dbName='database/'
dbPath='databases/'
HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5'
chunk_size=[700]
chunk_overlap=[200]
v=True

In [3]:
dbPath   += dbName

local = os.getcwd()  ## Get local dir
os.chdir(local)      ## shift the work dir to local dir
print('\nWork Directory: {}'.format(local)) if v else None

#%% Phase 1 - Load DB
embeddings_model = HuggingFaceEmbeddings(model_name=HuggingFaceEmbeddingsModel)
print("\nDocuments loading from: 'str(docsPath)") if v else None
text_loader_kwargs={'autodetect_encoding': True}
loader = DirectoryLoader(docsPath,
                         glob="**/*.pdf",
                         loader_cls=PyPDFLoader, 
                         #loader_kwargs=text_loader_kwargs,
                         show_progress=True,
                         use_multithreading=True)
docs_data = loader.load()
print('\nDocuments loaded...') if v else None

for i in range(len(chunk_size)):
    for j in range(len(chunk_overlap)):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size[i],
                                                        chunk_overlap = chunk_overlap[j],
                                                        separators=[" ", ",", "\n", ". "])
        data_splits = text_splitter.split_documents(docs_data)
        print("Documents split into chunks...") if v else None
        print("Initializing Chroma Database...") if v else None

        dbName = "DB_cosine_cSize_%d_cOver_%d" %(chunk_size[i], chunk_overlap[j])

        p2_2 = subprocess.run('mkdir  %s/*'%(dbPath+dbName), shell=True)
        _client_settings = chromadb.PersistentClient(path=(dbPath+dbName))

        vectordb = Chroma.from_documents(documents           = data_splits,
                                         embedding           = embeddings_model,
                                         client              = _client_settings,
                                         collection_name     = dbName,
                                         collection_metadata = {"hnsw:space": "cosine"})
print("Database built")


Work Directory: /home/jpic/RAG-DEV/tutorials/scrape-rag





Documents loading from: 'str(docsPath)


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]
mkdir: cannot create directory ‘databases/database/DB_cosine_cSize_700_cOver_200/94928492-f19b-4f7f-9a11-d4af8598b1f2’: File exists
mkdir: cannot create directory ‘databases/database/DB_cosine_cSize_700_cOver_200/chroma.sqlite3’: File exists



Documents loaded...
Documents split into chunks...
Initializing Chroma Database...
Database built


## Scratch

In [5]:
# Define paths and parameters
# dbPath = "/path/to/database/"
# dbName = "DB_cosine_cSize_512_cOver_128"  # Example name
new_docs_path = "downloaded"

# Load the new document
text_loader_kwargs = {'autodetect_encoding': True}
new_loader = DirectoryLoader(new_docs_path,
                             glob="**/*.pdf",
                             loader_cls=PyPDFLoader,
                             #loader_kwargs=text_loader_kwargs,
                             show_progress=True,
                             use_multithreading=True)
new_docs_data = new_loader.load()
print('\nNew documents loaded...')

# Split the new document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size[0],
                                                chunk_overlap=chunk_overlap[0],
                                                separators=[" ", ",", "\n", ". "])
new_data_splits = text_splitter.split_documents(new_docs_data)
print("New document split into chunks...")

100%|██████████| 1/1 [00:00<00:00,  1.99it/s]


New documents loaded...
New document split into chunks...





In [9]:
len(vectordb.get()['ids'])

83

In [10]:
new_data_splits
docs, meta = [], []
for doc in new_data_splits:
    docs.append(doc.page_content)
    meta.append(doc.metadata)

In [17]:
x = 5
def up(x):
    x = x + 1

up(x)
print(x)

5


In [19]:
new_data_splits[0]

Document(page_content='Y E A S T A N D C A N C E R \nN o b el L e ct ur e, D e c e m b er 9, 2 0 0 1 \nby \nLE L A N D H. H A R T W E L L \nFr e d  H ut c hi ns o n  C a n c er  R es e ar c h  C e nt er,  1 0 0  F airvi e w  Av e n u e  N ort h,  \nD 1- 0 6 0 S e attl e, W A 9 8 1 0 9- 2 0 2 4, U S A. \nMy r es e ar c h c ar e er h as b e e n m otiv at e d by a d esir e t o u n d erst a n d c a n c er. \nE a c h ti m e I h av e i d e nti ﬁ e d a n i ntri g ui n g as p e ct of t h e c a n c er pr o bl e m, I \nh av e f o u n d t h at it c o ul d b e a p pr o a c h e d m or e eff e ctiv ely i n t h e si m pl er e u k a- \nry oti c c ell, S acc h aro myces cere visi ae , t h a n t h e h u m a n c ell. E a c h ti m e t h e y e', metadata={'source': 'downloaded/Yeast and cancer.pdf', 'page': 0})

In [13]:
vectordb.add_texts(texts = docs,
                   meta      = meta) #[0], ids=['7'])

['7c173c95-a5da-442c-982b-4db666123870',
 '6d4cd016-0a2f-4c12-9400-bceaf432527a',
 '2fb14b76-a6ba-4754-9bf3-1c9f505e3b79',
 '4273c310-092d-4dbb-9183-8c1d937668f2',
 'a00f2673-a5f0-4935-ad78-e08ab89d7390',
 'cb133c72-2fdb-4824-8010-20fe19386680',
 '78d6b79f-d32e-4c13-9d99-d6795fb58e00',
 'e109c039-9c56-4ff0-b3cc-177ab0254cbc',
 'f520e355-e4b9-481e-8ba3-25c2f2580bcf',
 '2e5e4c78-cd2d-4c83-b354-59836ddfc594',
 '40c8545b-050d-42cf-995d-b0c9bd7ee631',
 '2d9ba0a7-315c-40cb-8322-10a5971ef18c',
 'deeb70dc-01f3-4795-a1df-314deb939b4f',
 '13a1a9ce-62a7-44f8-9d5e-bdbf3f36eda2',
 '97bdeb5f-11c7-4ad0-80f5-0ce7591d6e94',
 'a507583d-c9ed-416d-8854-741b8417e3fb',
 '7ca080f2-98bd-4aa9-9104-9b6c73fb20ed',
 'a9f97ab0-8949-4367-8f86-efcbdc625de4',
 'c07483b0-dbf5-4222-bb92-f764defe1e6b',
 'f6df35a7-4220-43fc-b162-ba56cfc2d2df',
 '3ac0c783-0df8-4b30-8f13-72c90d1985ad',
 '4f375a92-77ec-4b5d-8f0a-891a74eaff5f',
 'f064dd1a-f960-40ee-af77-777613f082c4',
 'dd2f0744-133b-4be6-a890-09526487b640',
 '2d8e1bde-a081-

In [15]:
len(vectordb.get()['ids'])

208

In [25]:
help(vectordb)

Help on Chroma in module langchain_community.vectorstores.chroma object:

class Chroma(langchain_core.vectorstores.VectorStore)
 |  Chroma(collection_name: 'str' = 'langchain', embedding_function: 'Optional[Embeddings]' = None, persist_directory: 'Optional[str]' = None, client_settings: 'Optional[chromadb.config.Settings]' = None, collection_metadata: 'Optional[Dict]' = None, client: 'Optional[chromadb.Client]' = None, relevance_score_fn: 'Optional[Callable[[float], float]]' = None) -> 'None'
 |  
 |  `ChromaDB` vector store.
 |  
 |  To use, you should have the ``chromadb`` python package installed.
 |  
 |  Example:
 |      .. code-block:: python
 |  
 |              from langchain_community.vectorstores import Chroma
 |              from langchain_community.embeddings.openai import OpenAIEmbeddings
 |  
 |              embeddings = OpenAIEmbeddings()
 |              vectorstore = Chroma("langchain_store", embeddings)
 |  
 |  Method resolution order:
 |      Chroma
 |      langchain

In [11]:
len(vectordb.get()['ids'])

83

In [9]:
vectordb.add_documents(documents=new_data_splits)#,
#                       embedding=embeddings_model)
print("New document added to the database")

ValueError: You must provide an embedding function to compute embeddings.https://docs.trychroma.com/embeddings

In [10]:
new_data_splits

[Document(page_content='Y E A S T A N D C A N C E R \nN o b el L e ct ur e, D e c e m b er 9, 2 0 0 1 \nby \nLE L A N D H. H A R T W E L L \nFr e d  H ut c hi ns o n  C a n c er  R es e ar c h  C e nt er,  1 0 0  F airvi e w  Av e n u e  N ort h,  \nD 1- 0 6 0 S e attl e, W A 9 8 1 0 9- 2 0 2 4, U S A. \nMy r es e ar c h c ar e er h as b e e n m otiv at e d by a d esir e t o u n d erst a n d c a n c er. \nE a c h ti m e I h av e i d e nti ﬁ e d a n i ntri g ui n g as p e ct of t h e c a n c er pr o bl e m, I \nh av e f o u n d t h at it c o ul d b e a p pr o a c h e d m or e eff e ctiv ely i n t h e si m pl er e u k a- \nry oti c c ell, S acc h aro myces cere visi ae , t h a n t h e h u m a n c ell. E a c h ti m e t h e y e', metadata={'source': 'downloaded/Yeast and cancer.pdf', 'page': 0}),
 Document(page_content='t h at it c o ul d b e a p pr o a c h e d m or e eff e ctiv ely i n t h e si m pl er e u k a- \nry oti c c ell, S acc h aro myces cere visi ae , t h a n t h e h u m a n c e

In [6]:
help(vectordb.add_documents)

Help on method add_documents in module langchain_core.vectorstores:

add_documents(documents: 'List[Document]', **kwargs: 'Any') -> 'List[str]' method of langchain_community.vectorstores.chroma.Chroma instance
    Run more documents through the embeddings and add to the vectorstore.
    
    Args:
        documents (List[Document]: Documents to add to the vectorstore.
    
    Returns:
        List[str]: List of IDs of the added texts.



In [14]:
vectordb.get()

{'ids': ['0840ddcf-2f73-4ad3-941e-169b1c79138a',
  '095139fc-21af-4065-84ce-b61cebd2803a',
  '0a1d6b32-cf5f-4baa-9401-c96aee7b7c91',
  '0a31153a-0f92-46f5-96dc-b340c1a04ad2',
  '0a65a9b2-27e0-46f6-93bf-7d66db9be00c',
  '13f4c634-ce93-411e-8b02-8d16fbb96f3d',
  '15ba924a-03eb-4d44-b8fa-808c8a88c925',
  '17da28f1-f412-4730-af42-14d09b97684e',
  '1ffddfc2-15e9-4009-8986-a11456c1da35',
  '25eb2f2b-e909-4a2a-928e-1168def672f3',
  '272fd338-6b17-45af-9390-b1f179436107',
  '2c15447c-6e58-4550-b265-2728063b9bad',
  '2dd92b62-62b9-4fcc-a540-aa31c112adf5',
  '33e2519c-7f9d-4f88-9778-c84578de4b0d',
  '3d0444cd-df3c-4be8-ae48-8db5b9cd5d2f',
  '3e4e4066-71a2-436a-97eb-1269f1dcaec9',
  '3f176820-3207-4848-bbe4-b3900a7b398b',
  '44e5c61d-fff0-423e-9393-b01b03d94bdd',
  '49d2e2c6-9471-483e-bf8f-4035e9c4d842',
  '4b00ef47-2edf-4f31-8147-d4566cd66c75',
  '503f7a35-8814-4643-8eb6-1537894520a7',
  '52cceeca-9074-41b5-a19b-7b17a472e3f3',
  '53cd9759-d536-497d-b034-d36847832c6f',
  '544efb88-6698-4b55-8060-

## Build the Database More

Here is the outlined pipeline:

**Step 1 (RAG)**:
Prompt: Use RAG to search literature and documents from a text database to understand the role of PCNA in the cell cycle and the pip Fucci system. Provide a summary of the findings.

**Step 2 (ROUTER)**:
Prompt: Evaluate the quality of the results from Step 1. If the results are satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.

**Step 3 (SCRAPE)**:
Prompt: Search bioRxiv for recent research articles related to the cell cycle. Use the knowledge gained from Step 1 to inform the search query.

**Step 4 (ROUTER)**:
Prompt: Regardless of the outcome, always return to Step 1 to refine or expand the search.

**Step 5 (WRITE)**:
Prompt: Synthesize and summarize the entire analysis process, including the findings from Step 1, any additional insights gained from Step 3, and the iterative process of refinement. Provide a final report that encapsulates the entire analysis.

Note: The ROUTER steps are used to control the flow of the pipeline, and the WRITE step is used to summarize the results and create a final report. The dependencies between steps are indicated in the prompts.


Do you want to proceed with this plan? [Y/N/edit]

In [None]:
brad.chat(ragvectordb=vectordb)

Enter your NVIDIA API key:  ········


[32m2024-06-27 16:13:42 INFO semantic_router.utils.logger local[0m


Welcome to RAG! The chat log from this conversation will be saved to /home/jpic/BRAD/2024-06-27_16-13-18/log.json. How can I help?


Input >>  /force PLANNER make a plan with the following stages: 1. RAG, with prompt: What is the PCNA in the cell cycle and pip fucci system, 2. If the router answer is good, skip to step 5 else go to step 3. Step 3: SCRAPE PubMed for cell cycle articles. Step 4: return to step 1. Step 5: Write a summary of the process. STEP 4 should be ROUTER that always returns to step 1


2024-06-27 16:14:17,885 - INFO - PLANNER


RAG >> 0: 


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m**INSTRUCTIONS:**
You are planning a bioinformatics analysis pipeline to address a user's query. Your task is to outline a multi-step workflow using the available methods listed below. Each method should be used appropriately to ensure a thorough analysis. For each step in the pipeline, explain the goal of the process as well as provide a prompt to the chatbot that will execute that step. If information is passed between stages, such as from literature or code to databases, indicate the dependencies of steps in the prompt.

**Available Methods:**
1. **RAG**: Look up literature and documents from a text database.
2. **SCRAPE**: Search platforms like arXiv, bioRxiv, and PubMed for the latest research.
3. **DATABASE**: Utilize bioinformatics databases such as Gene Ontology and Enrichr to perform gene set enrichment analyses.
4. **CODE**: Execute bioinformatics pipelines. Utilize prebuilt p

Input >>  Y


2024-06-27 16:14:27,145 - INFO - [{'order': 0, 'module': 'PLANNER', 'prompt': None, 'description': 'This step designed the plan. It is placed in the queue because we needed a place holder for 0 indexed lists.'}, {'order': 1, 'module': 'RAG', 'prompt': '/force RAG Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.', 'description': '1 (RAG)**:\nPrompt: Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.\n\n'}, {'order': 2, 'module': 'ROUTER', 'prompt': "/force ROUTER Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.", 'description': "2 (ROUTER)**:\nPrompt: Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.\n\n"}, {'order': 3, 'module': 'SCRAPE', 'prompt': 

DEBUG
[{'order': 0, 'module': 'PLANNER', 'prompt': None, 'description': 'This step designed the plan. It is placed in the queue because we needed a place holder for 0 indexed lists.'}, {'order': 1, 'module': 'RAG', 'prompt': '/force RAG Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.', 'description': '1 (RAG)**:\nPrompt: Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.\n\n'}, {'order': 2, 'module': 'ROUTER', 'prompt': "/force ROUTER Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.", 'description': "2 (ROUTER)**:\nPrompt: Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.\n\n"}, {'order': 3, 'module': 'SCRAPE', 'prompt': '/force SCRAPE Use SCRAPE t

  warn_deprecated(




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
,CcdB,carried
bystablemini-FplasmidsinhibitsE.colicelldivisionwhen
replicationoftheplasmidisinhibited.Cellscontainingplasmids
lackingthisfunctionproduceplasmid-freecellsatmuchhigher
frequenciesthanthosecontainingthisfunction.
SomeEmbryonicCellCyclesLackSome
Checkpoints
Manyexperimentalresultssuggestthatsomeearlyembryoniccell
cyclesarecontrolleddifferentlythansomaticcellcycles(11);the
differencesmaybeattributabletotheexistenceoffewercheckpoints
insomeembryoniccellcycles.
Perhapsthemostdefinitiveevidenceforadifferenceinthecontrol
ofthecellcyclebetweensomaticcellsandsomeembryonicsystems
arisesfromtheconsequences

B Fig.2.DNAligase-de-
1|11-Q**4;@000X0000fectivecellsarearrest

2024-06-27 16:14:28,665 - INFO - 


route



2024-06-27 16:14:28,667 - INFO - RAG
2024-06-27 16:14:28,668 - INFO - [{'order': 0, 'module': 'PLANNER', 'prompt': None, 'description': 'This step designed the plan. It is placed in the queue because we needed a place holder for 0 indexed lists.'}, {'order': 1, 'module': 'RAG', 'prompt': '/force RAG Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.', 'description': '1 (RAG)**:\nPrompt: Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.\n\n'}, {'order': 2, 'module': 'ROUTER', 'prompt': "/force ROUTER Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5. Otherwise, proceed to Step 3.", 'description': "2 (ROUTER)**:\nPrompt: Evaluate the quality of the summary generated in Step 1. If it's satisfactory, proceed to Step 5


[1m> Finished chain.[0m

[1m> Finished chain.[0m
I don't know. The provided text does not contain any information about PCNA in the cell cycle or the pip Fucci system. The text appears to discuss cell division, cell cycles, and checkpoints, but it does not mention PCNA or the pip Fucci system specifically.
Sources:
Checkpoints Controls That Ensure the Order of Cell Cycle Events - Hartwell.pdf
DEBUG



route



DEBUG
RAG
DEBUG
[{'order': 0, 'module': 'PLANNER', 'prompt': None, 'description': 'This step designed the plan. It is placed in the queue because we needed a place holder for 0 indexed lists.'}, {'order': 1, 'module': 'RAG', 'prompt': '/force RAG Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.', 'description': '1 (RAG)**:\nPrompt: Use RAG to search the text database for information on PCNA in the cell cycle and pip Fucci system, providing a brief summary of the findings.\n\n'}, {'o

2024-06-27 16:14:30,287 - INFO - content='**OUTPUT**\nNext Step=3\nREASONING=I chose Step 3 because the Router evaluated the quality of the summary generated in Step 1 and found it to be unsatisfactory. Therefore, according to the pipeline plan, I should proceed to Step 3 to search PubMed for recent research articles related to the cell cycle.' response_metadata={'role': 'assistant', 'content': '**OUTPUT**\nNext Step=3\nREASONING=I chose Step 3 because the Router evaluated the quality of the summary generated in Step 1 and found it to be unsatisfactory. Therefore, according to the pipeline plan, I should proceed to Step 3 to search PubMed for recent research articles related to the cell cycle.', 'token_usage': {'prompt_tokens': 575, 'total_tokens': 644, 'completion_tokens': 69}, 'model_name': 'meta/llama3-70b-instruct'} id='run-dd0415c8-bd9f-4252-ba7f-caf78a1021fb-0' role='assistant'
2024-06-27 16:14:30,288 - INFO - **OUTPUT**
Next Step=3
REASONING=I chose Step 3 because the Router eva

DEBUG
content='**OUTPUT**\nNext Step=3\nREASONING=I chose Step 3 because the Router evaluated the quality of the summary generated in Step 1 and found it to be unsatisfactory. Therefore, according to the pipeline plan, I should proceed to Step 3 to search PubMed for recent research articles related to the cell cycle.' response_metadata={'role': 'assistant', 'content': '**OUTPUT**\nNext Step=3\nREASONING=I chose Step 3 because the Router evaluated the quality of the summary generated in Step 1 and found it to be unsatisfactory. Therefore, according to the pipeline plan, I should proceed to Step 3 to search PubMed for recent research articles related to the cell cycle.', 'token_usage': {'prompt_tokens': 575, 'total_tokens': 644, 'completion_tokens': 69}, 'model_name': 'meta/llama3-70b-instruct'} id='run-dd0415c8-bd9f-4252-ba7f-caf78a1021fb-0' role='assistant'
DEBUG
**OUTPUT**
Next Step=3
REASONING=I chose Step 3 because the Router evaluated the quality of the summary generated in Step 1 

2024-06-27 16:14:31,277 - INFO - searching on PUBMED...
2024-06-27 16:14:31,278 - INFO - Search Terms: ['cell cycle', 'PCNA', 'Fucci system', 'DNA replication', 'chromatin replication', 'cell division', 'cyclin-dependent kinases', 'spindle assembly', 'mitosis', 'cell proliferation']



[1m> Finished chain.[0m
DEBUG
searching on PUBMED...
DEBUG
Search Terms: ['cell cycle', 'PCNA', 'Fucci system', 'DNA replication', 'chromatin replication', 'cell division', 'cyclin-dependent kinases', 'spindle assembly', 'mitosis', 'cell proliferation']


2024-06-27 16:14:31,622 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:14:39,714 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:14:40,005 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:14:59,066 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:14:59,317 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:15:14,712 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:15:15,577 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:15:33,858 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:15:34,258 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:15:42,954 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:15:43,829 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:15:53,105 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:15:53,387 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:16:09,050 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:16:09,332 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:16:32,117 - INFO - pdf collection complete!


DEBUG
pdf collection complete!


2024-06-27 16:16:32,364 - INFO - Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


DEBUG
Directory '/home/jpic/BRAD/2024-06-27_16-13-18/pdf' created successfully


2024-06-27 16:16:43,620 - INFO - Error occurred while searching on PUBMED: 'NoneType' object has no attribute 'attrs'


DEBUG
Error occurred while searching on PUBMED: 'NoneType' object has no attribute 'attrs'


100%|██████████| 38/38 [00:18<00:00,  2.00it/s]



New documents loaded...
New document split into chunks...


In [7]:
vectordb.similarity_search("hartwell")

[Document(page_content='Ilit._.Radiat.Biol.11,1(1966).\n57.M.Freeman,C.Nusslein-Volhard, D.M.Glov,er,Cell46,457(1986).\n58.J.CulottiandL.H.Hartwell,Exp.CellRes.67,389(1971).\n59.K.Hara,P.Tvdeman,M.Kirschner,Proc.Nadl.Acad.Sci.U.S.A.77,462(1980).\n60.E.Schierenberg andW.B.Wood,Dev.Biol.107,337(1985).\n61.C.Killian,C.Bland,J.Kuzav,a,D.Nishioka,Exp.CellRes.158,519(1985).\n62.G.Sluder,F.J.Miller,C.L.Rider,J.CellBiol.103,1873(1986).\n63.C.J.Epstcin,ThieConisequences', metadata={'page': 5, 'source': 'papers/Checkpoints Controls That Ensure the Order of Cell Cycle Events - Hartwell.pdf'}),
 Document(page_content=',ibid.44,65(1986).\n21.C.Hohm,T.Goto,J.C.Wang,D.Botstein,ibid.41,553(1985).\n22.D.KoshlandandL.H.Hartwell,Science238,1713(1987).\n23.P.B.Berget,inVirusStructureandAssembly,S.Casjens,Ed.(JonesandBartlett,\nBoston,1985),pp.149-168.\n24.P.B.BergetandJ.King,inBacteriophageT4,C.K.Mathews,E.M.Kutter,G.\nMosig,P.B.Berget,Eds.(AmericanSocietyforMicrobiology,Washington,DC,\n1983),pp.246-258.\

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.output_parsers import CommaSeparatedListOutputParser
from semantic_router.layer import RouteLayer
from langchain.chains import ConversationChain
from langchain.prompts import PromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from sklearn.metrics.pairwise import cosine_similarity
from langchain_text_splitters import CharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity