# Retrieval Augmented Generation (RAG) with BRAD

Given a collection of documents (pdfs), the RAG first builds a database by splitting up the text into equal size ('chunk_size'). These chunks can optionally be set to have some overlap ('chunk_overlap'). Both of these values within BRAD are set to 700 and 200 respectively but can be manually changed for different applications. Then, we vectorize these chunks using an embedding model from HuggingFace (Note - this step may take a while). Then, given a query, it embeds the query in the same embedding space and finds the top k chunks (preset to 4) with the closest cosine similarity to the query in the embedding space and uses these chunks as a basis for the response.

# Literature Databases

## Building a Database

In [19]:
import subprocess
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [1]:
from BRAD import rag

[nltk_data] Downloading package words to /home/jpic/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jpic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jpic/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
docsPath='papers/'
dbName='database'
dbPath='databases/'
HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5'
chunk_size=[700]
chunk_overlap=[200]
v=False

In [8]:
local = os.getcwd()  ## Get local dir
os.chdir(local)      ## shift the work dir to local dir

print('\nWork Directory: {}'.format(local)) if v else None

#%% Phase 1 - Load DB
embeddings_model = HuggingFaceEmbeddings(model_name=HuggingFaceEmbeddingsModel)

print('\nDocuments loading from:', docsPath) if v else None

In [12]:
text_loader_kwargs={'autodetect_encoding': True}
loader = DirectoryLoader(docsPath,
                         glob="**/*.pdf",
                         loader_cls=PyPDFLoader, 
                         show_progress=True,
                         use_multithreading=True)
docs_data = loader.load()



100%|██████████| 1/1 [03:40<00:00, 220.80s/it]


100%|██████████| 1/1 [00:01<00:00,  1.97s/it][A[A


In [20]:
print('\nDocuments loaded...') if v else None

for i in range(len(chunk_size)):
    for j in range(len(chunk_overlap)):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size[i],
                                                        chunk_overlap = chunk_overlap[j],
                                                        separators=[" ", ",", "\n", ". "])
        data_splits = text_splitter.split_documents(docs_data)
        
        print('Documents split into chunks...') if v else None
        print('Initializing Chroma Database...') if v else None

        dbName = "DB_cosine_cSize_%d_cOver_%d" %(chunk_size[i], chunk_overlap[j])

        p2_2 = subprocess.run('mkdir  %s/*'%(dbPath+dbName), shell=True)
        _client_settings = chromadb.PersistentClient(path=(dbPath+dbName))

        vectordb = Chroma.from_documents(documents           = data_splits,
                                         embedding           = embeddings_model,
                                         client              = _client_settings,
                                         collection_name     = dbName,
                                         collection_metadata = {"hnsw:space": "cosine"})

        print('Completed Chroma Database: ', dbName) if v else None
        del vectordb, text_splitter, data_splits

mkdir: cannot create directory ‘databases/DB_cosine_cSize_700_cOver_200/*’: No such file or directory


In [2]:
rag.create_database()

  0%|          | 0/1 [00:00<?, ?it/s]Error loading file papers/Pore-C.pdf


TypeError: PyPDFLoader.__init__() got an unexpected keyword argument 'autodetect_encoding'

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
raw_documents = UnstructuredPDFLoader('papers/Pore-C.pdf').load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, embeddings_model)


NameError: name 'UnstructuredPDFLoader' is not defined

In [5]:
import subprocess
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

In [6]:
docsPath='papers/'
dbName='tutorialDatabase'
dbPath='databases/'
HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5'
chunk_size=[700]
chunck_overlap=[200]
v=True

In [7]:
dbPath   += dbName

local = os.getcwd()  ## Get local dir
os.chdir(local)      ## shift the work dir to local dir

print('\nWork Directory: {}'.format(local)) if v else None

#%% Phase 1 - Load DB
embeddings_model = HuggingFaceEmbeddings(model_name=HuggingFaceEmbeddingsModel)

print('\nDocuments loading from:', docsPath) if v else None

text_loader_kwargs={'autodetect_encoding': True}
loader = DirectoryLoader(docsPath,
                         glob="**/*.pdf",
                         loader_cls=UnstructuredPDFLoader, 
                         #loader_kwargs=text_loader_kwargs,
                         show_progress=True,
                         )
# docs_data = loader.load()


Work Directory: /home/jpic/RAG-DEV/tutorials/RAG-with-BRAD





Documents loading from: papers/


In [8]:
docs_data = loader.load()


  0%|          | 0/1 [00:00<?, ?it/s][AError loading file papers/Pore-C.pdf


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [7]:
import pdfminer
help(pdfminer)
from pdfminer import psparser

Help on package pdfminer:

NAME
    pdfminer

PACKAGE CONTENTS
    _saslprep
    arcfour
    ascii85
    ccitt
    cmapdb
    converter
    data_structures
    encodingdb
    fontmetrics
    glyphlist
    high_level
    image
    jbig2
    latin_enc
    layout
    lzw
    pdfcolor
    pdfdevice
    pdfdocument
    pdffont
    pdfinterp
    pdfpage
    pdfparser
    pdftypes
    psparser
    runlength
    settings
    utils

FILE
    (built-in)




### WIKI Retrieval

In [34]:
from langchain_community.retrievers import WikipediaRetriever
retriever = WikipediaRetriever(top_k_results=10)

In [28]:
help(retriever)

Help on WikipediaRetriever in module langchain_community.retrievers.wikipedia object:

class WikipediaRetriever(langchain_core.retrievers.BaseRetriever, langchain_community.utilities.wikipedia.WikipediaAPIWrapper)
 |  WikipediaRetriever(*, wiki_client: Any = None, top_k_results: int = 3, lang: str = 'en', load_all_available_meta: bool = False, doc_content_chars_max: int = 4000, name: Optional[str] = None, tags: Optional[List[str]] = None, metadata: Optional[Dict[str, Any]] = None) -> None
 |  
 |  `Wikipedia API` retriever.
 |  
 |  It wraps load() to get_relevant_documents().
 |  It uses all WikipediaAPIWrapper arguments without any change.
 |  
 |  Method resolution order:
 |      WikipediaRetriever
 |      langchain_core.retrievers.BaseRetriever
 |      langchain_core.runnables.base.RunnableSerializable
 |      langchain_core.load.serializable.Serializable
 |      langchain_community.utilities.wikipedia.WikipediaAPIWrapper
 |      pydantic.v1.main.BaseModel
 |      pydantic.v1.utils

In [35]:
docs = retriever.invoke('kronecker product')

In [36]:
len(docs)

10

### Building ONLINE DB TUTORIAL

In [10]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [19]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()


In [21]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()


NameError: name 'OpenAIEmbeddings' is not defined

In [20]:
docs

[Document(page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final re

In [2]:
import poppler #-utils

In [3]:
import pdfinfo

ModuleNotFoundError: No module named 'pdfinfo'

In [1]:
from pdf2image import *
images = convert_from_path('papers/Pore-C.pdf')


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [20]:
!apt-get update

/bin/bash: apt-get: command not found


In [3]:
rag.create_database(docsPath='papers/',
                    dbName='tutorialDatabase',
                    dbPath='databases/',
                    HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5',
                    chunk_size=[700],
                    chunck_overlap=[200],
                    v=True)


Work Directory: /home/jpic/RAG-DEV/tutorials/RAG-with-BRAD





Documents loading from: papers/


  0%|          | 0/1 [00:00<?, ?it/s]Error loading file papers/Pore-C.pdf
100%|██████████| 1/1 [00:00<00:00,  1.11it/s]

PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [None]:
from BRAD import rag

In [1]:
import subprocess
import os
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import chromadb

In [None]:
def create_database(self, docsFile=None, docsPath='/nfs/turbo/umms-indikar/shared/projects/RAG/papers/', dbName=None, dbPath='/nfs/turbo/umms-indikar/shared/projects/RAG/databases/', HuggingFaceEmbeddingsModel = 'BAAI/bge-base-en-v1.5', chunk_size=[700], chunck_overlap=[200], v=False):
    # Handle arguments
    docsPath += docsFile
    dbPath   += dbName
    
    local = os.getcwd()  ## Get local dir
    os.chdir(local)      ## shift the work dir to local dir
    
    print('\nWork Directory: {}'.format(local)) if v else None

    #%% Phase 1 - Load DB
    embeddings_model = HuggingFaceEmbeddings(model_name=HuggingFaceEmbeddingsModel)
    
    print('\nDocuments loading from:', docsPath) if v else None

    text_loader_kwargs={'autodetect_encoding': True}
    loader = DirectoryLoader(docsPath,
                             glob="**/*.pdf",
                             loader_cls=UnstructuredPDFLoader, 
                             loader_kwargs=text_loader_kwargs,
                             show_progress=True,
                             use_multithreading=True)
    docs_data = loader.load()

    print('\nDocuments loaded...') if v else None
    
    chunk_size = [700] #Chunk size 
    chunk_overlap = [200] #Chunk overlap

    for i in range(len(chunk_size)):
        for j in range(len(chunk_overlap)):
            text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size[i],
                                                            chunk_overlap = chunk_overlap[j],
                                                            separators=[" ", ",", "\n", ". "])
            data_splits = text_splitter.split_documents(docs_data)
            
            print('Documents split into chunks...') if v else None
            print('Initializing Chroma Database...') if v else None

            dbName = "DB_cosine_cSize_%d_cOver_%d" %(chunk_size[i], chunk_overlap[j])

            p2_2 = subprocess.run('mkdir  %s/*'%(dbPath+dbName), shell=True)
            _client_settings = chromadb.PersistentClient(path=(dbPath+dbName))

            vectordb = Chroma.from_documents(documents           = data_splits,
                                             embedding           = embeddings_model,
                                             client              = _client_settings,
                                             collection_name     = dbName,
                                             collection_metadata = {"hnsw:space": "cosine"})

            print('Completed Chroma Database: ', dbName) if v else None
            del vectordb, text_splitter, data_splits

# Connecting Literature Databases to BRAD

## Specifiying the Database

When running `brad.chat()`, there is an option to use a previously saved database. **ADD how to s Type Y to supplement your query with the database.

## Using the RAG system

## Viewing the Documents from BRAD

# Building a L