# Build a chatbot with its own knowledge base using ChromaDB with Langchain

In this notebook I'll load documents from a directory and create a vector store to build the knowledge base for a chatbot. I'll use the [ChromaDB](https://python.langchain.com/docs/integrations/vectorstores/chroma).

### Functions to load documents from a directory

In [1]:
from langchain.document_loaders import DirectoryLoader
from typing import Union

import os
import glob

In [2]:

#list of allowed file types
ALLOWED_FILES = ['pdf', 'txt', 'docx', 'xml', 'html', 'md', 'json']


The full documentation about directory loadder or file loaders can be found in the [langchain documentation](https://python.langchain.com/docs/integrations/document_loaders)

In [3]:

#function to load a single file, HTML
def load_html(file_path: str):
    '''
        function to load html files form the dir

        params:
            file_path: str: path of the file with its name

        returns:
            Doc object
    '''

    #import HTML loader
    from langchain.document_loaders import UnstructuredHTMLLoader
    #prepare loader
    print(f"Loading HTML file '{file_path}'...")
    loader = UnstructuredHTMLLoader(file_path)
    #load document
    doc = loader.load()
    print(f"Loaded HTML file '{doc.metadata['filename']}'")
    #return document
    return doc
    

In [4]:
#function to load a directory
def load_directory(path:str, file_types: Union[str, list]):
    '''
        function to load documents from a directory
    
        params:
            path: str: path pof the dir to load documents from
            file_types: (str or list of strings): file formats to load e.g. .pdf, .docx etc

        returns:
            Docloader object

    '''

    # convert file_types to a list if it is a string
    if isinstance(file_types, str):
        file_types = [file_types]

    # if file_types is not in the allowed files list, raise an error
    for fl in file_types:
        if fl not in ALLOWED_FILES:
            raise ValueError(f"File type {fl} is not allowed. Allowed file types are: {ALLOWED_FILES}")

    #list to store all loader objects
    docs_list = []
    #for each file type in the file_types parameter
    for fl in file_types:
        #load docs
        print(f"Loading {fl} files...")
        try:
            if fl != 'html':
                loader = DirectoryLoader(
                            path,
                            glob=f"**/*.{fl}",
                            show_progress=True,
                            use_multithreading=True,
                            recursive=True
                        )
                docs = loader.load()
                print(f"Loaded {len(docs)} {fl} files")
                #add into the list of loader objects
                docs_list+=docs
            else:
                html_list = glob.glob(os.path.join(path, '*.html'))
                docs = []
                for ht in html_list:
                    #add into the list of loader objects
                    docs_list+=load_html(ht)
                    # docs.append(load_html(ht))

        except Exception as e:
            print(f"Error loading {fl} files: {e}")
    
    return docs_list
    #return the final loaded list of documents

### A processor function to process the documents

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import document_loaders


def process_docs(docs: document_loaders):
    '''
        function to split text into sentences and words

        parameters:
            docs: list of document objects

        returns:
            texts: list of text objects
    '''

    #load text splitter
    text_splitter = RecursiveCharacterTextSplitter()
    #split documents
    print('Splitting documents...')
    texts = text_splitter.split_documents(docs)

    return texts




### Saving documents to a vector store

In [6]:
#  setup sqlite3 for linux based OS, for windows it's not needed, comment these lines for windows
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [7]:
# # get chroma db max batch size
from chromadb import Client
chromadb_client = Client()
chroma_batch_size = chromadb_client.max_batch_size
print(f"Chroma DB max batch size: {chroma_batch_size}")

Chroma DB max batch size: 41666


In [12]:
from langchain.vectorstores import Chroma
from langchain import embeddings, text_splitter

from typing import Optional

import time

def store_db(texts: text_splitter, embeddings: embeddings, persist_directory: Optional[str], persist: Optional[bool]):
    '''
        function to store the database

        parameters:
            texts: list of text objects
            embeddings: embeddings to apply
            persist_directory: directory to save the database
            persist: whether to persist the database or not

        returns:
            vector_store: vector store object
    '''

    vector_db = None
    def store(text):

        #if persist is true and persist_directory is not None or persist is false and persist_directory is not None
        if (persist and persist_directory is not None) or (not persist and persist_directory is not None):
            #store texts into directory
            print(f"Storing database in directory '{persist_directory}'...")

            vector_db = vector_store.from_documents(
                                            documents=text, 
                                            embedding=embeddings,
                                            persist_directory=persist_directory
                                        )
            
        #else if persist is true and persist_directory is None
        elif persist and persist_directory is None:
            #store texts into memory
            raise Exception("Persist directory must be specified to persist the database")
        #else if persist is false and persist_directory is None
        elif not persist and persist_directory is None:
            print("Storing database in memory...")
            #store texts into memory
            vector_db = vector_store.from_documents(documents=texts, 
                                    embedding=embeddings)
            
        return vector_db


    print('total docs to be stored: ', len(texts))

    #load vector store
    vector_store = Chroma()

    #if the number of texts is greater than the chroma db max batch size
    if len(texts) > chroma_batch_size:
        #split texts into batches
        texts = [texts[i:i+chroma_batch_size] for i in range(0, len(texts), chroma_batch_size)]
        print('total batches: ', len(texts))

        #for each batch in texts
        # store batch
        # keep count of the number of batches
        # after every 3rd batch sleep for 1.5 minutes
        # this is to prevent the chroma db from crashing
        for i, batch in enumerate(texts):
            #store batch
            vector_db = store(batch)
            time.sleep(1.5)
            if i % 3 == 0:
                print('sleeping for 90 seconds...')
                time.sleep(90)
    else:
        #store texts
        vector_db = store(texts)

    
    if vector_db is None:
        raise Exception('Database could not be stored')
    
    return vector_db


### The function to ingest documents from a directory

In [13]:
#import some accessory modules from langchain
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings import OpenAIEmbeddings

import argparse
from typing import Union, Optional
import os

In [14]:
# load the .env file
load_dotenv(find_dotenv())
open_ai_api_key = os.getenv('OPENAI_API_KEY')

In [15]:

def ingest(
            path: str,
            file_types: str = 'pdf, docx, html, txt, xml, md, json',
            embeddings=OpenAIEmbeddings(
                                        openai_api_key=open_ai_api_key,
                                        show_progress_bar=True
                                    ),
            persist_directory: Optional[str]='db',
            persist: Optional[bool]=True
        ):
    '''
        function to ingest documents

        parameters:
            path: str: path of the dir to load documents from
            file_types: str: file formats to load e.g. 'pdf, docx, md, txt'  etc
            embeddings: embeddings to apply
            persist_directory: directory to save the database
            persist: whether to persist the database or not

        returns:
            vector_store: vector store object
    '''

    file_types = file_types.split(',')

    file_types = [fl.strip() for fl in file_types if fl.strip() != '']

    # print(file_types)
    print('File types: ', file_types)

    print('Loading documents from directory...', path)

    #load documents
    docs = load_directory(path, file_types)
    print(f'Loaded {len(docs)} documents from directory: ', path, ' Successfully')
    #process documents
    texts = process_docs(docs)
    #store database
    vector_store = store_db(texts, embeddings, persist_directory, persist)

    return vector_store


#### Let's test the directory ingestion

The db_files dir contains a book related to bioinformatics and an HTML file of bioinformatics wikipedia. Let's ingest that and then we'll query the vector store.

In [17]:
store = ingest(
                'db_files',
                file_types='html, txt, pdf, docx, md, xml, json',
                embeddings=OpenAIEmbeddings(
                                        openai_api_key=open_ai_api_key,
                                        show_progress_bar=True
                                    ),
                persist_directory='db2',
                persist=True
            )

File types:  ['html', 'txt', 'pdf', 'docx', 'md', 'xml', 'json']
Loading documents from directory... db_files
Loading html files...
Loading HTML file 'db_files/Bioinformatics - Wikipedia.html'...
Error loading html files: 'list' object has no attribute 'metadata'
Loading txt files...


0it [00:00, ?it/s]


Loaded 0 txt files
Loading pdf files...


100%|██████████| 1/1 [00:00<00:00, 119.40it/s]


Loaded 0 pdf files
Loading docx files...


0it [00:00, ?it/s]


Loaded 0 docx files
Loading md files...


0it [00:00, ?it/s]


Loaded 0 md files
Loading xml files...


0it [00:00, ?it/s]


Loaded 0 xml files
Loading json files...


0it [00:00, ?it/s]

Loaded 0 json files
Loaded 0 documents from directory:  db_files  Successfully
Splitting documents...
total docs to be stored:  0
Storing database in directory 'db2'...





AttributeError: 'list' object has no attribute 'page_content'

**Can use the following method in a .py script**

It can also be run with in scheduled to automate directory loading and ingestion of documents.

In [None]:

# def main():
#     parser = argparse.ArgumentParser(description='Ingest documents into a vector store')
#     parser.add_argument('path', type=str, help='path of the dir to load documents from')
#     parser.add_argument('file_types', type=str, help='file formats to load e.g. .pdf, .docx etc')
#     parser.add_argument('persist_directory', type=str, help='directory to save the database')
#     parser.add_argument('persist', type=bool, help='whether to persist the database or not')
#     args = parser.parse_args()

#     #embeddings
#     embeddings = OpenAIEmbeddings(openai_api_key=open_ai_api_key )

#     #ingest documents
#     store = ingest(args.path, args.file_types.strip(), embeddings, args.persist_directory, args.persist)

#     print('Database stored successfully', store.get()['metadatas'][0]['source'])

#     print('Exiting...')


# if __name__ == '__main__':
#     main()
