# Build a knowledge base using Pinecone with Langchain

In this notebook I'll load documents from a directory and create a vector store to build the knowledge base for a chatbot. I'll use the [Langchain Pinecone](https://python.langchain.com/docs/integrations/vectorstores/pinecone), [Pinecone official](https://www.pinecone.io/).

### Functions to load documents from a directory

In [1]:
from langchain.document_loaders import DirectoryLoader
from typing import Union

import os
import glob

In [2]:

#list of allowed file types
ALLOWED_FILES = ['pdf', 'txt', 'docx', 'xml', 'html', 'md', 'json']


The full documentation about directory loadder or file loaders can be found in the [langchain documentation](https://python.langchain.com/docs/integrations/document_loaders)

In [3]:

#function to load a single file, HTML
def load_html(file_path: str):
    '''
        function to load html files form the dir

        params:
            file_path: str: path of the file with its name

        returns:
            Doc object
    '''

    #import HTML loader
    from langchain.document_loaders import UnstructuredHTMLLoader
    #prepare loader
    print(f"Loading HTML file '{file_path}'...")
    loader = UnstructuredHTMLLoader(file_path)
    #load document
    doc = loader.load()
    print(f"Loaded HTML file '{doc.metadata['filename']}'")
    #return document
    return doc
    

In [4]:
#function to load a directory
def load_directory(path:str, file_types: Union[str, list]):
    '''
        function to load documents from a directory
    
        params:
            path: str: path pof the dir to load documents from
            file_types: (str or list of strings): file formats to load e.g. .pdf, .docx etc

        returns:
            Docloader object

    '''

    # convert file_types to a list if it is a string
    if isinstance(file_types, str):
        file_types = [file_types]

    # if file_types is not in the allowed files list, raise an error
    for fl in file_types:
        if fl not in ALLOWED_FILES:
            raise ValueError(f"File type {fl} is not allowed. Allowed file types are: {ALLOWED_FILES}")

    #list to store all loader objects
    docs_list = []
    #for each file type in the file_types parameter
    for fl in file_types:
        #load docs
        print(f"Loading {fl} files...")
        try:
            if fl != 'html':
                loader = DirectoryLoader(
                            path,
                            glob=f"**/*.{fl}",
                            show_progress=True,
                            use_multithreading=True,
                            recursive=True
                        )
                docs = loader.load()
                print(f"Loaded {len(docs)} {fl} files")
                #add into the list of loader objects
                docs_list+=docs
            else:
                html_list = glob.glob(os.path.join(path, '*.html'))
                docs = []
                for ht in html_list:
                    #add into the list of loader objects
                    docs_list+=load_html(ht)
                    # docs.append(load_html(ht))

        except Exception as e:
            print(f"Error loading {fl} files: {e}")
    
    return docs_list
    #return the final loaded list of documents

### A processor function to process the documents

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import document_loaders


def process_docs(docs: document_loaders):
    '''
        function to split text into sentences and words

        parameters:
            docs: list of document objects

        returns:
            texts: list of text objects
    '''

    #load text splitter
    text_splitter = RecursiveCharacterTextSplitter()
    #split documents
    print('Splitting documents...')
    texts = text_splitter.split_documents(docs)

    return texts




### Saving documents to a vector store

In [6]:
#  setup sqlite3 for linux based OS, for windows it's not needed, comment these lines for windows
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [7]:
from dotenv import load_dotenv, find_dotenv
# load the .env file
load_dotenv(find_dotenv())
open_ai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api = os.getenv('PINECONE_API_KEY')
pinecone_env = os.getenv('PINECONE_ENV')

### setting up pinecone

In [9]:
import pinecone

  from tqdm.autonotebook import tqdm


In [10]:
# initialize pinecone
pinecone.init(
    api_key=pinecone_api,
    environment=pinecone_env
)

#### Create an database aka Index

In [11]:
index_name = "my-knowledgebase"

# First, check if our index already exists. If it doesn't, we create it
if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
      name=index_name,
      metric='cosine',
      dimension=1536  # based on the model
)

In [13]:
# list indexes to see if it exists

pinecone.list_indexes()

['my-knowledgebase']

In [16]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import embeddings, text_splitter


In [17]:
from typing import Optional

import time

def store_db(texts: text_splitter, embeddings: embeddings, index_name: str):
    '''
        function to store the database

        parameters:
            texts: list of text objects
            embeddings: embeddings to apply
            persist_directory: directory to save the database
            persist: whether to persist the database or not

        returns:
            vector_store: vector store object
    '''

    # First, check if our index already exists. If it doesn't, we create it
    if index_name not in pinecone.list_indexes():
        # we create a new index
        pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=1536  # based on the model
    )

    vector_db = None

    vector_db = Pinecone.from_documents(texts, embeddings, index_name=index_name)
    

    print('total docs to be stored: ', len(texts))

    if vector_db is None:
        raise Exception('Database could not be stored')
    
    return vector_db


In [18]:
embeddings = OpenAIEmbeddings(openai_api_key=open_ai_api_key)

### The function to ingest documents from a directory

In [19]:
import argparse
from typing import Union, Optional
import os

In [22]:

def ingest(
            path: str,
            file_types: str = 'pdf, docx, html, txt, xml, md, json',
            embeddings=OpenAIEmbeddings(
                                        openai_api_key=open_ai_api_key,
                                        show_progress_bar=True
                                    )
        ):
    '''
        function to ingest documents

        parameters:
            path: str: path of the dir to load documents from
            file_types: str: file formats to load e.g. 'pdf, docx, md, txt'  etc
            embeddings: embeddings to apply
            persist_directory: directory to save the database
            persist: whether to persist the database or not

        returns:
            vector_store: vector store object
    '''

    file_types = file_types.split(',')

    file_types = [fl.strip() for fl in file_types if fl.strip() != '']

    # print(file_types)
    print('File types: ', file_types)

    print('Loading documents from directory...', path)

    #load documents
    docs = load_directory(path, file_types)
    print(f'Loaded {len(docs)} documents from directory: ', path, ' Successfully')
    #process documents
    texts = process_docs(docs)
    #store database
    vector_store = store_db(texts, embeddings, index_name=index_name)

    return vector_store


#### Let's test the directory ingestion

The db_files dir contains a book related to bioinformatics and an HTML file of bioinformatics wikipedia. Let's ingest that and then we'll query the vector store.

In [23]:
store = ingest(
                'db_files',
                file_types='html, txt, pdf, docx, md, xml, json',
                embeddings=OpenAIEmbeddings(
                                        openai_api_key=open_ai_api_key,
                                        show_progress_bar=True
                                    ),
            )

File types:  ['html', 'txt', 'pdf', 'docx', 'md', 'xml', 'json']
Loading documents from directory... db_files
Loading html files...
Loading HTML file 'db_files/Bioinformatics - Wikipedia.html'...
Error loading html files: 'list' object has no attribute 'metadata'
Loading txt files...


0it [00:00, ?it/s]


Loaded 0 txt files
Loading pdf files...


100%|██████████| 1/1 [01:13<00:00, 73.77s/it]


Loaded 1 pdf files
Loading docx files...


0it [00:00, ?it/s]


Loaded 0 docx files
Loading md files...


0it [00:00, ?it/s]


Loaded 0 md files
Loading xml files...


0it [00:00, ?it/s]


Loaded 0 xml files
Loading json files...


0it [00:00, ?it/s]


Loaded 0 json files
Loaded 1 documents from directory:  db_files  Successfully
Splitting documents...


100%|██████████| 1/1 [00:08<00:00,  8.46s/it]


total docs to be stored:  186


In [30]:
store

<langchain.vectorstores.pinecone.Pinecone at 0x7f4c044abf70>

**Can use the following method in a .py script**

It can also be run with in scheduled to automate directory loading and ingestion of documents.

In [12]:

# def main():
#     parser = argparse.ArgumentParser(description='Ingest documents into a vector store')
#     parser.add_argument('path', type=str, help='path of the dir to load documents from')
#     parser.add_argument('file_types', type=str, help='file formats to load e.g. .pdf, .docx etc')
#     parser.add_argument('index_name', type=str, help='name of the pinecone index to interact with')
#     args = parser.parse_args()

#     #embeddings
#     embeddings = OpenAIEmbeddings(openai_api_key=open_ai_api_key )

#     #ingest documents
#     store = ingest(args.path, args.file_types.strip(), embeddings, index_name=args.index_name.strip())

#     print('Database stored successfully', store.get()['metadatas'][0]['source'])

#     print('Exiting...')


# if __name__ == '__main__':
#     main()
