In [1]:
from langchain import hub
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.llms import Ollama
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
import chromadb.utils.embedding_functions as embedding_functions
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import streamlit as st
import os
import time
import warnings
warnings.filterwarnings('ignore')

# Load the documents and split them into chunks to store in the vector DB

In [6]:
# Source PDFs to index. Each path appears exactly once so that no document is
# chunked and accumulated twice.
# NOTE(review): "Data/physiol-review-1973.pdf" was previously listed twice —
# confirm that no other file was intended in its place.
files = [
    "Data/Neuroscience Science of the Brain.pdf",
    "Data/paper2.pdf",
    "Data/paper3.pdf",
    "Data/physiol-review-1973.pdf",
    "Data/The-Brain_AUsersGuidePDFDrive.pdf"
]

# `temp` collects the chunks of every PDF. `all_splits` is rebound on each
# iteration, so after the loop it only holds the chunks of the last file.
temp = []
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
for pdf_path in files:
    loader = PyPDFLoader(pdf_path)
    data = loader.load()
    all_splits = text_splitter.split_documents(data)
    # Extend in place instead of rebuilding the whole list on every iteration.
    temp.extend(all_splits)

In [8]:
def getEmbeddings(model_name="BAAI/bge-small-en", device="cpu", normalize_embeddings=True):
    """Build the HuggingFace BGE embedding object used for the vector store.

    Called with no arguments, this produces the same configuration as before:
    model "BAAI/bge-small-en", device "cpu", normalized embeddings.

    Args:
        model_name: Model identifier passed to HuggingFaceBgeEmbeddings.
        device: Value placed under the "device" key of ``model_kwargs``.
        normalize_embeddings: Value placed under the "normalize_embeddings"
            key of ``encode_kwargs``.

    Returns:
        The configured HuggingFaceBgeEmbeddings instance.
    """
    return HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": normalize_embeddings},
    )

In [9]:
persist_directory = 'database'

# Index the chunks of every loaded PDF. `temp` is the list accumulated by the
# loading loop; `all_splits` is rebound on each iteration of that loop and ends
# up holding only the last file's chunks, so it must not be used here.
vectorstore = Chroma.from_documents(
    documents=temp,
    embedding=getEmbeddings(),
    persist_directory=persist_directory,
)

# Write the collection to `persist_directory`.
vectorstore.persist()