# Database

In [None]:
!pip install chromadb -i https://pypi.tuna.tsinghua.edu.cn/simple

# Prepare Data

In [3]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Splitter settings, passed to RecursiveCharacterTextSplitter below.
chunk_size = 500
chunk_overlap = 50

# One loader per source document; at the moment only a single PDF is listed.
data_loader_list = [PyMuPDFLoader("/workdir/data_base/knowledge_db/pumkin_book/pumpkin_book.pdf")]

# Collect whatever every loader returns into one flat list.
data = []
for loader in data_loader_list:
    data += loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Run the configured splitter over all loaded documents.
splitted_data = text_splitter.split_documents(data)

# Build Database

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings


# Locate a .env file and load it into the process environment; the result is
# bound to `_` because it is not needed afterwards.
_ = load_dotenv(find_dotenv())

# Directory handed to Chroma as its persistence location.
data_base_dir = "/workdir/data_base/vector_db"

# Both settings are read with os.environ[...], so a missing variable raises
# KeyError here instead of being passed on as None.
openai_embedding = OpenAIEmbeddings(api_key=os.environ["OPENAI_SECRET_KEY"], base_url=os.environ["OPENAI_API_BASE"])

# NOTE(review): hf_embedding is constructed with default arguments but is not
# used by any cell visible here; only openai_embedding is passed to Chroma.
hf_embedding = HuggingFaceBgeEmbeddings()

# Build the vector store from the split documents using the OpenAI embedding.
vectordb = Chroma.from_documents(documents=splitted_data, embedding=openai_embedding, persist_directory=data_base_dir)

# Explicitly ask the store to persist itself.
vectordb.persist()

# Search in Database

## Similarity Search

## MMR (Maximal Marginal Relevance) Search

# Build Retrieval QA Chain

In [None]:
from langchain.chains import RetrievalQA