# Database

In [None]:
!pip install chromadb -i https://pypi.tuna.tsinghua.edu.cn/simple

# Prepare Data

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# One loader per source document that feeds the knowledge base.
data_loader_list = [PyMuPDFLoader("/workdir/data_base/knowledge_db/pumkin_book/pumpkin_book.pdf")]

# Collect whatever each loader returns into a single flat list.
data = []
for loader in data_loader_list:
    data += loader.load()

# Splitter settings, forwarded below as chunk_size / chunk_overlap.
chunk_size, chunk_overlap = 500, 50
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)

# Split the loaded documents; later cells index `splitted_data`.
splitted_data = text_splitter.split_documents(data)

# Build Database

In [None]:
# install Model Scope & ZhipuAI chatglm3-6b requirements
!pip install protobuf cpm_kernels gradio mdtex2html sentencepiece accelerate -i https://mirrors.aliyun.com/pypi/simple/
!pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install sentence_transformers -i https://pypi.tuna.tsinghua.edu.cn/simple

## Download from ModelScope

In [None]:
from modelscope import snapshot_download
# Download revision v1.1.0 of the sentence-embedding model into the shared
# ModelScope cache directory; the value returned by snapshot_download is kept
# in `model_dir` (presumably the local path of the snapshot -- TODO confirm).
model_dir = snapshot_download(
    "iic/nlp_gte_sentence-embedding_chinese-large",
    cache_dir="/workdir/data_base/llm_models/ModelScope",
    revision="v1.1.0",
)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


# Load settings from the .env file located by find_dotenv().
_ = load_dotenv(find_dotenv())

# ---- Alternative embedding backends (currently disabled) ----
# OpenAI embeddings, configured from environment variables:
# openai_embedding = OpenAIEmbeddings(
#     api_key=os.environ["OPENAI_SECRET_KEY"],
#     base_url=os.environ["OPENAI_API_BASE"]
# )
#
# Hugging Face embeddings.
# NOTE(review): HuggingFaceBgeEmbeddings is not imported in this cell (only
# HuggingFaceEmbeddings is), and 'gpu' is presumably meant to be 'cuda' --
# confirm both before re-enabling.
# model_name = ""
# model_kwargs = {'device': 'gpu'}
# encode_kwargs = {'normalize_embeddings': True}
# hf_chatglm_embedding = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )

# On-disk locations: the embedding model and the persisted vector store.
model_dir = "/workdir/data_base/llm_models/ModelScope/iic/nlp_gte_sentence-embedding_chinese-large"
data_base_dir = "/workdir/data_base/vector_db"

# Embedding backend actually used to build the index.
ms_chatglm_embedding = ModelScopeEmbeddings(model_id=model_dir, model_revision="v1.1.0")

# Build the Chroma store from the split documents, then persist it.
vectordb = Chroma.from_documents(
    persist_directory=data_base_dir,
    embedding=ms_chatglm_embedding,
    documents=splitted_data,
)
vectordb.persist()

# Search in Database

## Similarity Search

In [None]:
# Ask the vector store for the k=3 results of a similarity search on the
# question, then print each hit's text with its zero-based position.
question = "什么是机器学习"
sim_docs = vectordb.similarity_search(question, k=3)
for idx, doc in enumerate(sim_docs):
    print(f"No.{idx} similar doc:\n\t{doc.page_content}")

## MMR (Maximal Marginal Relevance) Search

In [None]:
# Run the same question (defined in the similarity-search cell) through the
# store's max-marginal-relevance search with k=3 and print each hit's text.
mmr_docs = vectordb.max_marginal_relevance_search(question, k=3)
for idx, doc in enumerate(mmr_docs):
    print(f"No.{idx} mmr doc:\n\t{doc.page_content}")

# Build Retrieval QA Chain

In [None]:
from langchain.chains import RetrievalQA
from modelscope import AutoTokenizer, AutoModel, snapshot_download

# Download revision v1.0.2 of ChatGLM3-6B into the shared ModelScope cache;
# the value returned by snapshot_download is kept in `llm_dir`.
llm_dir = snapshot_download(
    "ZhipuAI/chatglm3-6b",
    revision="v1.0.2",
    cache_dir="/workdir/data_base/llm_models/ModelScope"
)

# ChatGLM3 ships its own tokenizer/model code with the checkpoint, so both
# loads need trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(llm_dir, trust_remote_code=True)

# Load the LLM itself with AutoModel from `llm_dir`. The previous code called
# AutoTokenizer on `model_dir` (the embedding model), so `model` was a
# tokenizer and the following .eval() call could not work.
model = AutoModel.from_pretrained(llm_dir, trust_remote_code=True)
model = model.eval()  # switch to inference mode
