# Data Base

In [None]:
!pip install chromadb -i https://pypi.tuna.tsinghua.edu.cn/simple

# Prepare Data

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Splitter settings: target chunk size and the overlap between neighbouring chunks.
chunk_size = 500
chunk_overlap = 50

# One loader per source file that should end up in the knowledge base.
data_loader_list = [
    PyMuPDFLoader("/workdir/data_base/knowledge_db/pumkin_book/pumpkin_book.pdf"),
]

# Run every loader and gather everything it returns into one flat list.
data = []
for loader in data_loader_list:
    data += loader.load()

# Cut the loaded documents into chunks; `splitted_data` is what gets embedded later.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
splitted_data = text_splitter.split_documents(data)

# Build Database

In [None]:
# install Model Scope & ZhipuAI chatglm3-6b requirements
!pip install protobuf cpm_kernels gradio mdtex2html sentencepiece accelerate -i https://mirrors.aliyun.com/pypi/simple/
!pip install modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install sentence_transformers -i https://pypi.tuna.tsinghua.edu.cn/simple

## Download from ModelScope

In [None]:
from modelscope import snapshot_download
# Resolve the local directory of the `iic/nlp_gte_sentence-embedding_chinese-large`
# model, pinned to revision v1.1.0, inside the shared ModelScope cache directory.
# The returned path is stored in `embd_model_dir` for the embedding cell below.
# NOTE(review): local_files_only=True presumably means nothing is fetched and the
# snapshot must already exist under cache_dir — confirm against the ModelScope
# docs; a first-time download would need this switched off.
embd_model_dir = snapshot_download(
    "iic/nlp_gte_sentence-embedding_chinese-large",
    revision="v1.1.0",
    cache_dir="/workdir/data_base/llm_models/ModelScope",
    local_files_only=True
)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


# Load the variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())

# --- Alternative embedding backends, kept for reference (disabled) ---
# openai_embedding = OpenAIEmbeddings(
#     api_key=os.environ["OPENAI_SECRET_KEY"],
#     base_url=os.environ["OPENAI_API_BASE"]
# )

# model_name = ""
# model_kwargs = {'device': 'gpu'}
# encode_kwargs = {'normalize_embeddings': True}
# hf_chatglm_embedding = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )


# Embedding model loaded from the local ModelScope snapshot. `embd_model_dir`
# is set by the snapshot_download cell; uncomment the next line to point at the
# cached copy directly instead of running that cell.
# embd_model_dir = "/workdir/data_base/llm_models/ModelScope/iic/nlp_gte_sentence-embedding_chinese-large"
ms_chatglm_embedding = ModelScopeEmbeddings(
    model_id=embd_model_dir,
    model_revision="v1.1.0",
)

# Directory in which the Chroma store is persisted.
data_base_dir = "/workdir/data_base/vector_db"

# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so re-running this cell would store every chunk a second
# time and searches would start returning duplicates. Only embed the documents
# when the store is missing or empty; otherwise reopen the existing one.
# To force a rebuild, delete `data_base_dir` and run the cell again.
if os.path.isdir(data_base_dir) and os.listdir(data_base_dir):
    vectordb = Chroma(
        persist_directory=data_base_dir,
        embedding_function=ms_chatglm_embedding
    )
else:
    vectordb = Chroma.from_documents(
        documents=splitted_data,
        embedding=ms_chatglm_embedding,
        persist_directory=data_base_dir
    )
    # Flush the freshly built store to disk.
    vectordb.persist()

# Search in Database

## Load vectordb

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings

# Locations of the persisted Chroma store and of the cached embedding model.
data_base_dir = "/workdir/data_base/vector_db"
embd_model_dir = "/workdir/data_base/llm_models/ModelScope/iic/nlp_gte_sentence-embedding_chinese-large"

# Embedding function handed to Chroma for this store.
ms_chatglm_embedding = ModelScopeEmbeddings(model_id=embd_model_dir, model_revision="v1.1.0")

# Open the store persisted under `data_base_dir`.
vectordb = Chroma(embedding_function=ms_chatglm_embedding, persist_directory=data_base_dir)

## Similarity Search

In [None]:
# Query the store for the three chunks closest to the question and print each one.
question = "什么是机器学习"
sim_docs = vectordb.similarity_search(question, k=3)

for i, sim_doc in enumerate(sim_docs):
    chunk_text = sim_doc.page_content
    print(f"No.{i} similar doc:\n\t{chunk_text}")

## MMR(Maximum Marginal Relevance) Search

In [None]:
# Same question as the similarity-search cell, retrieved with maximum marginal
# relevance instead; print the three returned chunks.
mmr_docs = vectordb.max_marginal_relevance_search(question, k=3)

for i, mmr_doc in enumerate(mmr_docs):
    chunk_text = mmr_doc.page_content
    print(f"No.{i} mmr doc:\n\t{chunk_text}")

# Build Retrieval QA Chain

## Direct Ask

In [None]:
from modelscope import AutoTokenizer, AutoModel, snapshot_download, Model
# Fetch (or reuse from the cache directory) the pinned chat model snapshot.
# Alternative model id: "ZhipuAI/chatglm3-6b".
llm_dir = snapshot_download(
    "ZhipuAI/chatglm2-6b-int4",
    cache_dir="/workdir/data_base/llm_models/ModelScope",
    revision="v1.0.2",
)

# Tokenizer and model are both read from the local snapshot directory;
# trust_remote_code is required because the model ships its own code.
tokenizer = AutoTokenizer.from_pretrained(llm_dir, local_files_only=True, trust_remote_code=True)

# Disabled alternative: load through the generic ModelScope `Model` class.
# chat_model = Model.from_pretrained(
#     "/workdir/data_base/llm_models/ModelScope/ZhipuAI/chatglm2-6b",
#     revision="v1.0.12",
#     local_files_only=True,
# )

chat_model = AutoModel.from_pretrained(llm_dir, local_files_only=True, trust_remote_code=True)
# Half precision, on the GPU, in evaluation mode.
chat_model = chat_model.half().cuda().eval()

In [5]:
from modelscope import Tasks, pipeline
from langchain.llms import HuggingFacePipeline


# Build a ModelScope chat pipeline from the model and tokenizer loaded in the
# previous cell, then wrap it in LangChain's HuggingFacePipeline as `llm`.
# NOTE(review): `llm` is reassigned by the later ChatGLM3 cell, so this wrapper
# is only in effect if that cell is skipped.
# NOTE(review): HuggingFacePipeline presumably expects a transformers pipeline;
# verify that a ModelScope pipeline is accepted before relying on this path.
pipe = pipeline(
    Tasks.chat, # task type: chat
    model=chat_model,
    preprocessor=tokenizer,
    sequence_length=300
)
llm = HuggingFacePipeline(pipeline=pipe)
# Example of calling the pipeline directly (disabled):
# pipe({"text": "本知识库主要包含什么内容", "history": []})

## New: load ChatGLM3 through the local wrapper

In [None]:
import sys

# The ChatGLM3 wrapper lives in ../llm, so that directory has to be on the
# import path before the module can be imported.
sys.path.append("../llm")
from ChatGLM3 import ChatGLM3

# Create the wrapper and load the model from the local ModelScope cache.
# This replaces any `llm` defined by an earlier cell.
llm = ChatGLM3()
llm.load_model("/workdir/data_base/llm_models/ModelScope/ZhipuAI/chatglm3-6b")


In [7]:
from langchain.chains import RetrievalQA

# Combine the current `llm` with a retriever over the vector store into a
# retrieval question-answering chain.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [None]:
# Run the chain on a single question and print the text stored under "result".
question = "本知识库主要包含什么内容"
result = qa_chain({"query": question})

answer = result.get('result')
print(f"LLM answer is:\n{answer}")