In [None]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_community.vectorstores import FAISS

from app.config.config import settings

# Embedding client for the ZhipuAI "embedding-2" model; the API key is read from app settings.
embeddings = ZhipuAIEmbeddings(
    api_key=settings.zhipu_api_key,
    model="embedding-2",
)

# Embed a throwaway string once purely to learn the vector length, then size
# the FAISS flat L2 index to match it.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: no documents and no index-position -> docstore-id mapping yet.
vector_store = FAISS(
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function=embeddings,
)

In [None]:
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from uuid import uuid4

# Source texts to index; the paths are relative to the notebook's working directory.
files = ['悟空传.txt', '狂人日记.txt']

# The splitter settings do not depend on the file, so build the splitter once
# instead of once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

documents = []
for file in files:
    # Decode explicitly: without `encoding`, open() falls back to the platform
    # locale, which breaks on Chinese text wherever the default is not UTF-8.
    # NOTE(review): assumes the files are UTF-8 encoded; change this if they are GBK.
    with open(file, encoding="utf-8") as f:
        text = f.read()
    chunks: List[Document] = text_splitter.create_documents([text])
    # Tag each chunk with its origin (file name minus the final extension) so
    # later searches can filter on `source`. rsplit strips only the last
    # ".ext", so a name containing extra dots is not truncated at the first one.
    source = file.rsplit('.', 1)[0]
    for chunk in chunks:
        chunk.metadata['source'] = source
    documents.extend(chunks)

# One random UUID per chunk, used as its id in the vector store.
uuids = [str(uuid4()) for _ in documents]
vector_store.add_documents(documents=documents, ids=uuids)

In [None]:
# Persist the populated store to the local "story" folder.
vector_store.save_local(folder_path="story")

In [None]:
# Reload the store saved under "story", reusing the same embedding client.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# saved docstore; only use it on folders this notebook wrote itself.
new_vector_store = FAISS.load_local(
    folder_path="story",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Unfiltered similarity search across every indexed chunk, using the store's
# default result count.
results = vector_store.similarity_search(query="孙悟空为什么不杀了那个妖怪？")
results

In [None]:
# Same question, restricted to chunks whose `source` metadata is "悟空传" and
# capped at two results.
results = vector_store.similarity_search(
    query="孙悟空为什么不杀了那个妖怪？",
    filter={"source": "悟空传"},
    k=2,
)
results

In [None]:
from langchain_openai import ChatOpenAI

# Chat client for the "glm-4" model, reached through the ChatOpenAI wrapper
# pointed at the bigmodel.cn v4 endpoint and authenticated with the ZhipuAI key
# from app settings. Temperature is pinned to 0.
llm = ChatOpenAI(
    model="glm-4",
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
    openai_api_key=settings.zhipu_api_key,
    temperature=0,
)

In [None]:
# Approach 1: retrieve passages manually and place them in the prompt.
question = "赵贵翁干什么了？列出引用的片段"

# Retrieve context for *this* question inside the cell. The notebook-level
# `results` variable holds passages from "悟空传" fetched for a different
# question, so reusing it would give the model context unrelated to what is
# being asked. k and the source filter mirror the retriever used in approach 2.
context_docs = vector_store.similarity_search(
    question,
    k=2,
    filter={"source": "狂人日记"},
)

# Concatenate the retrieved passages into one context string.
qdocs = "".join(doc.page_content for doc in context_docs)

# `call_as_llm` is deprecated; `invoke` returns a message object whose
# `.content` carries the reply text, so `response` stays a plain string.
response = llm.invoke(f"已知: {qdocs} 问题: {question}").content
response

In [None]:
# Approach 2: let a RetrievalQA chain handle both retrieval and prompting.
from langchain.chains import RetrievalQA

# Retriever limited to two chunks whose `source` metadata is "狂人日记".
retriever = vector_store.as_retriever(
    search_kwargs={"k": 2, "filter": {"source": "狂人日记"}},
)

qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, verbose=True)

# `Chain.run` is deprecated. `invoke` takes the question under the chain's
# "query" input key and returns a dict with the answer under "result", so
# `response` is still the answer string.
response = qa_chain.invoke({"query": "赵贵翁干什么了？列出引用的片段"})["result"]
response