In [13]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_community.vectorstores import FAISS

from app.config.config import settings

# Embedding client; the API key is taken from the project settings object.
embeddings = ZhipuAIEmbeddings(model="embedding-2", api_key=settings.zhipu_api_key)

# Embed a throwaway query once so the index can be sized to the vector length
# that the embedding model actually returns.
probe_vector = embeddings.embed_query("hello world")
embedding_dim = len(probe_vector)
index = faiss.IndexFlatL2(embedding_dim)

# The store starts out empty: a fresh in-memory docstore and no id mappings.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [19]:
from uuid import uuid4

from langchain_core.documents import Document

# Sample corpus as (source, text) pairs, listed in the order the documents
# are numbered below.
_SAMPLE_POSTS = [
    ("tweet", "I had chocalate chip pancakes and scrambled eggs for breakfast this morning."),
    ("news", "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees."),
    ("tweet", "Building an exciting new project with LangChain - come check it out!"),
    ("news", "Robbers broke into the city bank and stole $1 million in cash."),
    ("tweet", "Wow! That was an amazing movie. I can't wait to see it again."),
    ("website", "Is the new iPhone worth the price? Read this review to find out."),
    ("website", "The top 10 soccer players in the world right now."),
    ("tweet", "LangGraph is the best framework for building stateful, agentic applications!"),
    ("news", "The stock market is down 500 points today due to fears of a recession."),
    ("tweet", "I have a bad feeling I am going to get deleted :("),
]

# One Document per pair; the origin is recorded under the "source" metadata key.
documents = [
    Document(page_content=text, metadata={"source": source})
    for source, text in _SAMPLE_POSTS
]

# Keep the individual document_N names bound to the same objects as the list.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

# One random UUID string per document.
uuids = [str(uuid4()) for _ in documents]
print(len(uuids))
print(len(documents))
documents
# Insertion into the vector store is left disabled in this cell:
# vector_store.add_documents(documents=documents, ids=uuids)

10
10


[Document(metadata={'source': 'tweet'}, page_content='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.'),
 Document(metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.'),
 Document(metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic application

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the example document.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same notebook could decode this Chinese
# text differently (or fail) on another machine.
# NOTE(review): assumes the file is saved as UTF-8 -- change the encoding
# (e.g. to "gbk") if it was saved differently.
# NOTE(review): the variable name does not describe its content (it holds the
# full text of 悟空传.txt); it is kept unchanged because other cells may
# reference it.
with open("悟空传.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Splitter configured with deliberately tiny chunks for demonstration.
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    # Chunk length is measured with len(), i.e. in characters.
    length_function=len,
    is_separator_regex=False,
)
# Wrap the split chunks of the loaded text as Document objects.
texts = text_splitter.create_documents([state_of_the_union])

In [10]:
from pathlib import Path

# TextLoader is imported from langchain_community (already used elsewhere in
# this notebook) instead of the deprecated `langchain.document_loaders` path.
from langchain_community.document_loaders import TextLoader

# Load each text file and tag every resulting Document with the file's base
# name (without extension) under the "source" metadata key.
# NOTE(review): `documents` and `uuids` rebind names that an earlier cell also
# assigns; the names are kept because the following cell reads them.
# NOTE(review): the files are not split into chunks here -- consider running
# them through a text splitter before indexing; confirm the intent.
files = ['悟空传.txt', '狂人日记.txt']
documents = []
for file in files:
    # Explicit encoding so decoding does not depend on the platform's locale.
    # NOTE(review): assumes the files are saved as UTF-8 -- confirm.
    loader = TextLoader(file_path=file, encoding="utf-8")
    docs = loader.load()
    for doc in docs:
        # Path.stem drops only the final extension, so names containing extra
        # dots are not truncated the way split('.')[0] would truncate them.
        doc.metadata['source'] = Path(file).stem
    documents.extend(docs)
# One random UUID string per loaded document.
uuids = [str(uuid4()) for _ in range(len(documents))]

In [11]:
# Add the loaded documents to the store under the pre-generated ids, then
# display the ids returned by the call as the cell output.
inserted_ids = vector_store.add_documents(documents=documents, ids=uuids)
inserted_ids

['abd6a378-53d2-4f42-b734-ee6db1b014a8',
 'c40dd62f-5a1e-4aa3-b660-1452a182f3cc']

In [12]:

# Ask for the two most similar documents to the question, restricted to
# entries whose "source" metadata equals "悟空传", and print each one.
query = "孙悟空为什么不杀了那个妖怪？"
results = vector_store.similarity_search(query, k=2, filter={"source": "悟空传"})
for hit in results:
    print(hit)

page_content='《悟空传》
 第一章

    四个人走到这里，前边一片密林，又没有路了。

    “悟空，我饿了，找些吃的来。”唐僧往石头上大模大样一坐，说道。

    “我正忙着，你不会自己去找？……又不是没有腿。”孙悟空拄着棒子说。

    “你忙？忙什么？”

    “你不觉得这晚霞很美吗？”孙悟空说，眼睛还望着天边，“我只有看看这个，才能每天坚持向西走下去啊。”

    “你可以一边看一边找啊，只要不撞到大树上就行。”

    “我看晚霞的时候不做任何事！”

    “孙悟空你不能这样，不能这样欺负秃头，你把他饿死了，我们就找不到西天，找不到西天，我们身上的诅咒永远也解除不了。”猪八戒说。

    “呸！什么时候轮到你这个猪头说话了！”

    “你说什么？你说谁是猪？！”

    “不是猪，是猪头！哼哼哼……”孙悟空咬着牙冷笑。

    “你敢再说一遍！”猪八戒举着钉耙就要往上冲。

    “吵什么吵什么！老子要困觉了！要打滚远些打！”沙和尚大吼。

    三个恶棍怒目而视。

    “打吧打吧，打死一个少一个。”唐僧站起身来，“你们是大爷，我去给你们找吃的，还不行吗？最好让妖怪吃了我，那时你们就哭吧。”

    “快去吧，那儿有女妖精正等着你呢”孙悟空叫道。

    “哼哼哼哼”三个怪物都冷笑。

    “别以为我离了你们就不行！”唐僧回头冲他们挥挥拳头，拍拍身上的尘土，又整整长袍，开始向林中走去。刚迈一步，“嘶啦”长衫就挂破了。

    “哈哈哈哈……”三个家伙笑成一团，也忘了打架。

    这是一片紫色的从林，到处长着奇怪的植物和飘着终年不散的青色雾气，越往里走，脚下就越潮湿，头上就越昏暗，最后枝叶完全遮蔽了天空，唐僧也完全迷路了。

    “好啊，这么多的生机，这么多不同的生命！”唐僧高兴的说。

    “谢谢！”有个声音回答他。

    唐僧一回头，看见一颗会说话的树，紫黑色树干上有两只一眨一眨的眼睛。

    “真是惊奇，我看见了一个妖怪，我喜欢能超越常理的东西，生命果然是很奇妙的事啊，让我摸摸你，土里的精灵。”唐僧伸出手去，欣喜的抚摸着树干。

    那树干上泌满紫色的汁液，摸上去湿滑无比。

    树很惬意的接受着抚摸，它的几万下垂的分枝都不禁舒畅的摇动起来。

    “呵，