# ELS作为矢量数据库的使用

本笔记本展示了如何使用 ELS Vector Search 将嵌入向量存储在 ELS 文档中、创建向量搜索索引，并使用近似最近邻算法（分层可导航小世界，HNSW）执行 KNN 搜索。

在本笔记本中，我们将演示如何使用 ELS、OpenAI 和 Langchain 执行检索增强生成（Retrieval Augmented Generation，RAG）。

In [88]:
from langchain.vectorstores.elasticsearch import ElasticsearchStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

# Embedding model used to vectorise text.
# NOTE(review): the API key and base URL are empty strings here — presumably
# redacted placeholders; confirm how credentials are supplied before running.
embedding = OpenAIEmbeddings(openai_api_key="", openai_api_base="")

# Chat (large language) model, configured with the same empty key / base URL.
model = ChatOpenAI(openai_api_key="", openai_api_base="")




In [103]:
# ELS vector store: bound to one index on the given cluster URL and handed the
# `embedding` object created in an earlier cell.
elastic_vector_search = ElasticsearchStore(
    index_name="multi_index_1",
    embedding=embedding,
    es_url="http://154.204.60.125:9200",
)


In [90]:


from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import CharacterTextSplitter
# Splitter configuration: chunk_size=500 with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Load the Markdown file and cut the loaded documents into chunks.
loader = UnstructuredMarkdownLoader("txt.md")
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Add the chunks to the vector store; the call's return value is the cell
# result (the recorded output is a list of ID strings).
elastic_vector_search.add_documents(docs)





['c7768364-da3f-4f31-a276-0a9ea7b9bc81',
 '7fa560fe-2708-47c5-b6b0-5e5c3f17a761',
 '418e364b-23cd-4cc5-9992-b3430bbdc6ed',
 '30699102-982a-4eff-8567-801f170d8a6d',
 '2ffed417-e887-4fbc-9cf4-722a76589d93',
 '2b7e34c9-6acc-4bfd-9123-8551a40ef40c',
 'f1e55460-64a4-48fa-983d-24c6d74de625',
 '313bbc5a-7f94-43c6-8143-fe25b34f8c0f',
 'a991a903-3e96-4cf1-91f7-f1d3dc0a44ee',
 'bc6a1b8b-d0f4-4bbc-8b09-671a09e8e493',
 '7df0a423-0f50-4727-a0df-3f158eb485c8',
 '5523dfd5-c5e2-45d9-92a2-3f6babca7a6b',
 'a1c0fabd-41fb-497f-af90-871a2c4fad64',
 '82c361fe-6e01-4cb7-a1f5-ed33cc299eb0',
 'cd252fa3-5186-4262-94d9-0f3dfee8bc42',
 'cd73d8d6-646b-4b25-9619-10e779132158',
 'c45ba8a8-6f8d-44b5-bd5c-e93868e5d33b',
 'c5047ec6-e835-4de5-853c-e9e46768bb28',
 '91dc8ea1-f7f8-4890-9c9f-cb56fa201665',
 'cc8b1751-81a5-453c-9fcd-99c959c97b20',
 'c6071e9d-1be5-4807-a0a1-b7465589b258',
 'a6918bb8-5781-4e5a-a9d2-d2a8aee9cf57',
 'a61c000e-8c2d-48b6-8b2f-43e5cb290ff4',
 'd108c097-74d0-4a1d-95c5-befb7e45c778',
 '5726cb6a-98c9-

In [91]:
# Question reused by the retrieval examples in the following cells.
query = "gpt4free git地址在哪"
# Plain similarity search against the vector store with its default settings.
results = elastic_vector_search.similarity_search(query)
print(results)

[Document(page_content='This program is licensed under the GNU GPL v3\n\n```\nxtekky/gpt4free: Copyright (C) 2023 xtekky\n\nThis program is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.', metadata={'source': 'txt.md'}), Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md'}), Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md', 'doc_id': '89a5dc89-4db5-42ec-8f74-356964e96558'}), Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md', 'doc_id': 'cf1aec5b-7fd6-45f7-a38c-252e027065a8'})]


In [92]:
# Multi-query retriever: built from the vector store's retriever plus the chat
# model (the recorded log output shows the model generating several
# rephrasings of the question).
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=model,
    retriever=elastic_vector_search.as_retriever(),
)


In [93]:
# Turn on INFO-level logging for the multi-query retriever's logger so that
# the generated queries appear in the cell output.
import logging

logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [94]:
# The former `top_k=3` argument had no effect: this retriever does not act on
# it, and the recorded run still returned four documents. It has been removed
# so the call no longer suggests a limit that is not applied. To cap the hits
# per generated query, configure the underlying retriever instead, e.g.
# `elastic_vector_search.as_retriever(search_kwargs={"k": 3})`.
retriever_from_llm.get_relevant_documents(query)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Where can I find the git address for gpt4free?', '2. What is the location of the git address for gpt4free?', '3. Can you provide me with the git address for gpt4free?']


[Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md'}),
 Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md', 'doc_id': '89a5dc89-4db5-42ec-8f74-356964e96558'}),
 Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md', 'doc_id': 'cf1aec5b-7fd6-45f7-a38c-252e027065a8'}),
 Document(page_content='🔗 Related GPT4Free Projects', metadata={'source': 'txt.md', 'doc_id': 'd215c0b0-b56d-4aa8-807d-33195a223ebc'})]

In [95]:
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever

# Compressor: an embeddings-based filter built on the same embedding model,
# configured with a similarity threshold of 0.76.
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=0.76,
    embeddings=embedding,
)

# Pair the filter with the vector store's retriever.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=elastic_vector_search.as_retriever(),
    base_compressor=embeddings_filter,
)

compression_retriever.get_relevant_documents(query)





[_DocumentWithState(page_content='This program is licensed under the GNU GPL v3\n\n```\nxtekky/gpt4free: Copyright (C) 2023 xtekky\n\nThis program is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.', metadata={'source': 'txt.md'}, state={'embedded_doc': [0.017862426160816954, -0.007693581369265972, -0.022204993199913457, -0.05312407047828056, -0.011116971100087044, 0.016241200146220918, -0.01028464609259355, -0.009293093083666513, -0.01393240212543461, -0.022986654206950816, 0.03986476435890589, 0.026663361240052526, -0.02111935019013932, -0.0038142214343397625, -0.008547619076954947, 0.016675457150130572, 0.006050643554474463, -0.013512621121655289, 0.006018074054181237, 0.009300331083731678, 0.02368146621320627, 0.04229660338079995, -0.030600622275500023, -0.018050604162511138, 0.03080327627732454, 0.0140554

In [104]:
## MultiVector Retriever

# Reload the Markdown source.
loader = UnstructuredMarkdownLoader("txt.md")
documents = loader.load()

# Split into the larger "parent" chunks (chunk_size=2000, chunk_overlap=100).
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)
print("docs:", docs)
print("docslength:", len(docs))

from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

# In-memory store for the parent documents.
store = InMemoryByteStore()

# Metadata key written onto every child chunk to point at its parent's ID.
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=elastic_vector_search,
    byte_store=store,
    id_key=id_key,
)

import uuid

# One random UUID string per parent chunk.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Second splitter that cuts each parent into smaller child chunks.
# NOTE(review): chunk_overlap is left at the library default here — confirm
# that default is appropriate for a chunk_size of 200.
child_text_splitter = CharacterTextSplitter(chunk_size=200)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    children = child_text_splitter.split_documents([parent_doc])
    # Tag each child with the ID of the parent it was cut from.
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)
print("sub_docs:", sub_docs)

# Child chunks go into the vector store; parent chunks go into the docstore
# keyed by the same IDs.
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))


  




docs: [Document(page_content="By using this repository or any code related to it, you agree to the legal notice. The author is not responsible for any copies, forks, re-uploads made by other users, or anything else related to GPT4Free. This is the author's only account and repository. To prevent impersonation or irresponsible actions, please comply with the GNU GPL license this Repository uses.\n\n[!Note]\nLatest pypi version: 0.1.9.2\nsh\npip install -U g4f\n\n🆕 What's New\n\nJoin our Telegram Channel: t.me/g4f_channel\n\nJoin our Discord Group: discord.gg/XfybzPXPH5\n\nExplore the g4f Documentation (unfinished): g4f.mintlify.app | Contribute to the docs via: github.com/xtekky/gpt4free-docs\n\n📚 Table of Contents\n\n🆕 What's New\n\n📚 Table of Contents\n\n🛠️ Getting Started\nPrerequisites:\nSetting up the project:\nInstall using pypi\nor\nSetting up with Docker:\n\n💡 Usage\n\nThe g4f Package\nChatCompletion\nCompletion\nProviders\nCookies Required\nAsync Support\nProxy and Timeout Supp

In [105]:
# Direct similarity search on the underlying vector store, taking the first hit.
# NOTE: this is not the cell's last expression, so its value is not displayed.
retriever.vectorstore.similarity_search("项目地址在哪")[0]
# Retrieval through the multi-vector retriever; this is the value the cell displays.
retriever.get_relevant_documents(query)

[Document(page_content='Models\n\nModel \n Base Provider \n Provider \n Website \n \n palm \n Google \n g4f.Provider.Bard \n bard.google.com \n \n h2ogpt-gm-oasst1-en-2048-falcon-7b-v3 \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n h2ogpt-gm-oasst1-en-2048-falcon-40b-v1 \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n h2ogpt-gm-oasst1-en-2048-open-llama-13b \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n claude-instant-v1 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n claude-v1 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n claude-v2 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n command-light-nightly \n Cohere \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n command-nightly \n Cohere \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n gpt-neox-20b \n Hugging Face \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n oasst-sft-1-pythia-12b \n Hugging Face \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n oasst-sft-4-pythia-12b-epoch-3.5 \n Hugging

In [130]:
# Select the retriever's search type, then query again.
# NOTE(review): the original heading of this cell read "max marginal relevance
# search", but the code selects SearchType.similarity — confirm which one is
# intended (SearchType.mmr would request MMR instead).
from langchain.retrievers.multi_vector import SearchType

retriever.search_type = SearchType.similarity

retriever.get_relevant_documents("你知道 gpt4free git地址吗")


[Document(page_content='Models\n\nModel \n Base Provider \n Provider \n Website \n \n palm \n Google \n g4f.Provider.Bard \n bard.google.com \n \n h2ogpt-gm-oasst1-en-2048-falcon-7b-v3 \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n h2ogpt-gm-oasst1-en-2048-falcon-40b-v1 \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n h2ogpt-gm-oasst1-en-2048-open-llama-13b \n Hugging Face \n g4f.Provider.H2o \n www.h2o.ai \n \n claude-instant-v1 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n claude-v1 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n claude-v2 \n Anthropic \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n command-light-nightly \n Cohere \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n command-nightly \n Cohere \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n gpt-neox-20b \n Hugging Face \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n oasst-sft-1-pythia-12b \n Hugging Face \n g4f.Provider.Vercel \n sdk.vercel.ai \n \n oasst-sft-4-pythia-12b-epoch-3.5 \n Hugging

In [136]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Extend the LLM's knowledge with retrieved content (RAG).
# Prompt template with two placeholders: {context} and {question}.
# The first line used to end with a stray `"` that was sent to the model as
# part of every prompt; it has been removed.
template = """基于下面的内容回答下面的问题。

内容:{context}

问题:{question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chain: the input question is given to the retriever to fill {context} and is
# passed through unchanged as {question}; the filled prompt goes to the chat
# model, and the reply is parsed into a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke("你知道 gpt4free  git 下载地址吗？")


'答案: 是的，gpt4free的git下载地址是https://github.com/xtekky/gpt4free.git。'