In [1]:
from langchain_community.chat_models import ChatZhipuAI

In [2]:
import os

# Never commit credentials to a notebook: read the ZhipuAI key from the
# ZHIPUAI_API_KEY environment variable instead. The key that used to be
# hardcoded here has been exposed and should be revoked and rotated.
# The value is None when the variable is unset; the client constructed later
# is then expected to reject the missing key (TODO confirm the exact error).
ZHIPUAI_API_KEY = os.environ.get("ZHIPUAI_API_KEY")

In [3]:
# Chat model client used for the smoke test below and, later, as the LLM of
# the question-answering chain. Uses the "glm-3-turbo" model and the
# notebook-level ZHIPUAI_API_KEY.
llm = ChatZhipuAI(
    model="glm-3-turbo",
    api_key=ZHIPUAI_API_KEY,
)

In [4]:
# Smoke test: send a short greeting ("Hello!") to the model and print the
# returned message to confirm the key and model name work.
print(llm.invoke("你好！"))

content='你好👋！我是人工智能助手智谱清言（ChatGLM），很高兴见到你，欢迎问我任何问题。' response_metadata={'finish_reason': 'stop', 'index': 0}


In [5]:
import chromadb

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
# from zhipuai_embedding import ZhipuAIEmbeddings
import zhipuai

In [28]:
# Load the source PDF and cut it into retrieval-sized chunks.
# "eBPF.pdf" is resolved relative to the notebook's working directory.
loader = PyPDFLoader("eBPF.pdf")
documents = loader.load()
# chunk_size=1000 with no overlap between neighbouring chunks
# (presumably measured in characters — TODO confirm the splitter's length function).
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# `texts` holds document objects, not plain strings: the raw text of each
# chunk lives in its `.page_content` attribute.
texts = splitter.split_documents(documents)

In [27]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# executes; the notebook merely echoes the string. The disabled variant loads
# and splits the same PDF, then flattens the chunks to plain strings via
# `.page_content` and prints them.
"""
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 加载PDF文件
loader = PyPDFLoader("eBPF.pdf")
documents = loader.load()

# 使用RecursiveCharacterTextSplitter分割文档
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = splitter.split_documents(documents)

# 确保texts中只包含字符串
texts = [text.page_content for text in texts]

print(texts)

# 输出结果
for text in texts:
    print(text)
    print("----------")  # 文本块分隔线
"""

'\nfrom langchain.document_loaders import PyPDFLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\n# 加载PDF文件\nloader = PyPDFLoader("eBPF.pdf")\ndocuments = loader.load()\n\n# 使用RecursiveCharacterTextSplitter分割文档\nsplitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\ntexts = splitter.split_documents(documents)\n\n# 确保texts中只包含字符串\ntexts = [text.page_content for text in texts]\n\nprint(texts)\n\n# 输出结果\nfor text in texts:\n    print(text)\n    print("----------")  # 文本块分隔线\n'

In [11]:
import logging
import os
from typing import List, Optional, Union, Literal, Set, Sequence, Any

import numpy as np
from langchain.embeddings.base import Embeddings
from pydantic import BaseModel
import zhipuai

logger = logging.getLogger(__name__)


class ZhipuAIEmbeddings(BaseModel, Embeddings):
    """LangChain ``Embeddings`` adapter for the ZhipuAI ``text_embedding`` model.

    Fields:
        api_key: Optional ZhipuAI API key. When set, it is assigned to
            ``zhipuai.api_key`` before requests are issued; when ``None`` the
            SDK's existing configuration is left untouched.
    """
    api_key: Optional[str] = None

    def embed_documents(
            self, texts: List[str], chunk_size: Optional[int] = 0
    ) -> List[List[float]]:
        """Embed every text and return one vector per input, in input order.

        Args:
            texts: Strings to embed; each one is sent as a separate request.
            chunk_size: Unused; kept so existing callers keep working.

        Returns:
            A list containing exactly ``len(texts)`` embedding vectors.

        Raises:
            RuntimeError: If a text still has no embedding after three
                attempts. Skipping the text instead would leave the result
                shorter than ``texts`` and silently pair vectors with the
                wrong documents.
        """
        if self.api_key:
            # Apply the configured key; without this the field had no effect.
            zhipuai.api_key = self.api_key

        max_attempts = 3
        embeddings: List[List[float]] = []
        for text in texts:
            response = None
            for attempt in range(1, max_attempts + 1):
                response = zhipuai.model_api.invoke(
                    model="text_embedding",
                    prompt=text
                )
                succeeded = response is not None and 'code' in response and response["code"] == 200
                vector = response["data"].get('embedding') if succeeded else None
                if vector is not None:
                    embeddings.append([float(value) for value in vector])
                    break  # success: move on to the next text
                logger.warning(
                    "zhipu embeddings request failed (attempt %d/%d) with response %s, text= %s",
                    attempt, max_attempts, response, text,
                )
            else:
                # Every attempt failed: fail loudly to keep vectors aligned with texts.
                raise RuntimeError(
                    f"call zhipu embedding fail {max_attempts} times, "
                    f"text={text!r}, response={response!r}"
                )
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string by delegating to ``embed_documents``."""
        return self.embed_documents([text])[0]

In [12]:
# The model's field is named `api_key`; the previous `zhipuai_api_key=` keyword
# did not match any declared field, so the key never reached the instance.
embeddings = ZhipuAIEmbeddings(api_key=ZHIPUAI_API_KEY)

In [13]:
from chromadb.utils import embedding_functions
# Chroma's built-in default embedding function; it is called directly on the
# chunk texts below to produce the vectors stored in the collection.
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [14]:
# `texts` holds document objects produced by split_documents(); the embedding
# function operates on plain strings, so embed each chunk's page_content.
embeddings = default_ef([doc.page_content for doc in texts])

In [23]:
# Display the computed embeddings.
# NOTE(review): this renders the entire nested list of floats; consider
# showing only a slice to keep the notebook output small.
embeddings

[[-0.03535044938325882,
  -0.026744157075881958,
  -0.004917468875646591,
  -0.02066902257502079,
  0.08047736436128616,
  0.025116007775068283,
  0.012537592090666294,
  -0.0006349729956127703,
  -0.13707153499126434,
  -0.015102282166481018,
  0.02363971434533596,
  -0.03718534857034683,
  0.018773261457681656,
  -0.09785469621419907,
  -0.035791460424661636,
  0.01632484421133995,
  0.0673767626285553,
  -0.020882517099380493,
  0.06001536548137665,
  -0.046327464282512665,
  -0.04868538677692413,
  0.01266466174274683,
  -0.05416809767484665,
  -0.016746891662478447,
  0.009515886195003986,
  0.01494204718619585,
  0.028708305209875107,
  -0.018380433320999146,
  0.011423133313655853,
  -0.09331679344177246,
  0.022779563441872597,
  0.0064666736871004105,
  0.0013144811382517219,
  0.006804754491895437,
  -0.06079910695552826,
  0.027625899761915207,
  0.03247741609811783,
  -0.06041150167584419,
  0.001654198276810348,
  -0.07744220644235611,
  -0.06305710971355438,
  -0.02481181

In [16]:
# Chroma client created with default settings
# (presumably an in-process, non-persistent store — TODO confirm).
client = chromadb.Client()

In [17]:
# Fetch the collection named "bpf", creating it if it does not exist yet, so
# re-running this cell does not fail on an already existing collection.
bpf_collection0 = client.get_or_create_collection(name="bpf")

In [18]:
# `texts` holds document objects; the collection stores plain strings, so
# extract each chunk's page_content before adding it.
chunk_texts = [doc.page_content for doc in texts]
bpf_collection0.add(
    embeddings=embeddings,  # precomputed vectors, one per chunk
    documents=chunk_texts,  # raw chunk text
    metadatas=[{"source": "pdf_extraction"} for _ in chunk_texts],  # one metadata dict per chunk
    ids=[f"doc{i}" for i in range(len(chunk_texts))],  # positional document ids
)

In [19]:
# Ask the collection for the 2 stored chunks closest to the question.
# NOTE(review): the query text is embedded by the collection itself; this
# presumably uses the same default embedding function as the stored vectors
# — confirm, otherwise the distances are not meaningful.
results = bpf_collection0.query(
    query_texts=["What is the bpf?"],
    n_results=2
)

In [20]:
# Inspect the raw query result (ids, distances, metadatas, documents).
results

{'ids': [['doc236', 'doc14']],
 'distances': [[0.7042558789253235, 0.8043650388717651]],
 'metadatas': [[{'source': 'pdf_extraction'}, {'source': 'pdf_extraction'}]],
 'embeddings': None,
 'documents': [['74 | Chapter 4: The bpf() System Call',
   'BPF to BPF Calls                                                                                                              54\nSummary                                                                                                                           56\nExercises                                                                                                                            56\n4.The bpf() System Call. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .  59\nLoading BTF Data                                                                                                            63\nCreating Maps                                                                       

In [21]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [24]:
# Sentence-transformer embeddings with default arguments; the repr shown by
# the next cell reports the model as sentence-transformers/all-mpnet-base-v2.
embedding = SentenceTransformerEmbeddings()

In [25]:
# Show the embedding object's configuration (model name, pooling, etc.).
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [29]:
# Build a LangChain Chroma vector store from the document chunks in `texts`,
# embedded with `embedding`, using the local "db" directory for persistence.
vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory="db")

In [31]:
# Write the store to its persist directory, then drop the in-memory handle;
# a later cell reopens the store from the "db" directory.
vectordb.persist()
vectordb = None

In [30]:
from langchain.chains import VectorDBQA

In [33]:
# Reopen the persisted store from "db"; the same `embedding` object is passed
# so that queries are embedded with the model used when the store was built.
vectordb = Chroma(persist_directory="db", embedding_function=embedding)

In [35]:
# VectorDBQA is deprecated in LangChain; RetrievalQA is its replacement and
# takes a retriever instead of a raw vector store. The "stuff" chain type
# places all retrieved chunks into a single prompt for the LLM.
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=vectordb.as_retriever())



In [38]:
# Question put to the QA chain (Chinese for "What is bpf?").
query = "什么是bpf？"

In [39]:
# Run the QA chain on the question; the returned answer string is shown as
# the cell output.
qa.run(query)

'BPF（Berkeley Packet Filter）是一种在Unix类操作系统中用于网络数据包过滤的机制。它最初由UC Berkeley开发，用于增强网络监控工具的性能。后来，BPF这一概念被Linux内核采用，并得到了广泛的支持和发展。\n\nBPF允许网络接口在混杂模式下工作，这意味着它能够捕获流经网络接口的所有数据包，而不仅仅是那些目的地为本地主机的数据包。此外，BPF还提供了一种机制，允许用户空间程序提供过滤规则，告诉内核只把感兴趣的数据包传递给用户空间的应用程序，这样就可以避免不必要的数据包处理，提高系统性能。\n\n随着时间的推移，BPF的功能得到了扩展，不仅限于网络数据包过滤，还扩展到了内核和用户空间事件监控、性能分析、安全监控等多个领域。现在的BPF已经成为一个功能强大的程序执行引擎，允许用户在内核中运行自定义的程序代码，执行各种任务。\n\nBPF程序通常由内核执行，它们运行在内核空间，并且受到内核严格的权限控制和安全检查。这使得BPF即使运行自定义代码也非常安全，因为它们不能直接访问用户的文件或其他资源，也不能干扰系统的稳定运行。\n\n此外，BPF还支持类型格式（BTF），这是一种元数据格式，用于编码与BPF程序和映射（maps）相关的调试信息，比如数据类型、函数签名和行信息等。BTF有助于改善BPF程序的调试和性能分析。\n\n总之，BPF是一种高效且功能强大的机制，允许用户在内核级别进行精细的操作控制和数据分析，而不需要修改内核代码或loadable modules。它广泛应用于网络监控、性能调优、安全审计等多个领域。'