### Embed_documents

In [4]:
from langchain.embeddings import DashScopeEmbeddings  # Alibaba DashScope embedding model

e_model = DashScopeEmbeddings()

# embed_documents takes a list of texts and returns a list of embedding
# vectors (each vector is a list of floats).
embeddings = e_model.embed_documents(
    [
        "The quick brown fox jumps over the lazy dog.",
        "The rain in Spain stays mainly in the plain.",
        "In the middle of difficulty lies opportunity.",
        "To be or not to be, that is the question.",
        "All that glitters is not gold.",
        "A journey of a thousand miles begins with a single step.",
        "The only thing we have to fear is fear itself.",
        "The pen is mightier than the sword.",
        "A picture is worth a thousand words.",
    ]
)

# Only the last expression of a cell is displayed; a bare `len(embeddings)`
# in the middle of the cell is evaluated and thrown away, so print it.
print(len(embeddings))

embeddings

[[3.2923457622528076,
  0.9054917693138123,
  -1.9331774711608887,
  -2.419536828994751,
  0.9491921663284302,
  -1.6599777936935425,
  -0.7527825832366943,
  -1.4724750518798828,
  1.0797384977340698,
  0.008359761908650398,
  -2.157008171081543,
  -1.140151023864746,
  -0.25641340017318726,
  0.7352514266967773,
  3.1682066917419434,
  -1.2807042598724365,
  2.1359519958496094,
  1.5472067594528198,
  -0.35179373621940613,
  2.5363519191741943,
  1.7707394361495972,
  -2.970102071762085,
  3.3929224014282227,
  2.6335442066192627,
  -2.9144835472106934,
  0.04718486964702606,
  -0.4858641028404236,
  -2.7417335510253906,
  -3.031888484954834,
  1.1598284244537354,
  3.198843240737915,
  -1.1506097316741943,
  -1.5548189878463745,
  0.37152099609375,
  0.3215879797935486,
  3.4725897312164307,
  -2.5707576274871826,
  -0.049935366958379745,
  -1.1843512058258057,
  -0.7219778299331665,
  -0.9821010231971741,
  -0.4001457095146179,
  1.8440380096435547,
  1.6771913766860962,
  -4.63066

### Embed_query

In [5]:
# Embed a single query string with the same model used for the documents;
# the result is one vector (a flat list of floats).
embedded_query = e_model.embed_query(
    "这段话中提到了什么名词"
)
embedded_query

[-0.6958211064338684,
 -0.7356499433517456,
 1.8689507246017456,
 1.9249131679534912,
 3.1011013984680176,
 -0.11026594042778015,
 -2.7991366386413574,
 2.050485372543335,
 -1.0729844570159912,
 0.8924492597579956,
 -0.8721482753753662,
 -0.718597412109375,
 -0.3935953676700592,
 -0.0167236328125,
 -1.7421468496322632,
 -1.4745992422103882,
 -0.00026448568678461015,
 1.1113553047180176,
 -1.5558065176010132,
 0.2763502299785614,
 -0.0677218958735466,
 0.09993192553520203,
 0.5563185214996338,
 -0.5236680507659912,
 0.6965671181678772,
 -0.4310663044452667,
 1.1611666679382324,
 0.0128648541867733,
 0.1929931640625,
 -1.0591294765472412,
 -2.347193479537964,
 -2.983184814453125,
 1.1365560293197632,
 -1.0435316562652588,
 -1.8030564785003662,
 -1.0512356758117676,
 -0.9225074052810669,
 2.8464457988739014,
 0.8695186972618103,
 -1.3748915195465088,
 -1.727700114250183,
 -0.7726033329963684,
 3.483778238296509,
 -0.2199571430683136,
 2.1576640605926514,
 -0.9940422773361206,
 -0.59849762

### 嵌入向量缓存

In [6]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.embeddings import DashScopeEmbeddings
from langchain.storage import LocalFileStore
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Underlying embedding model whose results will be cached.
u_embeddings = DashScopeEmbeddings()

# File-backed byte store rooted at ./cache (relative to the working directory).
cache = LocalFileStore("./cache")

# Wrap the model so computed embeddings are written to / read from `cache`.
# The model name is used as the namespace, i.e. as the prefix of every cache key.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    u_embeddings, cache, namespace=u_embeddings.model
)

# Show the keys currently stored in the cache.
[*cache.yield_keys()]

[]

In [11]:
# Load the source document and split it into overlapping chunks.
# This cell only loads and splits; the chunks are embedded (and the cache is
# filled) when they are passed to the vector store together with
# `cached_embeddings`.

text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=30)
raw_documents = TextLoader("letter.txt").load()
documents = text_splitter.split_documents(raw_documents)

Created a chunk of size 159, which is longer than the specified 100
Created a chunk of size 276, which is longer than the specified 100


In [12]:
! pip install faiss-cpu



In [13]:
from langchain.vectorstores import FAISS
%timeit -r 1 -n 1 db=FAISS.from_documents(documents, cached_embeddings)  # 用于测量一小段代码的执行时间

1.42 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
# List the cache keys again to see what the cache holds after the index was built.
[*cache.yield_keys()]

['text-embedding-v1085c801b-edde-5615-98de-001c83b9451e',
 'text-embedding-v15b88e1e6-ab94-5fcd-804c-7bf129f4dad3',
 'text-embedding-v19e934ae9-c70f-554c-a506-6a8b95815d5f',
 'text-embedding-v1e074eca4-d751-59ba-83f7-712631e61bca',
 'text-embedding-v1e6c93325-9e0d-54dd-a37e-9153f6f4d4ff',
 'text-embedding-v1e8c8ac42-d839-53d8-92ea-f01aa655bf74']

### 向量数据库
- 向量数据：用空间描述高维数据，用距离判断亲疏
- 向量数据库处理高维数据具备天然优势
- 是图形处理、推荐系统背后的英雄
- 管理：以原始数据形式处理数据，管理更加有效
- 存储：能够存储向量数据以及AI需要的高维数据
- 检索：可以高效检索数据，AI非常需要的特性
- 让AI具备了记忆能力