In [1]:
!pip install  -q langchain_community pypdf langchain-text-splitters tdqm  sentence_transformers memvectordb-python

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.2/121.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m3.6

In [6]:
from sentence_transformers import SentenceTransformer, util
from memvectordb.collection import MemVectorDB
# Embedding model used for both the stored chunks and the query text.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

collection_name = "collection_new"
# Read the vector size from the model instead of hard-coding it, so the
# collection dimension cannot drift from the model if the name above changes.
dimension = model.get_sentence_embedding_dimension()
distance = "cosine"
# NOTE(review): "base-url" looks like a placeholder - replace it with the real
# MemVectorDB server address before running.
client = MemVectorDB(base_url = "base-url")
collection = client.create_collection(collection_name, dimension, distance)
collection

'Collection created: "collection_new"'

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from uuid import uuid4
from tqdm.auto import tqdm

# batch insertion
def _insert_batch(texts, metadatas):
    """Embed ``texts`` and insert each vector, with its metadata, into the collection.

    Uses the notebook-level ``model``, ``client`` and ``collection_name``.
    Does nothing when ``texts`` is empty, so it is safe to call for a final flush.
    """
    if not texts:
        return

    vectors = model.encode(texts).tolist()
    for vector, chunk_metadata in zip(vectors, metadatas):
        # NOTE(review): called once per embedding with a fresh UUID as the id,
        # exactly as before; confirm this matches the client's intended
        # signature for `batch_insert_embeddings`.
        client.batch_insert_embeddings(collection_name, str(uuid4()), vector, chunk_metadata)


def insert_document(document , batch_size: int, chunk_size: int):
    """Split each record into chunks, embed them and insert them in batches.

    Args:
        document: Iterable of records exposing a ``page_content`` string.
        batch_size: Once at least this many chunks have accumulated (checked
            after each record), they are embedded and inserted together.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``
            (length is measured with ``len``).

    Returns:
        None. Chunks are written to the collection as a side effect.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=20,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    texts = []
    metadatas = []
    for record in tqdm(document):
        # Every chunk carries the full text of the record it came from.
        record_metadata = {
            'content': record.page_content
        }
        record_texts = text_splitter.split_text(record.page_content)
        texts.extend(record_texts)
        metadatas.extend(
            {"chunk": str(j), "text": text, **record_metadata}
            for j, text in enumerate(record_texts)
        )

        if len(texts) >= batch_size:
            _insert_batch(texts, metadatas)
            texts = []
            metadatas = []

    # Flush the remainder: without this, chunks accumulated after the last full
    # batch (or all chunks, if batch_size is never reached) would be dropped.
    _insert_batch(texts, metadatas)

In [8]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path


def load_doc(file_url):
    """Load a PDF from a local path or URL and return its split pages.

    The file type is decided from the extension, compared case-insensitively.
    For ``http``/``https`` URLs only the path component is inspected, so a
    query string or fragment (``paper.pdf?download=1``) does not hide the
    ``.pdf`` suffix, and a ``.pdf`` inside the query does not fake one.

    Args:
        file_url: Path or URL of the document to load.

    Returns:
        The result of ``PyPDFLoader(file_url).load_and_split()`` for PDF
        inputs; an empty list for any other extension.
    """
    from urllib.parse import urlsplit

    location = str(file_url)
    try:
        parts = urlsplit(location)
    except ValueError:
        # Not parseable as a URL; fall back to treating it as a plain path.
        parts = None
    if parts is not None and parts.scheme in ("http", "https"):
        location = parts.path

    if Path(location).suffix.lower() != ".pdf":
        return []

    # Pass the caller's original value through so the loader sees the full URL.
    return PyPDFLoader(file_url).load_and_split()

# Load the PDF at this URL through load_doc (PyPDFLoader.load_and_split under the hood).
doc = load_doc("https://arxiv.org/pdf/1706.03762.pdf")

In [9]:
# Embed and insert once at least this many chunks have accumulated.
batch_size = 30
# chunk_size given to the text splitter (length is measured with len).
chunk_size = 256
insert_document(doc, batch_size, chunk_size )

  0%|          | 0/16 [00:00<?, ?it/s]

In [14]:
# Embed the question with the same model used for the stored chunks, then ask
# the collection for its top k = 1 result.
query = "what is the attention mechanism?"
query_vector = model.encode(query).tolist()
result = client.query(
    query_vector = query_vector,
    collection_name = collection_name,
    k = 1,
)
result

[{'score': 0.65346235,
  'embedding': {'id': 'ea0ac37f-b2cc-483d-b208-25be4a1ae7ae',
   'vector': [-0.02957984,
    -0.07771841,
    -0.02193042,
    -0.0070293294,
    0.05387133,
    -0.073714346,
    0.10997306,
    -0.023968456,
    0.106517926,
    -0.07827948,
    0.062276337,
    0.058000654,
    -0.003868588,
    -0.057311133,
    -0.036128294,
    -0.07089975,
    0.10078816,
    -0.0027015186,
    -0.009905206,
    -0.009741886,
    0.057892,
    -0.0043108054,
    -0.038469676,
    -0.0023321162,
    -0.035815414,
    -0.063471444,
    0.00032989087,
    0.044297166,
    0.027801761,
    0.013771573,
    0.078275904,
    0.03665813,
    0.048109267,
    0.0005472465,
    -0.0742276,
    0.046517864,
    -0.017750213,
    -0.0063350936,
    -0.011431679,
    -0.027803648,
    -0.023567457,
    0.03851022,
    0.050280724,
    -0.0052505885,
    0.01915858,
    0.029644348,
    -0.020597527,
    -0.0017969541,
    0.0007078843,
    -0.021194506,
    -0.0813781,
    -0.00516256