In [1]:
!pip install  -q langchain_community pypdf langchain-text-splitters tdqm  sentence_transformers memvectordb-python

[33m  DEPRECATION: tdqm is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from sentence_transformers import SentenceTransformer, util
from memvectordb.collection import MemVectorDB
# Embedding model shared by document ingestion and querying.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

collection_name = "collection_1"
# Size the collection from the model instead of hard-coding it: the vectors
# inserted later come from `model.encode`, so a fixed value of 3 would not
# match the length of the embeddings this model produces.
dimension = model.get_sentence_embedding_dimension()
distance = "cosine"
# NOTE(review): "base-url" looks like a placeholder — replace it with the
# address of your MemVectorDB server before running.
client = MemVectorDB(base_url = "base-url")
collection = client.create_collection(collection_name, dimension, distance)
collection

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from uuid import uuid4
from tqdm.auto import tqdm


def _insert_batch(texts, metadatas):
    """Embed ``texts`` with the notebook-level ``model`` and insert one vector per chunk.

    Does nothing when ``texts`` is empty. Each vector is stored under a fresh
    UUID4 string id together with its metadata dict.
    """
    if not texts:
        return

    vectors = model.encode(texts).tolist()
    for vector, chunk_metadata in zip(vectors, metadatas):
        # NOTE(review): despite its name, `batch_insert_embeddings` is called
        # once per vector here, exactly as the original code did — confirm the
        # expected signature against the memvectordb client documentation.
        client.batch_insert_embeddings(collection_name, str(uuid4()), vector, chunk_metadata)


def process_document(document, batch_size: int, chunk_size: int):
    """Split each page of ``document`` into chunks, embed them, and insert them.

    Relies on the notebook-level ``model``, ``client`` and ``collection_name``.

    Args:
        document: Iterable of records exposing a ``page_content`` string.
        batch_size: Once at least this many chunks have accumulated, they are
            embedded and inserted, and the buffers are reset.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``
            (measured with ``len``); the chunk overlap is fixed at 20.

    Returns:
        None. Chunks are written to the vector store as a side effect.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=20,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    texts = []
    metadatas = []
    for record in tqdm(document):
        record_texts = text_splitter.split_text(record.page_content)
        texts.extend(record_texts)
        # Every chunk carries its index within the page, its own text, and the
        # full text of the page it was cut from.
        metadatas.extend(
            {"chunk": str(j), "text": text, "content": record.page_content}
            for j, text in enumerate(record_texts)
        )

        if len(texts) >= batch_size:
            _insert_batch(texts, metadatas)
            texts = []
            metadatas = []

    # Flush the final partial batch; without this, any chunks left over after
    # the last page (fewer than `batch_size`) would never be inserted.
    _insert_batch(texts, metadatas)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path


def load_doc(file_url):
    """Load a PDF from a local path or URL and return its split pages.

    The extension check is case-insensitive, and for http(s) URLs it looks at
    the URL path only, so a query string or fragment does not hide ``.pdf``.

    Args:
        file_url: Path or URL of the document to load.

    Returns:
        The result of ``PyPDFLoader(file_url).load_and_split()`` for PDF
        inputs; an empty list for any other extension.
    """
    from os import fspath
    from urllib.parse import urlsplit

    location = fspath(file_url)
    parts = urlsplit(location)
    # Only treat the input as a URL for http(s); a local file name may
    # legitimately contain '?' or '#', which URL parsing would cut off.
    path_part = parts.path if parts.scheme in ("http", "https") else location

    if Path(path_part).suffix.lower() != ".pdf":
        return []

    loader = PyPDFLoader(file_url)
    return loader.load_and_split()

# Load the PDF to index. `load_doc` returns an empty list for inputs whose
# extension is not ".pdf", so `doc` is always a list.
doc = load_doc("https://arxiv.org/pdf/1706.03762.pdf")

In [None]:
# batch_size: number of accumulated chunks that triggers an embed-and-insert.
# chunk_size: chunk length (in characters) handed to the text splitter.
batch_size = 30
chunk_size = 256
# Split, embed and insert the loaded pages into the collection.
process_document(doc, batch_size, chunk_size )

In [None]:
# Embed the question with the same model used for the document chunks and
# query the collection with k=1.
query = "what is the attention mechanism?"
query_vector = model.encode(query).tolist()
result = client.query(
    collection_name=collection_name,
    query_vector=query_vector,
    k=1,
)
result