In [3]:
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

from langchain_community.embeddings import OllamaEmbeddings

In [12]:
from typing import List
from langchain_community.embeddings import OllamaEmbeddings
from torch import Tensor
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

from langchain_core.embeddings import Embeddings

import numpy as np
from torch import Tensor


class Embedder(Embeddings):
    def __init__(self, embedding_size: int = 768) -> None:
        self.embedding_size = embedding_size

        self.tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-uncased", model_max_length=8192
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name_or_path="nomic-ai/nomic-embed-text-v1.5",
            trust_remote_code=True,
            safe_serialization=True,
            rotary_scaling_factor=2,
        )
        self.model.eval()

        # If you want to do semantic similarity search instead of question answering,
        # you should encode both queries and document with the search_document task type.
        self.query_types = {
            "search_query": "Use this when you want to encode a query for question-answering over text that was embedded with search_document.",
            "search_document": "The default embedding task type. Any document you want to use for retrieval or store in a vector database should use this task type.",
            "classification": "Use this if your embeddings are for classification (e.g. training a linear probe for a target classification task)",
            "clustering": "Use this if your embeddings need very high linear separability (e.g. building a topic model on your embeddings)",
        }

    def __mean_pooling(self, model_output, attention_mask) -> Tensor:
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def embed(
        self,
        texts: list[str],
        querry: str = "search_document: ",
        return_array=False,
        return_ndarray=False,
    ) -> Tensor:
        """Function for creating embeddings"""

        print(type(texts), texts)

        if isinstance(texts, str):
            texts = [texts]

        texts = [querry + text for text in texts]

        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )

        with torch.no_grad():
            model_output = self.model(**encoded_input)

        embeddings = self.__mean_pooling(model_output, encoded_input["attention_mask"])
        embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
        embeddings = embeddings[:, : self.embedding_size]
        embeddings = F.normalize(embeddings, p=2, dim=1)

        if return_ndarray:
            return embeddings.numpy()

        if return_array:
            embedded_list = embeddings.tolist()
            if len(embedded_list) == 1:
                embedded_list = embedded_list[0]
            return embedded_list

        return embeddings

    def _get_embedding(self, texts: str, querry: str) -> Tensor:
        return self.embed(texts=texts, return_array=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs. Implemented for LangChain vector store campatability"""
        return self._get_embedding(texts=texts, querry="search_document:")

    def embed_query(self, text: str) -> List[float]:
        """Embed query text.Implemented for LangChain vector store campatability"""
        return self._get_embedding(texts=text, querry="search_query:")

In [13]:
embedder = Embedder()
ans = embedder.embed(
    [
        "Embed query text. Uses embed method. Implemented for LangChain vector store campatability"
    ],
    return_array=True,
)

<All keys matched successfully>


<class 'list'> ['Embed query text. Uses embed method. Implemented for LangChain vector store campatability']


In [7]:
type(ans), type(ans[0])

(list, float)

In [22]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursice_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

loader = TextLoader("test_txt.txt")
documents = loader.load()

texts = recursice_splitter.split_documents(documents)
s = texts[0]
s

Document(page_content='World', metadata={'source': 'test_txt.txt'})

In [27]:
from langchain.retrievers.multi_vector import MultiVectorRetriever

vector_store = Chroma(
    embedding_function=embedder,
    persist_directory=".VectorData/chroma_db",
    collection_name="Chroma-Nomic-768",
)


store = InMemoryStore()

retriever = MultiVectorRetriever(
    vectorstore=vector_store,
    docstore=store,
)

In [47]:
from langchain_core.documents import Document

d = Document(
    page_content="World",
    metadata={"source": "test_txt.txt"},
)
print(d.page_content, s.page_content)
print(d.metadata, s.metadata)
print(type(texts))

World World
{'source': 'test_txt.txt'} {'source': 'test_txt.txt'}
<class 'list'>


In [49]:
d_ = recursice_splitter.split_documents([d])

In [51]:
retriever.vectorstore.add_documents(d_)

<class 'list'> ['World']


InvalidDimensionException: Embedding dimension 1 does not match collection dimensionality 768

In [None]:
retriever.vectorstore.add_documents(texts)

['7a1f4aea-e258-4c6e-9b53-428592df9659',
 '57e9a072-d140-45b0-8116-fb965490c988',
 '9a53042a-11af-48f5-b584-6b874038a187',
 '921857db-d2fa-4ed8-b87f-6a598a146667',
 '543ce410-beee-4b0d-9c03-fdeb94c66507',
 'a0aa0c88-7839-4f83-b708-7a8de44bcd8a',
 '01022031-7fb2-4c57-bb73-7c7457b3c5b3',
 '1a8eb18e-158b-44f7-985a-55d4f0900616',
 'dc4c139f-df24-4321-a6ab-9dba53640633',
 'f042a29f-2121-4664-968b-6a1390d52601',
 '0987804b-63b3-45db-9c20-bf67e9c313f7',
 'b4a3eeaf-4ba4-4a3d-8119-c7264154f88c',
 '40038677-a56e-4138-8c80-8c5e8404b4ee',
 '6d1cd6df-f8f7-4bbd-82a8-98784c4a2b43',
 'c8210fc4-8e66-42d1-b7e6-5191cdf3d9f3',
 '13d1496b-53d3-4fa0-a8c2-dcb8b85ea1ae',
 'c2d70666-cbd9-491b-8d97-8f5ee1a154a6',
 '62e11ae8-2a29-4c05-83a7-c61b8a61981f',
 'dbf452c8-2211-4c40-aeb3-f2389180e09a',
 '4bbc6f2a-fe23-4c61-8de8-098e40e5c07d',
 '89987237-7c4b-4b56-bbd5-97910cd72869',
 '4f015cdb-e1f8-49cd-a0ff-c7c91eab070d',
 'aa0339a0-7a18-4670-8f0c-c7920c1d1fc7',
 '805f3609-1295-48a8-b267-8344140aa18c',
 'bda0fddd-e8e5-

In [None]:
from langchain.retrievers.multi_vector import SearchType

In [None]:
query_1 = "Unlike Knights Radiant, each Fused can only access a single surge"

In [None]:
retriever.vectorstore.similarity_search(query_1)

[Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
 Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation, Transportation, Cohesion

In [None]:
retriever.vectorstore.similarity_search_with_relevance_scores(query_1)

[(Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
  0.6632937625158462),
 (Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation,

In [None]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

retriever2 = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"score_threshold": 0.5}
)

In [None]:
b = retriever.invoke(query_1)

In [None]:
a = retriever2.invoke(query_1)

In [None]:
a, b

([Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
  Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation, Transportation, Cohesi