In [1]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

from langchain_community.embeddings import OllamaEmbeddings

In [2]:
from typing import List
from langchain_community.embeddings import OllamaEmbeddings
from torch import Tensor
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

from langchain_core.embeddings import Embeddings

import numpy as np
from torch import Tensor


class Embedder(Embeddings):
    def __init__(self, embedding_size: int = 768) -> None:
        self.embedding_size = embedding_size

        self.tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-uncased", model_max_length=8192
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name_or_path="nomic-ai/nomic-embed-text-v1.5",
            trust_remote_code=True,
            safe_serialization=True,
            rotary_scaling_factor=2,
        )
        self.model.eval()

        # If you want to do semantic similarity search instead of question answering,
        # you should encode both queries and document with the search_document task type.
        self.query_types = {
            "search_query": "Use this when you want to encode a query for question-answering over text that was embedded with search_document.",
            "search_document": "The default embedding task type. Any document you want to use for retrieval or store in a vector database should use this task type.",
            "classification": "Use this if your embeddings are for classification (e.g. training a linear probe for a target classification task)",
            "clustering": "Use this if your embeddings need very high linear separability (e.g. building a topic model on your embeddings)",
        }

    def __mean_pooling(self, model_output, attention_mask) -> Tensor:
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def embed(
        self,
        texts: list[str],
        querry: str = "search_document: ",
        return_array=False,
        return_ndarray=False,
    ) -> Tensor:
        """Function for creating embeddings"""

        if isinstance(texts, str):
            texts = [texts]

        texts = [querry + text for text in texts]

        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )

        with torch.no_grad():
            model_output = self.model(**encoded_input)

        embeddings = self.__mean_pooling(model_output, encoded_input["attention_mask"])
        embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
        embeddings = embeddings[:, : self.embedding_size]
        embeddings = F.normalize(embeddings, p=2, dim=1)

        if return_ndarray:
            return embeddings.numpy()

        if return_array:
            embedded_list = embeddings.tolist()
            if len(embedded_list) == 1:
                embedded_list = embedded_list[0]
            return embedded_list

        return embeddings

    def _get_embedding(self, texts: str, querry: str) -> Tensor:
        return self.embed(texts=texts, return_array=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs. Implemented for LangChain vector store campatability"""
        return self._get_embedding(texts=texts, querry="search_document:")

    def embed_query(self, text: str) -> List[float]:
        """Embed query text.Implemented for LangChain vector store campatability"""
        return self._get_embedding(texts=text, querry="search_query:")

In [3]:
embedder = Embedder()
ans = embedder.embed(
    [
        "Embed query text. Uses embed method. Implemented for LangChain vector store campatability"
    ],
    return_array=True,
)

<All keys matched successfully>


In [4]:
ans

[-0.00678537180647254,
 0.02901778370141983,
 -0.16269497573375702,
 -0.04146111011505127,
 0.06222774460911751,
 -0.04898751154541969,
 -0.026256415992975235,
 0.008826125413179398,
 -0.03914357349276543,
 -0.04495925083756447,
 -0.011976683512330055,
 -0.007809306029230356,
 0.08151137828826904,
 -0.008600431494414806,
 -0.04255734756588936,
 -0.005486391484737396,
 0.02238943800330162,
 -0.04422157257795334,
 -0.015798095613718033,
 0.05769278481602669,
 -0.036574169993400574,
 0.0032914902549237013,
 -0.024733664467930794,
 -0.04281643033027649,
 0.0465269461274147,
 -0.0015061023877933621,
 0.008406109176576138,
 0.04273141175508499,
 -0.0440555214881897,
 -0.01648848131299019,
 -0.004127610009163618,
 0.00755061861127615,
 0.009363479912281036,
 -0.05960777401924133,
 0.006638789549469948,
 -0.049402251839637756,
 -0.013142879121005535,
 0.019680911675095558,
 -0.031365327537059784,
 -0.0024004513397812843,
 0.025910302996635437,
 0.02590814046561718,
 0.01851302571594715,
 -0.01

In [5]:
type(ans), type(ans[0])

(list, float)

In [6]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

recursice_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

loader = TextLoader("test_txt.txt")
documents = loader.load()

texts = recursice_splitter.split_documents(documents)

In [7]:
from langchain.retrievers.multi_vector import MultiVectorRetriever

vector_store = Chroma(
    embedding_function=embedder,
    persist_directory=".VectorData/chroma_db",
    collection_name="Chroma-Nomic-768",
)


store = InMemoryStore()

retriever = MultiVectorRetriever(
  vectorstore=vector_store,
  docstore=store,
)  

In [8]:
retriever.vectorstore.add_documents(
    texts
)

['4f04bb19-6517-499a-8316-0b79a5549013',
 '1f804cac-c68c-4f9c-a9fa-8b601cf370df',
 'aa9306a3-74fc-4ccc-9e88-ffef94d89a76',
 '215558fa-4f16-4103-a15c-1aa0658af1e7',
 'fa749199-5e46-429a-a9e2-652c53961986',
 '544e3e7e-9438-4b75-927d-f2f24894200e',
 'a0788504-a54f-4c3f-90d3-49c097dea9ba',
 '38f62a6d-0edd-4b75-9bb1-f71c902b89e0',
 'c93b0ca5-738a-4835-8851-ffd3ac11412e',
 'b3906230-01b3-4220-80ab-c5187ba9e6c2',
 '8df466f7-4375-4a46-aaa1-bb74b9ccbc64',
 '61bcdd77-fc7b-4015-9d00-b6bc5130561f',
 'f2f37dd7-396d-4b77-9393-1f046317829a',
 '7d691a9e-b016-47dc-8281-a46988241d45',
 '7e5bb9d0-e7f3-467b-8bea-73819cd9143b',
 '3e931472-a05e-42b9-b71a-9767fa18838b',
 'e203a4f3-9a92-4f78-87e0-385641b7f021',
 '684f267d-8ccb-4dcd-ba55-502a3ecdd7e5',
 '25f4fdb8-634e-4a50-a062-f52e14e6f775',
 '923e07ad-da62-4838-923f-30f8b9bee05e',
 '34c74b41-b39c-4243-aecc-5608814dd0ef',
 '3fc7b0a3-71d8-4d53-9f20-0a0a2b072ea6',
 '99131a33-ffb0-41a0-bd85-c8d6ee9812f1',
 '63735c77-6ae2-4ecb-800c-ae62f9d06b1c',
 'fddf59cc-d597-

In [9]:
from langchain.retrievers.multi_vector import SearchType

In [10]:
query_1 = "Unlike Knights Radiant, each Fused can only access a single surge"

In [11]:
retriever.vectorstore.similarity_search(query_1) 

[Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
 Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation, Transportation, Cohesion

In [12]:
retriever.vectorstore.similarity_search_with_relevance_scores(query_1) 

[(Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
  0.6632937625158462),
 (Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation,

In [13]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

retriever2 = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"score_threshold": 0.5}
)

In [14]:
b = retriever.invoke(query_1)

In [15]:
a = retriever2.invoke(query_1)

In [16]:
a, b

([Document(page_content='The Knights Radiant\nThe Knights Radiant originated through spren copying the abilities which the Heralds obtained through their Honorblades. The Knights Radiant gained power through spren by creating a bond with them called the Nahel bond. The bond gives the spren sentience while giving the human Surgebinding abilities. Two examples are Sylphrena, an Honorspren, who shares a bond with Kaladin, giving him the power to Surgebind; and Pattern, a Liespren (Cryptic), who shares a bond with Shallan, granting her power to Soulcast and create Illusions.', metadata={'source': 'test_txt.txt'}),
  Document(page_content="Voidbinding:\nSimilar to Surgebinding, Voidbinding has a collection of surges that the third god of Roshar, Odium, makes available to his selected servants, called Fused. Unlike Knights Radiant, each Fused can only access a single surge, from a list of nine: Gravitation, Division, Abrasion, Progression, Illumination, Transformation, Transportation, Cohesi