In [1]:
import sys
import os
from dotenv import load_dotenv

# All paths are derived from the kernel's working directory; this assumes the
# notebook is started from a folder located directly below the project root.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Load environment variables from <project_root>/app/.env.
env_path = os.path.join(project_root, 'app', '.env')
load_dotenv(env_path)

# Make the project packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

## Function imports

In [None]:
from app.vectorstore.embeddings import JinaEmbeddings
from app.vectorstore import get_chroma_store_as_retriever, add_docs_to_store
from app.vectorstore.experimental import get_faiss_store_as_retriever, add_docs_to_faiss_store
from app.doc_processing.late_chunking import apply_late_chunking

## Berlin document test

This is the test that the developers from Jina AI ran in their Colab notebook (https://colab.research.google.com/drive/15vNZb6AsU7byjYoaEtXuNu567JWNzXOz?usp=sharing), redone with my own classes and functions, which implement the late chunking approach in the LangChain framework.


Here are some functions from the reference implementation. We use them later.

In [2]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Cut a text into sentence chunks at the full stops found by the tokenizer.

    A '.' token closes a sentence only if the next token starts after a gap in
    the character offsets (e.g. after whitespace) or if the next token is
    '[SEP]'. A '.' that is directly followed by another token (as inside a
    decimal number, provided the tokenizer reports adjacent offsets) does not
    split the text. Text after the last sentence-ending period is not returned.

    :param input_text: The text snippet to split into sentences
    :param tokenizer: Tokenizer object; it is called on the text and must also offer ``convert_tokens_to_ids``
    :return: A tuple ``(chunks, span_annotations)``: the sentence strings and one ``(start, end)`` token-index pair per sentence
    """
    encoded = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    period_id = tokenizer.convert_tokens_to_ids('.')
    sep_token_id = tokenizer.convert_tokens_to_ids('[SEP]')
    offsets = encoded['offset_mapping'][0]
    ids = encoded['input_ids'][0]

    # One (token index, character index right after the period's first char)
    # entry per sentence-ending period.
    boundaries = []
    for idx, (tok_id, (char_start, _char_end)) in enumerate(zip(ids, offsets)):
        if not (tok_id == period_id):
            continue
        followed_by_gap = offsets[idx + 1][0] - offsets[idx][1] > 0
        if followed_by_gap or ids[idx + 1] == sep_token_id:
            boundaries.append((idx, int(char_start + 1)))

    # Walk the boundaries pairwise. The first sentence starts at token 1
    # (token 0 is skipped -- presumably the leading special token) and at
    # character 0; every later sentence starts where the previous one ended.
    chunks = []
    span_annotations = []
    prev_token, prev_char = 1, 0
    for token_idx, char_idx in boundaries:
        chunks.append(input_text[prev_char:char_idx])
        span_annotations.append((prev_token, token_idx))
        prev_token, prev_char = token_idx, char_idx
    return chunks, span_annotations

def late_chunking_reference(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    """
    Mean-pool token embeddings over the given token spans (late chunking).

    :param model_output: Model result whose first element holds the token embeddings, one sequence per document
    :param span_annotation: One list of ``(start, end)`` token spans per document
    :param max_length: Optional model length limit; spans starting at or beyond ``max_length - 1`` are dropped and span ends are clipped to ``max_length - 1``
    :return: One list per document containing a numpy vector for every non-empty span
    """
    per_document = []
    for doc_tokens, doc_spans in zip(model_output[0], span_annotation):
        if max_length is not None:
            # Discard or shorten spans that reach beyond the model's max length.
            limit = max_length - 1
            doc_spans = [
                (span_start, min(span_end, limit))
                for span_start, span_end in doc_spans
                if span_start < limit
            ]

        pooled = []
        for span_start, span_end in doc_spans:
            width = span_end - span_start
            if width >= 1:  # empty spans produce no vector
                mean_vector = doc_tokens[span_start:span_end].sum(dim=0) / width
                pooled.append(mean_vector.detach().cpu().numpy())
        per_document.append(pooled)

    return per_document

In [3]:
# Test document: three sentences about Berlin (adjacent string literals are
# joined by Python into one single string).
input_text = (
    "Berlin is the capital and largest city of Germany, both by area and by population."
    " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."
    " The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."
)

# LangChain embedding class that wraps the Hugging Face tokenizer and model.
# Note: the small model is used instead of the base model because of CPU limitations.
embeddings = JinaEmbeddings()

# Sentence chunks and their token spans, computed with the wrapper's own tokenizer.
sentences, span_annotations = chunk_by_sentences(input_text, embeddings.tokenizer)


  from .autonotebook import tqdm as notebook_tqdm


Now we have to convert the list of sentences into a list of langchain documents,
to use my functions from the module app.vectorstore.

In [4]:
from langchain.schema import Document

# Wrap every sentence in a LangChain Document; each document gets its own
# metadata dict with the same tag and source.
docs = [
    Document(
        page_content=chunk_text,
        metadata={"tag": 'test_berlin', "source": "berlin.txt"},
    )
    for chunk_text in sentences
]

Default approach, without late chunking. (Chunking -> Embedding -> Vector Store)

In [5]:
# Baseline pipeline: build a Chroma-backed retriever that uses the Jina embedding
# wrapper, then add the sentence documents to it.
# NOTE(review): presumably add_docs_to_store embeds each chunk on its own via the
# retriever's embedding function -- confirm in app.vectorstore.
chroma_retriever = get_chroma_store_as_retriever(embeddings=embeddings)
add_docs_to_store(chroma_retriever, docs)

New late chunking approach. (Embedding -> Chunking -> Vector Store)

It's necessary to use a FAISS index here, because it seems to be impossible to add already existing embeddings to a Chroma store.
At least I didn't find a way to do it.
The FAISS index has a method called add_embeddings, which I have used here. You can see the implementation in the add_docs_to_faiss_store function.

In [6]:
# Late-chunking pipeline: apply_late_chunking returns documents that carry a
# precomputed vector in metadata['embedding'] (read back later in this notebook).
late_chunked_docs = apply_late_chunking(docs)
# Retriever backed by a FAISS store, created with the helper's default arguments.
faiss_retriever = get_faiss_store_as_retriever()
# Add the late-chunked documents to the FAISS store.
# NOTE(review): presumably this stores the precomputed vectors instead of
# re-embedding the text -- confirm in add_docs_to_faiss_store.
add_docs_to_faiss_store(faiss_retriever, late_chunked_docs)

Creating new FAISS index
New FAISS index saved to app/data/FAISS_STORE
FAISS index saved to app/data/FAISS_STORE


Now let's test the retrievers!

In [11]:
query = "Berlin"

# Top-3 hits with the raw scores reported by each vector store.
chroma_results = chroma_retriever.vectorstore.similarity_search_with_score(query, k=3)
faiss_results = faiss_retriever.vectorstore.similarity_search_with_score(query, k=3)

# Print both result lists with the same layout; the leading "\n" of the second
# heading inserts the blank line between the two lists.
for heading, hits in (("Chroma Results:", chroma_results), ("\nFAISS Results:", faiss_results)):
    print(heading)
    for doc, score in hits:
        print(f"Score: {score:.4f}, Content: {doc.page_content[:100]}...")

Chroma Results:
Score: 33.3837, Content: Berlin is the capital and largest city of Germany, both by area and by population....
Score: 50.2768, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...
Score: 60.6438, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...

FAISS Results:
Score: 29.3441, Content: Berlin is the capital and largest city of Germany, both by area and by population....
Score: 41.2074, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...
Score: 44.7700, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...


Hmm, that is quite underwhelming.
I think it's time to do some comparison tests with the reference implementation.

## Tests

Here we do a little check of whether the wrapper created for the Jina embeddings model works correctly, by comparing my implementation with the reference implementation from the Jina authors.

In [8]:
import numpy as np
from transformers import AutoModel
from transformers import AutoTokenizer


# Load the reference tokenizer and model directly from the Hugging Face hub.
# NOTE: trust_remote_code=True executes Python code shipped with the model
# repository -- only enable it for repositories you trust.
REFERENCE_MODEL_NAME = 'jinaai/jina-embeddings-v2-small-en'
tokenizer = AutoTokenizer.from_pretrained(REFERENCE_MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(REFERENCE_MODEL_NAME, trust_remote_code=True)

# Embed the same query once with the reference model and once with the own wrapper.
berlin_embedding_ref = model.encode('Berlin')
berlin_embedding_own = embeddings.embed_query("Berlin")


def cos_sim(x, y):
    """Return the cosine similarity of two vectors (dot product divided by the product of their norms)."""
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


# The value is a cosine similarity (1.0 = same direction), not a percentage.
print(f'The embedding similarity between the own implementation and the reference implementation is {cos_sim(berlin_embedding_ref, berlin_embedding_own)} (cosine similarity, 1.0 = identical)')


The embedding similarity between the own implementation and the reference implementation is 0.9999999457983118 percent


Alright. That looks quite good so far. Let's check the late chunking implementation. We have already calculated the span_annotations
with the function above (chunk_by_sentences).

In [18]:
# Tokenize the complete text with the reference tokenizer and run the reference
# model once over the whole document.
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
# Pool the token embeddings per sentence span. late_chunking_reference expects one
# span list per document, hence the wrapping list; [0] selects the result of the
# single document.
# NOTE(review): span_annotations were computed with embeddings.tokenizer; this
# assumes it tokenizes exactly like the reference tokenizer -- confirm.
embeddings_ref = late_chunking_reference(model_output, [span_annotations])[0]

Let's compare them.

In [19]:
# Collect the precomputed late-chunking vectors from the documents' metadata.
late_chunked_embeddings = [chunk_doc.metadata['embedding'] for chunk_doc in late_chunked_docs]

# Compare reference and own vectors sentence by sentence
# (zip stops at the end of the shorter list).
embedding_pairs = zip(embeddings_ref, late_chunked_embeddings)
for sentence_no, (ref_vec, own_vec) in enumerate(embedding_pairs, start=1):
    print(f"Sentence {sentence_no} similarity: {cos_sim(ref_vec, own_vec):.4f}")

Sentence 1 similarity: 0.9997
Sentence 2 similarity: 1.0000
Sentence 3 similarity: 1.0000


These also look quite good, so I conclude that my implementation works fine! So the situation might be the following:

* The cosine similarity between late chunked embeddings and the search query is actually higher than the similarity between the regular embedded chunks and the query.

* That seems to be irrelevant for the LangChain framework, because the vector stores don't seem to use the cosine similarity to search for related chunks. (At least it looks that way, based on my observations in this notebook and the tests I ran.)

**Conclusion**

A custom vector store object is needed that uses the cosine similarity to determine whether documents are relevant for the search query.

## Implementing custom similarity search for vectorstore object

Do it again with the custom implementation

In [None]:
# Rebind faiss_retriever to a retriever created with custom=True, i.e. the
# project's custom vector store variant used in the next cell.
# NOTE(review): presumably this loads the FAISS index saved earlier rather than
# an empty one -- confirm in app.vectorstore.experimental.
faiss_retriever = get_faiss_store_as_retriever(custom=True)

In [11]:
import numpy as np


def cos_sim(x, y):
    """Return the cosine similarity of two vectors."""
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))


def _print_hits(title, scored_docs):
    """Print a heading followed by one line per (document, similarity) pair."""
    print(title)
    for doc, similarity in scored_docs:
        print(f"Cosine Similarity: {similarity:.4f}, Content: {doc.page_content[:100]}...")


query = "Berlin"

# Embed the query once; it is reused for every Chroma hit.
query_embedding = embeddings.embed_query(query)

chroma_results = chroma_retriever.vectorstore.similarity_search_with_score(query, k=3)

# Newly implemented search method of the custom store (per the author it is also
# what similarity_search calls on that store).
faiss_results = faiss_retriever.vectorstore.similarity_search_by_cosine(query, k=3)

# The Chroma store's own score is ignored: the cosine similarity is recomputed
# from a fresh embedding of each hit. The generator is lazy, so every hit is
# embedded right before its line is printed.
chroma_rescored = (
    (doc, cos_sim(query_embedding, embeddings.embed_documents([doc.page_content])[0]))
    for doc, _store_score in chroma_results
)
_print_hits("Chroma Results:", chroma_rescored)

# The custom FAISS search already returns (document, cosine similarity) pairs.
_print_hits("\nFAISS Results:", faiss_results)

Chroma Results:
Cosine Similarity: 0.8802, Content: Berlin is the capital and largest city of Germany, both by area and by population....
Cosine Similarity: 0.7919, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...
Cosine Similarity: 0.7413, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...

FAISS Results:
Cosine Similarity: 0.8857, Content: Berlin is the capital and largest city of Germany, both by area and by population....
Cosine Similarity: 0.8516, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...
Cosine Similarity: 0.8478, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...


**Note:**

The custom implementation doesn't actually use the cosine similarity to search for the most similar chunks, because it doesn't seem to be easy to override the actual search method of the vector store. Instead, it first retrieves oversampling_factor * k candidate documents with the default search method, reranks them by their cosine similarity, and returns the k documents with the highest similarity.