In [6]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found.
load_dotenv()

# Get the current working directory
current_dir = os.getcwd()

# Assuming the notebook lives three directory levels below the main project
# folder, go up three levels ('..', '..', '..') to reach the project root.
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
# Append the project root to the import search path.
sys.path.append(project_root)

## reference implementation


In [2]:
# Print the cache directory constant exposed by transformers.
# NOTE(review): TRANSFORMERS_CACHE may be deprecated in newer transformers
# releases — confirm against the installed version.
from transformers.utils import TRANSFORMERS_CACHE
print(f"Transformers cache directory: {TRANSFORMERS_CACHE}")

Transformers cache directory: /Users/jan/.cache/huggingface/hub


In [None]:
def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    """
    Mean-pool token embeddings over annotated token spans ("late chunking").

    :param model_output: Indexable object whose element ``0`` holds the token
        embeddings; iterating over that element yields one tensor per input
        sequence (assumed to be ``(num_tokens, dim)`` each — TODO confirm).
        NOTE(review): the annotation says ``BatchEncoding`` but the demo cell
        passes the model's output object — confirm the intended type.
    :param span_annotation: One list of ``(start, end)`` token-index pairs per
        sequence; ``end`` is exclusive.
    :param max_length: Optional maximum sequence length of the model. When set,
        spans are clipped to ``max_length - 1`` and spans starting at or after
        that position are dropped.
    :return: One list per sequence containing a numpy array per kept span.
        Spans that are empty after clipping are skipped, so the result can be
        shorter than the corresponding annotation list.
    """
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if max_length is not None:
            # Remove annotations which go beyond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]

        # Clamp every span to the tokens that actually exist. Without this, a
        # span reaching past the sequence would be summed over fewer tokens
        # than (end - start) and the resulting "mean" would be scaled down.
        num_tokens = embeddings.shape[0]
        pooled_embeddings = []
        for start, end in annotations:
            end = min(end, num_tokens)
            if (end - start) < 1:
                # Empty span (possibly only after clamping): nothing to pool.
                continue
            pooled_embeddings.append(
                embeddings[start:end].sum(dim=0) / (end - start)
            )

        # Move results off the autograd graph / device and convert to numpy.
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Cut the input text into sentence chunks based on '.' tokens.

    A '.' token closes a sentence when the next token does not start directly
    after it (there is a character gap) or when the next token is '[SEP]'.
    Text after the last such '.' is not returned.

    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: A tuple ``(chunks, span_annotations)``: the sentence strings and,
        for each, a ``(first_token_index, closing_period_token_index)`` pair
    """
    encoded = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    period_id = tokenizer.convert_tokens_to_ids('.')
    sep_token_id = tokenizer.convert_tokens_to_ids('[SEP]')
    offsets = encoded['offset_mapping'][0]
    ids = encoded['input_ids'][0]

    # Collect (token index, character position just after the period) for
    # every period that ends a sentence.
    boundaries = []
    for idx, (tok, (char_start, _)) in enumerate(zip(ids, offsets)):
        if tok == period_id:
            gap_follows = offsets[idx + 1][0] - offsets[idx][1] > 0
            if gap_follows or ids[idx + 1] == sep_token_id:
                boundaries.append((idx, int(char_start + 1)))

    # The first sentence starts at token 1 / character 0; every later sentence
    # starts where the previous one ended.
    openings = [(1, 0)] + boundaries[:-1]

    chunks = []
    span_annotations = []
    for (first_tok, first_char), (last_tok, last_char) in zip(openings, boundaries):
        chunks.append(input_text[first_char:last_char])
        span_annotations.append((first_tok, last_tok))
    return chunks, span_annotations

In [None]:
from transformers import AutoModel, AutoTokenizer

input_text = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."

# Load the tokenizer and the embedding model explicitly so this cell does not
# depend on objects left over in the kernel from cells that are no longer part
# of the notebook. The checkpoint is the one used elsewhere in this notebook.
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)

# determine chunks
# NOTE: this needs the chunk_by_sentences variant that returns (chunks, spans).
chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)
print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

# chunk afterwards (context-sensitive chunked pooling)
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

## test implementation

In [1]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found.
load_dotenv()

# Get the current working directory
current_dir = os.getcwd()

# Assuming the notebook lives three directory levels below the main project
# folder, go up three levels ('..', '..', '..') to reach the project root.
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
# Append the project root to the import search path.
sys.path.append(project_root)


from app.doc_processing import process_doc, ProcessDocConfig
from app.vectorstore import get_chroma_store_as_retriever, add_docs_to_store
import os

# Input PDF to process; the commented-out line is an alternative test document.
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a configurable base directory.
pdf_path = '/Users/jan/Desktop/advanced_rag/dev_tests/test_data/el_nino.pdf'
#pdf_path = '/Users/jan/Desktop/advanced_rag/dev_tests/test_data/attention_is_all.pdf'
# NOTE(review): the tag is "attention" although the El Niño PDF is selected —
# confirm this is intended.
tag = "attention"
# Passed as `unwanted_categories_list` below; presumably element categories to
# discard during processing — verify against process_doc.
table_processing = ['Header', 'Footer', 'Image', 'FigureCaption', 'Formula']
config = ProcessDocConfig(
        tag=tag,
        filepath=pdf_path,
        unwanted_categories_list=table_processing
    )

# Process the document
processed_docs = process_doc(config)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
from app.doc_processing.late_chunking import apply_late_chunking
# Run the project's late-chunking step on the processed PDF documents.
docs = apply_late_chunking(processed_docs)

In [3]:
from app.vectorstore.experimental import get_faiss_store_as_retriever, add_docs_to_faiss_store
# Obtain the FAISS retriever and add the late-chunked documents to it.
retriever = get_faiss_store_as_retriever()
add_docs_to_faiss_store(retriever, docs)

Creating new FAISS index
New FAISS index saved to app/data/FAISS_STORE
FAISS index saved to app/data/FAISS_STORE


In [9]:
from app.chat import create_RAG_output, get_result_docs, ChatConfig
import app.llm

# Chat/retrieval configuration: k=10 results, with answer expansion,
# multi-query expansion, reranking and BM25 all switched off.
# NOTE(review): `config` was a ProcessDocConfig in the earlier cell and is
# re-bound to a ChatConfig here.
config = ChatConfig(
    tag=tag,
    k=10,
    llm=app.llm.get_groq_llm(),
    expand_by_answer=False,
    expand_by_mult_queries=False,
    reranking=False,
    use_bm25=False,
)
config.history_awareness(False)

# Alternative query (for the "attention is all you need" test document); it
# was previously assigned and immediately overwritten, so it is kept only as
# a toggle:
# query = "Which BLEU score did the transformer base-model achieve?"
query = "what is el nino?"
result_docs, _ = get_result_docs(config, query, retriever=retriever)
# The retrieved texts are concatenated without any separator.
context = ''.join(result_docs)
final_answer = create_RAG_output(context, query, config.llm)

attention




In [11]:
# Display the answer produced by create_RAG_output.
final_answer

'El Niño is a natural phenomenon in the tropical Pacific that influences weather around the globe. It causes changes in the jet stream that can point storms directly at California.'

In [10]:
# Display the retrieved texts that were joined into the context.
result_docs

['The soaking storms will raise the ﬂood threat across much of California into next week, but it appears the wet pattern is likely to continue well into February as a more typical El Niño pattern kicks into gear.\n\nEl Niño – a natural phenomenon in the tropical Paciﬁc that inﬂuences weather around the globe – causes changes in the jet stream that can point storms directly at California. Storms can also tap into an extra-potent supply of moisture from the tropics called an atmospheric river.',
 'A potent pair of atmospheric rivers will drench California as El Niño makes its ﬁrst mark on winter\n\nBy Mary Gilbert, CNN Meteorologist\n\nUpdated: 3:49 PM EST, Tue January 30, 2024\n\nSource: CNN\n\nA potent pair of atmospheric river-fueled storms are about to unleash a windy and incredibly wet week in California in what is the ﬁrst clear sign of the inﬂuence El Niño was expected to have on the state this winter.',
 'El Niño hasn’t materialized many atmospheric rivers for California so far t

## Retrieval test: Berlin example

In [1]:
def get_span_annotations(docs, tokenizer):
    """
    Build consecutive token spans, one per document.

    Each document's content is tokenized without special tokens; its span
    starts where the previous document's span ended, beginning at 0.

    Args:
        docs (list): Objects exposing a ``page_content`` attribute.
        tokenizer: Callable used to tokenize each document's content.

    Returns:
        list: ``(start, end)`` tuples, one per document, with ``end`` exclusive.
    """
    spans = []
    cursor = 0
    for document in docs:
        encoding = tokenizer(document.page_content, return_tensors='pt', add_special_tokens=False)
        token_count = len(encoding['input_ids'][0])
        spans.append((cursor, cursor + token_count))
        cursor += token_count
    return spans

def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentence strings based on '.' tokens.

    A '.' token closes a sentence when the next token does not start directly
    after it (there is a character gap) or when the next token is '[SEP]'.
    Text after the last such '.' is not returned.

    Note: this definition returns only the list of chunks, unlike the earlier
    definition of the same name in this notebook, which also returns token spans.

    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: The list of sentence strings
    """
    encoding = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    dot_id = tokenizer.convert_tokens_to_ids('.')
    sep_token_id = tokenizer.convert_tokens_to_ids('[SEP]')
    offsets = encoding['offset_mapping'][0]
    ids = encoding['input_ids'][0]

    # (token index, character position right after the period) per sentence end.
    sentence_ends = []
    for position, (current_id, (char_start, _)) in enumerate(zip(ids, offsets)):
        if current_id == dot_id:
            followed_by_gap = offsets[position + 1][0] - offsets[position][1] > 0
            if followed_by_gap or ids[position + 1] == sep_token_id:
                sentence_ends.append((position, int(char_start + 1)))

    # First sentence starts at character 0; later ones start at the previous end.
    sentence_starts = [(1, 0)] + sentence_ends[:-1]
    return [
        input_text[begin[1]:finish[1]]
        for begin, finish in zip(sentence_starts, sentence_ends)
    ]

from langchain.schema import Document

def add_sentences_to_chroma(retriever, sentences, tag):
    """
    Wrap each sentence in a Document tagged with ``tag`` and add them to the
    retriever's vector store.

    Args:
        retriever: Object exposing ``vectorstore.add_documents``.
        sentences: Iterable of sentence strings.
        tag: Value stored under the ``"tag"`` metadata key of every document.
    """
    tagged_docs = []
    for text in sentences:
        tagged_docs.append(Document(page_content=text, metadata={"tag": tag}))
    retriever.vectorstore.add_documents(tagged_docs)

In [None]:
from app.doc_processing.late_chunking import chunk_by_sentences, apply_late_chunking
from app.vectorstore import add_sentences_to_chroma, get_chroma_store_as_retriever
from app.vectorstore.experimental import get_faiss_store_as_retriever, add_late_chunked_docs_to_faiss
from transformers import AutoTokenizer

# Sample passage (three sentences about Berlin) used as test input.
input_text = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."

def process_and_store_text(input_text: str, tag: str):
    """
    Split a text into sentences and store them in both vector stores.

    The sentences are added to the Chroma store as-is and, after late
    chunking, to the FAISS store. Both sets of documents carry ``tag`` in
    their metadata.

    :param input_text: The text to split and index
    :param tag: Value stored under the ``"tag"`` metadata key of every document
    :return: None; a summary line is printed
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)

    # Split into sentences
    # NOTE(review): assumes chunk_by_sentences returns a list of strings —
    # confirm for the imported app.doc_processing.late_chunking variant.
    sentences = chunk_by_sentences(input_text, tokenizer)

    # Store sentences in Chroma
    chroma_retriever = get_chroma_store_as_retriever()
    add_sentences_to_chroma(chroma_retriever, sentences, tag)

    # Apply late chunking. The tag is attached here as well; previously these
    # documents were created without metadata, so the FAISS entries lost the
    # tag that the Chroma entries (and the message below) carry.
    late_chunked_docs = apply_late_chunking(
        [Document(page_content=sentence, metadata={"tag": tag}) for sentence in sentences]
    )

    # Store late-chunked embeddings in FAISS
    faiss_retriever = get_faiss_store_as_retriever()
    add_late_chunked_docs_to_faiss(faiss_retriever, late_chunked_docs)

    print(f"Processed and stored {len(sentences)} sentences with tag '{tag}'")

In [3]:
from transformers import AutoTokenizer
# Sample passage (three sentences about Berlin) used for the retrieval comparison.
input_text = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."
# Tokenizer of the Jina embedding checkpoint, loaded with trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True)

# NOTE(review): `chunk_by_sentences` is defined twice in this notebook (with
# different return values) and is also imported from
# app.doc_processing.late_chunking in another cell; which one runs here depends
# on cell execution order. Later cells iterate `sentences` as strings.
sentences = chunk_by_sentences(input_text, tokenizer)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from langchain_chroma import Chroma
import os
from uuid import uuid4
import json
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_core.documents import Document
from app.vectorstore.embeddings import get_ollama_embeddings, JinaEmbeddings
from dotenv import load_dotenv

# Load environment variables (CHROMA_PATH is read below).
load_dotenv()
# Chroma store persisted at the directory named by CHROMA_PATH, embedding with
# JinaEmbeddings, wrapped as a retriever.
# NOTE(review): os.getenv returns None when CHROMA_PATH is unset — confirm how
# Chroma behaves in that case.
chroma_retriever = Chroma(persist_directory=os.getenv("CHROMA_PATH"),
                          embedding_function=JinaEmbeddings()).as_retriever()
# Add the Berlin sentences under the tag 'test'.
add_sentences_to_chroma(chroma_retriever, sentences, 'test')

In [14]:
from app.vectorstore.experimental import get_faiss_store_as_retriever, add_docs_to_faiss_store
from app.doc_processing.late_chunking import apply_late_chunking

# Wrap each sentence in a Document carrying a tag and a source name, then run
# the project's late-chunking step on them.
docs = [Document(page_content=sentence, metadata={"tag": 'test', 'source': 'Berlin.txt'}) for sentence in sentences]
late_chunked_docs = apply_late_chunking(docs)

# Add the late-chunked documents to the FAISS store.
faiss_retriever = get_faiss_store_as_retriever()
add_docs_to_faiss_store(faiss_retriever, late_chunked_docs)


Creating new FAISS index
New FAISS index saved to app/data/FAISS_STORE
FAISS index saved to app/data/FAISS_STORE


In [16]:
query = "capital"

# Query both vector stores for up to five scored matches.
chroma_results = chroma_retriever.vectorstore.similarity_search_with_score(query, k=5)
faiss_results = faiss_retriever.vectorstore.similarity_search_with_score(query, k=5)

# Report both result lists in the same format: score plus the first 100
# characters of the matched content.
for heading, hits in (("Chroma Results:", chroma_results), ("\nFAISS Results:", faiss_results)):
    print(heading)
    for doc, score in hits:
        print(f"Score: {score:.4f}, Content: {doc.page_content[:100]}...")

Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3


Chroma Results:
Score: 57.7374, Content: Berlin is the capital and largest city of Germany, both by area and by population....
Score: 63.2674, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...
Score: 63.5306, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...

FAISS Results:
Score: 60.5736, Content:  The city is also one of the states of Germany, and is the third smallest state in the country in te...
Score: 61.0898, Content:  Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured...
Score: 61.7908, Content: Berlin is the capital and largest city of Germany, both by area and by population....


## Comparison test on a PDF (Chroma vs. FAISS with late chunking)

In [17]:
import sys
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found.
load_dotenv()

# Get the current working directory
current_dir = os.getcwd()

# Assuming the notebook lives three directory levels below the main project
# folder, go up three levels ('..', '..', '..') to reach the project root.
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
# Append the project root to the import search path.
sys.path.append(project_root)


from app.doc_processing import process_doc, ProcessDocConfig
from app.vectorstore import get_chroma_store_as_retriever, add_docs_to_store
import os

# Input PDF to process; the commented-out line is an alternative test document.
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a configurable base directory.
pdf_path = '/Users/jan/Desktop/advanced_rag/dev_tests/test_data/el_nino.pdf'
#pdf_path = '/Users/jan/Desktop/advanced_rag/dev_tests/test_data/attention_is_all.pdf'
# NOTE(review): the tag is "attention" although the El Niño PDF is selected —
# confirm this is intended.
tag = "attention"
# Passed as `unwanted_categories_list` below; presumably element categories to
# discard during processing — verify against process_doc.
table_processing = ['Header', 'Footer', 'Image', 'FigureCaption', 'Formula']
config = ProcessDocConfig(
        tag=tag,
        filepath=pdf_path,
        unwanted_categories_list=table_processing
    )

# Process the document
processed_docs = process_doc(config)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
from app.vectorstore import get_chroma_store_as_retriever, add_docs_to_store
from app.vectorstore.embeddings import get_ollama_embeddings, JinaEmbeddings
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is found.
load_dotenv()

# Note: get_chroma_store_as_retriever uses Ollama embeddings by default, so
# JinaEmbeddings is passed explicitly here.
chroma_retriever = get_chroma_store_as_retriever(embeddings=JinaEmbeddings())
# Add the processed PDF documents (without late chunking) to the Chroma store.
add_docs_to_store(chroma_retriever, processed_docs)

In [19]:
from app.doc_processing.late_chunking import apply_late_chunking
from app.vectorstore.experimental import get_faiss_store_as_retriever, add_docs_to_faiss_store

# Run the project's late-chunking step on the processed PDF documents and add
# the result to the FAISS store.
docs = apply_late_chunking(processed_docs)
faiss_retriever = get_faiss_store_as_retriever()
add_docs_to_faiss_store(faiss_retriever, docs)

Creating new FAISS index
New FAISS index saved to app/data/FAISS_STORE
FAISS index saved to app/data/FAISS_STORE


In [23]:
query = "California"

# Query both vector stores for up to five scored matches.
chroma_results = chroma_retriever.vectorstore.similarity_search_with_score(query, k=5)
faiss_results = faiss_retriever.vectorstore.similarity_search_with_score(query, k=5)

# Report both result lists in the same format: score plus the first 100
# characters of the matched content.
for heading, hits in (("Chroma Results:", chroma_results), ("\nFAISS Results:", faiss_results)):
    print(heading)
    for doc, score in hits:
        print(f"Score: {score:.4f}, Content: {doc.page_content[:100]}...")

Chroma Results:
Score: 50.9058, Content: The storm will start out very warm – fueled by moisture from near Hawaii that earns it the moniker o...
Score: 52.4925, Content: El Niño hasn’t materialized many atmospheric rivers for California so far this winter, with most hit...
Score: 54.2004, Content: The snow is welcomed for California’s snowpack, which has been beleaguered by warmth and storms that...
Score: 54.7581, Content: Go to the full CNN experience

© 2024 Cable News Network. A Warner Bros. Discovery Company. All Righ...
Score: 54.9169, Content: Showery weather will linger across much of California Friday as moisture slowly pushes out and acros...

FAISS Results:
Score: 57.9215, Content: A potent pair of atmospheric rivers will drench California as El Niño makes its ﬁrst mark on winter
...
Score: 68.2715, Content: El Niño hasn’t materialized many atmospheric rivers for California so far this winter, with most hit...
Score: 72.3097, Content: The soaking storms will raise the ﬂood t