In [1]:
from itertools import product
from pathlib import Path

from rag.ingestion import Ingestion

# Locate the documents folder: it sits directly under the parent of the
# current working directory.
ROOT_DIR = Path().resolve().parent
path = ROOT_DIR.joinpath("documents")
print(path)

  from tqdm.autonotebook import tqdm


/Users/matheus/Documents/pessoal/RAG-Survey-Parameter-Exploration/documents


### Creating index combinations (chunk size fixed at 512, overlap at 50)

In [2]:
# Option grid for the index sweep: one index is built per combination of
# one value from each of the four lists below.
chunking_strategies = ["fixed-size", "content-aware"]
preprocessing_options = ["None", "contextual-embedding"]

similarity_metrics = ["cosine", "dotproduct"]
embeddings_options = ["text-embedding-3-large", "text-embedding-ada-002"]

# Short tokens used to build index names. Every value in the lists above needs
# an entry here, otherwise the alias lookup raises KeyError.
alias = {
    "fixed-size": "fs",
    "content-aware": "ca",
    # Previous spelling of the key; it never matched `chunking_strategies`.
    # Kept so any existing lookup by this name keeps working.
    "context-aware": "ca",
    "cosine": "cos",
    "dotproduct": "dotp",
    "text-embedding-3-large": "3large",
    "text-embedding-ada-002": "ada",
    "contextual-embedding": "contextualemb",
    "None": "none"
}

# Fail here, before any index is created, if an option has no alias.
assert all(
    option in alias
    for option in (
        chunking_strategies
        + preprocessing_options
        + similarity_metrics
        + embeddings_options
    )
), "every sweep option needs an entry in `alias`"

In [3]:
# Create one index per combination of chunking strategy, preprocessing option,
# similarity metric and embedding model. `product` iterates with the rightmost
# list varying fastest, i.e. the same order as four nested for-loops.
for chunking_strategy, preprocessing_option, similarity_metric, embeddings_option in product(
    chunking_strategies, preprocessing_options, similarity_metrics, embeddings_options
):
    # Index name: the four option aliases joined by "-".
    index_name = "-".join(
        alias[option]
        for option in (
            chunking_strategy,
            preprocessing_option,
            similarity_metric,
            embeddings_option,
        )
    )

    print(f"===Creating index: {index_name}===")
    # NOTE(review): preprocessing_option only influences the index name; it is
    # not passed to Ingestion, so the "none" and "contextualemb" variants are
    # built from identical arguments. Confirm how Ingestion should receive it.
    # NOTE(review): constructing Ingestion appears to perform the whole ingest
    # (the recorded output shows chunking progress and "Index ... created."
    # between the two prints) - confirm in rag.ingestion.
    ingestion = Ingestion(
        index_name=index_name,
        chunking_strategy=chunking_strategy,
        metric=similarity_metric,
        directory=path,
        embedding_model_name=embeddings_option,
    )
    print("===Index sent to pinecone===")

===Creating index: fs-none-cos-3-large===


100%|██████████| 2/2 [00:08<00:00,  4.22s/it]
Created a chunk of size 1290, which is longer than the specified 512
Created a chunk of size 844, which is longer than the specified 512
Created a chunk of size 1198, which is longer than the specified 512
Created a chunk of size 1458, which is longer than the specified 512
Created a chunk of size 630, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 660, which is longer than the specified 512
Created a chunk of size 744, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 760, which is longer than the specified 512
Created a chunk of size 652, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 962, which is longer than the specified 512
Created a chunk of size 1081, which is longer than the specified 51

Index fs-none-cos-3-large created.
===Index sent to pinecone===
===Creating index: fs-none-cos-ada===


100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
Created a chunk of size 1290, which is longer than the specified 512
Created a chunk of size 844, which is longer than the specified 512
Created a chunk of size 1198, which is longer than the specified 512
Created a chunk of size 1458, which is longer than the specified 512
Created a chunk of size 630, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 660, which is longer than the specified 512
Created a chunk of size 744, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 760, which is longer than the specified 512
Created a chunk of size 652, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 962, which is longer than the specified 512
Created a chunk of size 1081, which is longer than the specified 51

Index fs-none-cos-ada created.
===Index sent to pinecone===
===Creating index: fs-none-dotp-3-large===


100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
Created a chunk of size 1290, which is longer than the specified 512
Created a chunk of size 844, which is longer than the specified 512
Created a chunk of size 1198, which is longer than the specified 512
Created a chunk of size 1458, which is longer than the specified 512
Created a chunk of size 630, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 660, which is longer than the specified 512
Created a chunk of size 744, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 760, which is longer than the specified 512
Created a chunk of size 652, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 962, which is longer than the specified 512
Created a chunk of size 1081, which is longer than the specified 51

Index fs-none-dotp-3-large created.
===Index sent to pinecone===
===Creating index: fs-none-dotp-ada===


100%|██████████| 2/2 [00:02<00:00,  1.05s/it]
Created a chunk of size 1290, which is longer than the specified 512
Created a chunk of size 844, which is longer than the specified 512
Created a chunk of size 1198, which is longer than the specified 512
Created a chunk of size 1458, which is longer than the specified 512
Created a chunk of size 630, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 660, which is longer than the specified 512
Created a chunk of size 744, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 760, which is longer than the specified 512
Created a chunk of size 652, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 962, which is longer than the specified 512
Created a chunk of size 1081, which is longer than the specified 51

Index fs-none-dotp-ada created.
===Index sent to pinecone===
===Creating index: fs-context-emb-cos-3-large===


100%|██████████| 2/2 [00:02<00:00,  1.10s/it]
Created a chunk of size 1290, which is longer than the specified 512
Created a chunk of size 844, which is longer than the specified 512
Created a chunk of size 1198, which is longer than the specified 512
Created a chunk of size 1458, which is longer than the specified 512
Created a chunk of size 630, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 660, which is longer than the specified 512
Created a chunk of size 744, which is longer than the specified 512
Created a chunk of size 521, which is longer than the specified 512
Created a chunk of size 760, which is longer than the specified 512
Created a chunk of size 652, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 962, which is longer than the specified 512
Created a chunk of size 1081, which is longer than the specified 51

KeyboardInterrupt: 

In [4]:
# Option lists for the retrieval stage; nothing in the cells above reads them.
retrieval_strategies = [
    "hybrid",
    "semantic",
    "exact",
]
pre_retrieval_techniques = [
    "hyde",
]
post_retrieval_techniques = [
    "rerank",
]