## Data Ingest Optimization Demo - Simple

In [None]:
# First clone the RAGBuilder repo
# NOTE(review): this cell has no execution count (In [None]), so it was not run in
# the saved session; confirm whether cloning alone makes `ragbuilder` importable
# or whether an install step is also needed before the import cell below.
!git clone https://github.com/KruxAI/ragbuilder.git

In [1]:
import ragbuilder.data_ingest.optimization as data_ingest

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Minimal configuration for the simple demo: only the source document and the
# evaluation dataset are given; no other option is set in this dict.
demo_config = dict(
    input_source="lillog_agents.pdf",
    test_dataset="rag_test_data_lilianweng_gpt-4o_1721032414.736622_SEMI.csv",
)

In [3]:
# Run the optimization with the minimal config and unpack the result into three
# names. The recorded output below shows 10 trials that vary only `chunk_size`.
best_config, best_score, best_indexer = data_ingest.run_optimization_from_dict(demo_config)

[I 2024-11-13 00:56:55,919] A new study created in memory with name: data_ingest_1731439614482


  0%|          | 0/10 [00:00<?, ?it/s]

Output()

[I 2024-11-13 00:57:11,857] Trial 0 finished with value: 0.7791137912869454 and parameters: {'chunk_size': 2500}. Best is trial 0 with value: 0.7791137912869454.


[I 2024-11-13 00:57:11,865] Trial 1 finished with value: 0.7791137912869454 and parameters: {'chunk_size': 2500}. Best is trial 0 with value: 0.7791137912869454.


Output()

[I 2024-11-13 00:57:23,727] Trial 2 finished with value: 0.8015819494922954 and parameters: {'chunk_size': 1500}. Best is trial 2 with value: 0.8015819494922954.


Output()

[I 2024-11-13 00:57:34,770] Trial 3 finished with value: 0.7654710897803307 and parameters: {'chunk_size': 3000}. Best is trial 2 with value: 0.8015819494922954.


[I 2024-11-13 00:57:34,775] Trial 4 finished with value: 0.7654710897803307 and parameters: {'chunk_size': 3000}. Best is trial 2 with value: 0.8015819494922954.


Output()

[I 2024-11-13 00:57:47,331] Trial 5 finished with value: 0.8080389504631361 and parameters: {'chunk_size': 1000}. Best is trial 5 with value: 0.8080389504631361.


[I 2024-11-13 00:57:47,337] Trial 6 finished with value: 0.8015819494922954 and parameters: {'chunk_size': 1500}. Best is trial 5 with value: 0.8080389504631361.


[I 2024-11-13 00:57:47,342] Trial 7 finished with value: 0.8015819494922954 and parameters: {'chunk_size': 1500}. Best is trial 5 with value: 0.8080389504631361.


[I 2024-11-13 00:57:47,348] Trial 8 finished with value: 0.8015819494922954 and parameters: {'chunk_size': 1500}. Best is trial 5 with value: 0.8080389504631361.


[I 2024-11-13 00:57:47,354] Trial 9 finished with value: 0.8080389504631361 and parameters: {'chunk_size': 1000}. Best is trial 5 with value: 0.8080389504631361.


Output()

In [4]:
# Spot-check the indexer returned by the simple run with a sample question; the
# recorded output is a list of tuples whose Documents come from lillog_agents.pdf.
best_indexer.similarity_search_with_relevance_scores("What is task decomposition")

[(Document(metadata={'source': 'lillog_agents.pdf'}, page_content="A complicated task usually involves many steps. An agent needs to know what they are and plan\n\nahead.\n\nFAQ emojisearch.app\n\nLil'Log\n\nTask Decomposition\n\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for\n\nenhancing model performance on complex tasks. The model is instructed to “think step by step” to\n\nutilize more test-time computation to decompose hard tasks into smaller and simpler steps. CoT\n\ntransforms big tasks into multiple manageable tasks and shed lights into an interpretation of the\n\nmodelʼs thinking process.\n\nTree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at\n\neach step. It first decomposes the problem into multiple thought steps and generates multiple\n\nthoughts per step, creating a tree structure. The search process can be BFS (breadth-first search)\n\nor DFS (depth-first search) with each state evaluated b

## Data Ingest Optimization Demo - Advanced

In [4]:
# Advanced configuration: each pipeline stage lists the candidate settings that
# are handed to run_optimization_from_dict in the next cell.
options_config = {
    # Source document and evaluation dataset. The CSV has to be prepared
    # beforehand with test questions relevant to the document.
    "input_source": "uber_10k.pdf",
    "test_dataset": "rag_test_data_1726600284.375674 uber10k_shortlist.csv",

    # Candidate document loaders (order is preserved: pymupdf, unstructured, pypdf)
    "document_loaders": [
        {"type": loader_type} for loader_type in ("pymupdf", "unstructured", "pypdf")
    ],

    # Candidate chunking strategies: one named splitter and one custom class
    "chunking_strategies": [
        {
            "type": "RecursiveCharacterTextSplitter",
            "chunker_kwargs": {"separators": ["\n\n", "\n", " ", ""]},
        },
        {"type": "custom", "custom_class": "ragbuilder.custom_components.CustomChunker"},
    ],

    # Chunk-size range given as min / max / stepsize
    "chunk_size": {"min": 500, "max": 2000, "stepsize": 500},

    # Chunk overlap candidates (only a single value is listed here)
    "chunk_overlap": [100],

    # Candidate embedding models
    "embedding_models": [
        {
            "type": "huggingface",
            "model_kwargs": {"model_name": "mixedbread-ai/mxbai-embed-large-v1"},
        },
        {
            "type": "openai",
            "model_kwargs": {"model": "text-embedding-3-large"},
        },
    ],

    # Vector database candidates (a single Chroma entry)
    "vector_databases": [
        {
            "type": "chroma",
            "vectordb_kwargs": {
                "persist_directory": "chroma_sample2",
                "collection_metadata": {"hnsw:space": "cosine"},
            },
        },
    ],

    # Optimization settings
    # NOTE(review): study_name (and database_path at the bottom) say
    # "lillog_agents" although input_source is uber_10k.pdf; this looks like a
    # carry-over from the other demo. Confirm before renaming.
    "optimization": {
        "n_trials": 30,
        "n_jobs": 1,
        "study_name": "lillog_agents_study",
        "optimization_direction": "maximize",
    },

    # Evaluation settings; position_weights give more weight to top results
    "evaluation_config": {
        "type": "similarity",
        "evaluator_kwargs": {
            "top_k": 3,
            "relevance_threshold": 0.2,
            "position_weights": [1.0, 0.5, 0.3],
        },
    },

    # Logging configuration (log_level 20 is the INFO level)
    "log_config": {"log_level": 20, "show_progress_bar": True, "verbose": True},

    # Enable database logging and name the database file
    "database_logging": True,
    "database_path": "lillog_agents_eval.db",
}

In [5]:
# Run the optimization with the advanced config. This rebinds best_config,
# best_score and best_indexer, so the results of the simple demo above are no
# longer reachable under these names. The recorded output shows 30 trials.
best_config, best_score, best_indexer = data_ingest.run_optimization_from_dict(options_config)

[I 2024-11-12 18:56:47,227] A new study created in memory with name: lillog_agents_study


  0%|          | 0/30 [00:00<?, ?it/s]

Output()

[I 2024-11-12 18:57:11,480] Trial 0 finished with value: 0.6989110708236694 and parameters: {'document_loader_index': 2, 'chunking_strategy_index': 0, 'chunk_size': 1000, 'embedding_model_index': 1}. Best is trial 0 with value: 0.6989110708236694.


Output()

[I 2024-11-12 18:57:45,225] Trial 1 finished with value: 0.7177550182456062 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 0, 'chunk_size': 1500, 'embedding_model_index': 1}. Best is trial 1 with value: 0.7177550182456062.


Output()

[I 2024-11-12 18:58:14,515] Trial 2 finished with value: 0.665955542571961 and parameters: {'document_loader_index': 2, 'chunking_strategy_index': 0, 'chunk_size': 500, 'embedding_model_index': 1}. Best is trial 1 with value: 0.7177550182456062.


[I 2024-11-12 18:58:14,524] Trial 3 finished with value: 0.6989110708236694 and parameters: {'document_loader_index': 2, 'chunking_strategy_index': 0, 'chunk_size': 1000, 'embedding_model_index': 1}. Best is trial 1 with value: 0.7177550182456062.


Output()

[I 2024-11-12 18:58:44,326] Trial 4 finished with value: 0.7334579535506522 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 0, 'chunk_size': 2000, 'embedding_model_index': 1}. Best is trial 4 with value: 0.7334579535506522.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 18:59:44,982] Trial 5 finished with value: 0.8228296845678299 and parameters: {'document_loader_index': 2, 'chunking_strategy_index': 1, 'chunk_size': 500, 'embedding_model_index': 0}. Best is trial 5 with value: 0.8228296845678299.


Output()

class_path:  ragbuilder.custom_components.CustomChunker


[I 2024-11-12 18:59:58,575] Trial 6 finished with value: 0.7381601147392043 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 1}. Best is trial 5 with value: 0.8228296845678299.


Output()

[I 2024-11-12 19:00:13,054] Trial 7 finished with value: 0.7003594400390746 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 0, 'chunk_size': 1000, 'embedding_model_index': 1}. Best is trial 5 with value: 0.8228296845678299.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:01:21,207] Trial 8 finished with value: 0.8365160075003434 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1000, 'embedding_model_index': 0}. Best is trial 8 with value: 0.8365160075003434.


Output()

class_path:  ragbuilder.custom_components.CustomChunker


[I 2024-11-12 19:01:44,429] Trial 9 finished with value: 0.7487705385359636 and parameters: {'document_loader_index': 2, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 1}. Best is trial 8 with value: 0.8365160075003434.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:02:54,705] Trial 10 finished with value: 0.827746057321155 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 500, 'embedding_model_index': 0}. Best is trial 8 with value: 0.8365160075003434.


[I 2024-11-12 19:02:54,748] Trial 11 finished with value: 0.827746057321155 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 500, 'embedding_model_index': 0}. Best is trial 8 with value: 0.8365160075003434.


[I 2024-11-12 19:02:54,780] Trial 12 finished with value: 0.827746057321155 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 500, 'embedding_model_index': 0}. Best is trial 8 with value: 0.8365160075003434.


[I 2024-11-12 19:02:54,802] Trial 13 finished with value: 0.8365160075003434 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1000, 'embedding_model_index': 0}. Best is trial 8 with value: 0.8365160075003434.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:04:01,057] Trial 14 finished with value: 0.8374550926422355 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


[I 2024-11-12 19:04:01,093] Trial 15 finished with value: 0.8374550926422355 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


[I 2024-11-12 19:04:01,105] Trial 16 finished with value: 0.8374550926422355 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:04:59,751] Trial 17 finished with value: 0.8340479097262976 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:06:33,349] Trial 18 finished with value: 0.8353207516048821 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


[I 2024-11-12 19:06:33,499] Trial 19 finished with value: 0.8374550926422355 and parameters: {'document_loader_index': 1, 'chunking_strategy_index': 1, 'chunk_size': 1500, 'embedding_model_index': 0}. Best is trial 14 with value: 0.8374550926422355.


class_path:  ragbuilder.custom_components.CustomChunker


Output()

[I 2024-11-12 19:07:43,522] Trial 20 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,552] Trial 21 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,561] Trial 22 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,571] Trial 23 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,581] Trial 24 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,590] Trial 25 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,598] Trial 26 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,651] Trial 27 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


[I 2024-11-12 19:07:43,693] Trial 28 finished with value: 0.8444499783929648 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 1, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


Output()

[I 2024-11-12 19:08:50,115] Trial 29 finished with value: 0.8400242849001808 and parameters: {'document_loader_index': 0, 'chunking_strategy_index': 0, 'chunk_size': 2000, 'embedding_model_index': 0}. Best is trial 20 with value: 0.8444499783929648.


Output()

In [6]:
# Spot-check the indexer returned by the advanced run with a question about the
# Uber 10-K; the recorded output's Documents are sourced from uber_10k.pdf.
best_indexer.similarity_search_with_relevance_scores("What was the financial impact of the sale of Uber's ATG Business to Aurora in the first quarter of 2021?")

[(Document(metadata={'source': 'uber_10k.pdf'}, page_content="We entered into a commercial agreement with Aurora pursuant to which the parties will collaborate with best efforts to launch and commercialize self-driving vehicles on our ridesharing network. We also allowed unvested RSUs for Uber stock held by employees of the ATG Business that transferred to Aurora to continue to vest over the next 12 months contingent upon the employee remaining at Aurora. As a result, we recognized liabilities of $315 million as consideration for these future obligations to Aurora.\n\nThe sale of the ATG Business did not represent a strategic shift that would have had a major effect on our operations and financial results, and therefore does not qualify for reporting as a discontinued operation. The resulting gain on disposal was recorded in other income (expense), net in the consolidated statements of operations.\n\nThe following table presents the gain on sale of the ATG Business (in millions):\n\nYe