## Document Functions

In [1]:
import requests
import json
import time

def get_document_data(
    document_url: str,
    document_type: str
) -> any:
    data = None
    response = requests.get(
        url = document_url
    )
    if response.status_code == 200:
        if document_type == 'text':
            data = response.text
        if document_type == 'json':
            data = json.loads(response.text)
        # handle html later
    return data

def scrape_documents(
    url_list: any,
    timeout: int
) -> any:
    documents = []

    text_files = [
        'py',
        'md',
        'yaml',
        'sh'
    ]

    json_files = [
        'ipynb'
    ]
    index = 0
    for url in url_list:
        document = {
            'name': '',
            'data': ''
        }
        url_split = url.split('/')
        if 'github' in url_split[2]:
            if 'raw' in url_split[2]:
                file_end = url_split[-1].split('.')[-1]
                document['name'] = url_split[-1]
                if file_end in text_files:
                    document['data'] = get_document_data(
                        document_url = url,
                        document_type = 'text' 
                    )
                if file_end in json_files:
                    document['data'] = get_document_data(
                        document_url = url,
                        document_type = 'json' 
                    )
        documents.append(document)
        index = index + 1
        if index < len(url_list):
            time.sleep(timeout)
    return documents

# Scraping

In [2]:
wanted_urls = [
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/experiments/article/cloud-hpc/Cloud-HPC-FMNIST-Experiment.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/frontend/functions/platforms/redis.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/deployment/monitoring/kustomization.yaml',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/deployment/production/stack.yaml'
]

In [2]:
wanted_urls = [
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/deployment/production/stack.yaml'
]

In [3]:
scraped_documents = scrape_documents(
    url_list = wanted_urls,
    timeout = 5
)

## Parsing Functions

In [2]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import re

def tree_extract_imports(
    node: any, 
    code_text: str
) -> any:
    imports = []
    if node.type == 'import_statement' or node.type == 'import_from_statement':
        start_byte = node.start_byte
        end_byte = node.end_byte
        imports.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        imports.extend(tree_extract_imports(child, code_text))
    return imports

def tree_extract_dependencies(
    node: any, 
    code_text: str
) -> any:
    dependencies = []
    for child in node.children:
        if child.type == 'call':
            dependency_name = child.child_by_field_name('function').text.decode('utf8')
            dependencies.append(dependency_name)
        dependencies.extend(tree_extract_dependencies(child, code_text))
    return dependencies

def tree_extract_code_and_dependencies(
    node: any,
    code_text: str
) -> any:
    codes = []
    if not node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name')
        if name is None:
            code = code_text[start_byte:end_byte].decode('utf8')
            if not 'def' in code:
                dependencies = tree_extract_dependencies(node, code_text)
                codes.append({
                    'name': 'global',
                    'code': code,
                    'dependencies': dependencies
                })
    return codes

def tree_extract_functions_and_dependencies(
    node: any, 
    code_text: str
) -> any:
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name').text.decode('utf8')
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = tree_extract_dependencies(node, code_text)
        functions.append({
            'name': name,
            'code': code,
            'dependencies': dependencies
        })
    for child in node.children:
        functions.extend(tree_extract_functions_and_dependencies(child, code_text))
    return functions

def tree_get_used_imports(
    general_imports: any,
    function_dependencies: any
) -> any:
    parsed_imports = {}
    for code_import in general_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in function_dependencies:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

def tree_get_used_functions(
    general_functions: any,
    function_dependencies: any
): 
    used_functions = []
    for related_function_name in function_dependencies:
        for function in general_functions:
            if function['name'] == related_function_name:
                used_functions.append('from ice import ' + function['name'])
    return used_functions

def tree_create_code_document(
    code_imports: any,
    code_functions: any,
    function_item: any
) -> any:
    used_imports = tree_get_used_imports(
        general_imports = code_imports,
        function_dependencies = function_item['dependencies']
    )

    used_functions = tree_get_used_functions(
        general_functions = code_functions,
        function_dependencies = function_item['dependencies']
    )
    
    document = {
        'imports': used_imports,
        'functions': used_functions,
        'name': function_item['name'],
        'dependencies': function_item['dependencies'],
        'code': function_item['code']
    }
    
    return document
     
def tree_format_code_document(
    code_document: any
) -> any:
    formatted_document = ''
    for doc_import in code_document['imports']:
        formatted_document += doc_import + '\n'

    for doc_functions in code_document['functions']:
        formatted_document += doc_functions + '\n'

    if 0 < len(code_document['dependencies']):
        formatted_document += 'code dependencies\n'

        for doc_dependency in code_document['dependencies']:
            formatted_document += doc_dependency + '\n'

    if code_document['name'] == 'global':
        formatted_document += code_document['name'] + ' code\n'
    else:
        formatted_document += 'function ' + code_document['name'] + ' code\n'
    
    for line in code_document['code'].splitlines():
        if not bool(line.strip()):
            continue
        doc_code = re.sub(r'#.*','', line)
        if not bool(doc_code.strip()):
            continue
        formatted_document += doc_code + '\n'    
    return formatted_document

def tree_create_python_code_and_function_documents(
    code_document: any
):
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
   
    tree = parser.parse(
        bytes(
            code_document,
            "utf8"
        )
    )

    root_node = tree.root_node
    code_imports = tree_extract_imports(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_global = tree_extract_code_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_functions = tree_extract_functions_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )
    
    initial_documents = []
    for item in code_global:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    for item in code_functions:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    formatted_documents = []
    seen_functions = []
    for document in initial_documents:
        if not document['name'] == 'global':
            if document['name'] in seen_functions:
                continue
        
        formatted_document = tree_format_code_document(
            code_document = document
        )

        formatted_documents.append(formatted_document)
        seen_functions.append(document['name'])
    return formatted_documents

## Document Functions

In [3]:
import nbformat
from bs4 import BeautifulSoup
import markdown

def extract_jupyter_notebook_markdown_and_code(
    notebook_document: any
): 
    notebook_documents = {
        'markdown': [],
        'code': []
    }

    notebook = nbformat.from_dict(notebook_document)

    index = 1
    for cell in notebook.cells:
        if cell.cell_type == 'markdown':
            notebook_documents['markdown'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
        if cell.cell_type == 'code':
            notebook_documents['code'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
    
    return notebook_documents
    
def parse_markdown_into_text(
    markdown_text: any
) -> any:
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, features='html.parser')
    text = soup.get_text()
    code_block_pattern = re.compile(r"```")
    text = re.sub(code_block_pattern, '', text)
    text = text.rstrip('\n')
    text = text.replace('\nsh', '\n')
    text = text.replace('\nbash', '\n')
    return text

def create_python_documents(
    python_document: any
): 
    joined_code = ''.join(python_document)
    block_code_documents = tree_create_python_code_and_function_documents(
        code_document = joined_code
    )

    code_documents = []
    seen_function_names = []
    code_doc_index = 0
    for code_doc in block_code_documents:
        row_split = code_doc.split('\n')
        for row in row_split:
            if 'function' in row and 'code' in row:
                function_name = row.split(' ')[1]
                if not function_name in seen_function_names:
                    seen_function_names.append(function_name)
                else:
                    del block_code_documents[code_doc_index]
        code_doc_index += 1

    if 0 < len(block_code_documents):
        index = 1
        for code_doc in block_code_documents:
            code_documents.append({
                'index': index,
                'data': code_doc
            })
            index += 1
        
    formatted_documents = {
        'code': code_documents
    }
    return formatted_documents

def create_notebook_documents(
    notebook_document: any
):
    notebook_documents = extract_jupyter_notebook_markdown_and_code(
        notebook_document = notebook_document
    )

    markdown_documents = []
    for block in notebook_documents['markdown']:
        joined_text = ''.join(block['data'])
        markdown_text = parse_markdown_into_text(
            markdown_text = joined_text
        )
        markdown_documents.append({
            'index': block['id'],
            'data': markdown_text
        })
        
    code_documents = []
    seen_function_names = []
    for block in notebook_documents['code']:
        joined_code = ''.join(block['data'])
        block_code_documents = tree_create_python_code_and_function_documents(
            code_document = joined_code
        )

        code_doc_index = 0
        for code_doc in block_code_documents:
            row_split = code_doc.split('\n')
            for row in row_split:
                if 'function' in row and 'code' in row:
                    function_name = row.split(' ')[1]
                    if not function_name in seen_function_names:
                        seen_function_names.append(function_name)
                    else:
                        del block_code_documents[code_doc_index]
            code_doc_index += 1
        
        if 0 < len(block_code_documents):
            sub_indexes = False
            if 1 < len(block_code_documents):
                sub_indexes = True
            index = 1
            for code_doc in block_code_documents:
                if sub_indexes:
                    code_documents.append({
                        'sub-index': index, 
                        'index': block['id'],
                        'data': code_doc
                    })
                else:
                    code_documents.append({ 
                        'index': block['id'],
                        'data': code_doc
                    })
                index += 1
            
    formatted_documents = {
        'markdown': markdown_documents,
        'code': code_documents
    }
    
    return formatted_documents

## Mongo Functions

In [66]:
from pymongo import MongoClient as mc

def mongo_is_client(
    storage_client: any
) -> any:
    return isinstance(storage_client, mc.Connection)

def mongo_setup_client(
    username: str,
    password: str,
    address: str,
    port: str
) -> any:
    connection_prefix = 'mongodb://(username):(password)@(address):(port)/'
    connection_address = connection_prefix.replace('(username)', username)
    connection_address = connection_address.replace('(password)', password)
    connection_address = connection_address.replace('(address)', address)
    connection_address = connection_address.replace('(port)', port)
    mongo_client = mc(
        host = connection_address
    )
    return mongo_client

def mongo_get_database(
    mongo_client: any,
    database_name: str
) -> any:
    try:
        database = mongo_client[database_name]
        return database
    except Exception as e:
        return None

def mongo_check_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database_exists = database_name in mongo_client.list_database_names()
        return database_exists
    except Exception as e:
        return False

def mongo_list_databases(
    mongo_client: any
) -> any:
    try:
        databases = mongo_client.list_database_names()
        return databases
    except Exception as e:
        return []

def mongo_remove_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        mongo_client.drop_database(database_name)
        return True
    except Exception as e:
        return False

def mongo_get_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collection = database[collection_name]
        return collection
    except Exception as e:
        return None
    
def mongo_check_collection(
    mongo_client: any, 
    database_name: any, 
    collection_name: any
) -> bool:
    try:
        database = mongo_client[database_name]
        collection_exists = collection_name in database.list_collection_names()
        return collection_exists
    except Exception as e:
        return False

def mongo_update_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any, 
    update_query: any
) -> any:
    try:
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_many(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_list_collections(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collections = database.list_collection_names()
        return collections
    except Exception as e:
        return []

def mongo_remove_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try: 
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        database.drop_collection(collection_name)
        return True
    except Exception as e:
        return False

def mongo_create_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    document: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.insert_one(document)
        return result
    except Exception as e:
        return None

def mongo_get_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
):
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        document = collection.find_one(filter_query)
        return document
    except Exception as e:
        print(e)
        return None 

def mongo_list_documents(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any,
    sorting_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        documents = list(collection.find(filter_query).sort(sorting_query))
        return documents
    except Exception as e:
        return []

def mongo_update_document(
    mongo_client: any, 
    database_name: any, 
    collection_name: any, 
    filter_query: any, 
    update_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_one(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_remove_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
) -> bool:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.delete_one(filter_query)
        return result
    except Exception as e:
        return None

# Storage

In [5]:
def store_scraped_documents(
    mongo_client: any,
    database_prefix: str,
    documents: any
):
    for document in documents:
        file_name = document['name']
        file_data = document['data']
        
        formatted_documents = {}
        if '.ipynb' in file_name:
            formatted_documents = create_notebook_documents(
                notebook_document = file_data
            )
            document_database_name = database_prefix + '-workflows'
        if '.py' in file_name:
            formatted_documents = create_python_documents(
                python_document = file_data
            )
            document_database_name = database_prefix + '-code'
        
        for doc_type, doc_data in formatted_documents.items():
            for document in doc_data:
                document_data = document['data']
                document_index = document['index']
                document_sub_index = 0

                if 'sub-index' in document:
                    document_sub_index = document['sub-index']
                
                result = mongo_create_document(
                    mongo_client = mongo_client,
                    database_name = document_database_name,
                    collection_name = file_name,
                    document = {
                        'index': int(document_index),
                        'sub-index': int(document_sub_index),
                        'type': doc_type,
                        'data': document_data
                    }
                )

In [6]:
mongo_client = mongo_setup_client(
    username = 'mongo123',
    password = 'mongo456',
    address = '127.0.0.1',
    port = '27017'
)

In [9]:
store_scraped_documents(
    mongo_client = mongo_client,
    database_prefix = 'llm-rag',
    documents = scraped_documents
)

## LangChain Functions

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

def langchain_generate_code_document_chunks(
    language: any,
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter.from_language(
        language = language,
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def langchain_generate_document_chunk_embeddings(
    model_name: str,
    document_chunks: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    chunk_embeddings = embedding_model.embed_documents(
        texts = document_chunks
    )
    return chunk_embeddings

## Qdrant Functions

In [8]:
from qdrant_client import QdrantClient as qc
from qdrant_client import models

def qdrant_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, qc.Connection)
    except Exception as e:
        return False

def qdrant_setup_client(
    api_key: str,
    address: str, 
    port: str
) -> any:
    try:
        qdrant_client = qc(
            host = address,
            port = int(port),
            api_key = api_key,
            https = False
        ) 
        return qdrant_client
    except Exception as e:
        return None

def qdrant_create_collection(
    qdrant_client: any, 
    collection_name: str,
    configuration: any
) -> any:
    try:
        result = qdrant_client.create_collection(
            collection_name = collection_name,
            vectors_config = configuration
        )
        return result
    except Exception as e:
        return None

def qdrant_get_collection(
    qdrant_client: any, 
    collection_name: str
) -> any:
    try:
        collection = qdrant_client.get_collection(
            collection_name = collection_name
        )
        return collection
    except Exception as e:
        return None

def qdrant_list_collections(
    qdrant_client: any
) -> any:
    try:
        collections = qdrant_client.get_collections()
        collection_list = []
        for description in collections.collections:
            collection_list.append(description.name)
        return collection_list
    except Exception as e:
        return []

def qdrant_remove_collection(
    qdrant_client: any, 
    collection_name: str
) -> bool:
    try:
        qdrant_client.delete_collection(collection_name)
        return True
    except Exception as e:
        return False

def qdrant_upsert_points(
    qdrant_client: qc, 
    collection_name: str,
    points: any
) -> any:
    try:
        results = qdrant_client.upsert(
            collection_name = collection_name, 
            points = points
        )
        return results
    except Exception as e:
        print(e)
        return None

def qdrant_search_data(
    qdrant_client: qc,  
    collection_name: str,
    scroll_filter: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.scroll(
            collection_name = collection_name,
            scroll_filter = scroll_filter,
            limit = limit
        )
        return hits
    except Exception as e:
        print(e)
        return []

def qdrant_search_vectors(
    qdrant_client: qc,  
    collection_name: str,
    query_vector: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            limit = limit
        )
        return hits
    except Exception as e:
        return []

def qdrant_remove_vectors(
    qdrant_client: qc,  
    collection_name: str, 
    vectors: str
) -> bool:
    try:
        results = qdrant_client.delete_vectors(
            collection_name = collection_name,
            vectors = vectors
        )
        return results
    except Exception as e:
        print(f"Error removing document: {e}")
        return None


# Getting documents

In [9]:
database_prefix = 'llm-rag'
storage_structure = {}
database_list = mongo_list_databases(
    mongo_client = mongo_client
)
for database in database_list:
    if database_prefix in database:
        collection_list = mongo_list_collections(
            mongo_client = mongo_client,
            database_name = database
        )
        storage_structure[database] = collection_list

In [10]:
from pymongo import ASCENDING, DESCENDING

storage_documents = {}
for database, collections in storage_structure.items():
    if not database in storage_documents:
        storage_documents[database] = {}
    for collection in collections:
        collection_documents = mongo_list_documents(
            mongo_client = mongo_client,
            database_name = database,
            collection_name = collection,
            filter_query = {},
            sorting_query = [
                ('index', ASCENDING),
                ('sub-index', ASCENDING)
            ]
        )
        storage_documents[database][collection] = collection_documents

# Vector Embeddings

In [11]:
def langchain_generate_code_document_chunks(
    language: any,
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter.from_language(
        language = language,
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def lanchain_generate_text_document_chunks(
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def langchain_generate_document_chunk_embeddings(
    model_name: str,
    document_chunks: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    chunk_embeddings = embedding_model.embed_documents(
        texts = document_chunks
    )
    return chunk_embeddings

In [17]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

def generate_document_vector_packet(
    document: any,
    configuration: any,
) -> any:
    document_type = document['type']
    used_configuration = configuration[document_type]
    
    document_chunks = []
    if document_type == 'code':
        document_chunks = langchain_generate_code_document_chunks(
            language = Language.PYTHON,
            chunk_size = used_configuration['chunk-size'],
            chunk_overlap = used_configuration['chunk-overlap'],
            document = document['data']
        )
    if document_type == 'markdown':
        document_chunks = lanchain_generate_text_document_chunks(
            chunk_size = used_configuration['chunk-size'],
            chunk_overlap = used_configuration['chunk-overlap'],
            document = document['data']
        )
        
    vector_embedding = langchain_generate_document_chunk_embeddings(
        model_name = used_configuration['model-name'],
        document_chunks = document_chunks
    )

    packet = {
        'chunks': document_chunks,
        'embeddings': vector_embedding
    }
    
    return packet

In [18]:
qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '6333'
)



In [65]:
from qdrant_client import QdrantClient, models
qdrant_client.scroll(
    collection_name = 'llm-rag-workflows-embeddings',
    scroll_filter = models.Filter(
        must = [
            models.FieldCondition(
                key='chunk_hash',
                match = models.MatchValue(value="3c177a2a09134a89280e93580f24d2d9")
            )
        ]
    ),
    limit = 5
)

([Record(id='005732c9-4ce0-5230-bb85-96de0b7b6c1c', payload={'database': 'llm-rag-workflows', 'collection': 'demo-pipeline.ipynb', 'id': '671b31b2f12238512f2cafd1', 'chunk': 'if len(resp.history) == 0:', 'chunk_hash': '3c177a2a09134a89280e93580f24d2d9'}, vector=None, shard_key=None, order_value=None),
  Record(id='b7416704-cf40-521d-865d-a4ba79fc56d4', payload={'database': 'llm-rag-workflows', 'collection': 'demo-pipeline.ipynb', 'id': '671b31b2f12238512f2cafd1', 'chunk': 'if len(resp.history) == 0:', 'chunk_hash': '3c177a2a09134a89280e93580f24d2d9'}, vector=None, shard_key=None, order_value=None)],
 None)

In [21]:
qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '6333'
)



In [12]:
import hashlib
import numpy as np
import uuid
import re
from qdrant_client.models import VectorParams, Distance
from qdrant_client.models import PointStruct

# for preventing duplicates
def format_chunk(
    document_chunk: any
) -> any:
    chunk = re.sub(r'[^\w\s]', '', document_chunk)
    chunk = re.sub(r'\s+', ' ', chunk) 
    chunk = chunk.strip()
    chunk = chunk.lower()
    # This helps to remove unique hashes for duplicates such as:
    # task_id = task_id )
    # task_id = task_id 
    # task_id = task_id )
    return chunk

def generate_chunk_hash(
    document_chunk: any
) -> any:
    cleaned_chunk = format_chunk(
        document_chunk = document_chunk
    )
    return hashlib.md5(cleaned_chunk.encode('utf-8')).hexdigest()

vector_configuration = {
    'code': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    },
    'markdown': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    }
}

# 
for database, collections in storage_documents.items():
    vector_collection_name = database + '-embeddings'
    for collection, documents in collections.items():
        for document in documents:
            document_id = str(document['_id'])
            document_type = document['type']
            
            vector_packet = generate_document_vector_packet(
                document = document,
                configuration = vector_configuration
            )
            
            document_chunks = vector_packet['chunks']
            document_embeddings = vector_packet['embeddings']
            if 0 < len(document_embeddings):
                vector_collections = qdrant_list_collections(
                    qdrant_client = qdrant_client
                )

                if not vector_collection_name in vector_collections:
                    vector_collection_configuration = VectorParams(
                          size = len(document_embeddings[0]), 
                          distance = Distance.COSINE
                    )
                    collection_created = qdrant_create_collection(
                        qdrant_client = qdrant_client,
                        collection_name = vector_collection_name,
                        configuration = vector_collection_configuration
                    )

                vector_points = []
                vector_index = 0
                added_hashes = []
                for chunk in document_chunks:
                    vector_id = document_id + '-' + str(vector_index + 1)
                    vector_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, vector_id))

                    chunk_hash = generate_chunk_hash(
                        document_chunk = chunk
                    )
                   
                    existing_chunks = qdrant_search_data(
                        qdrant_client = qdrant_client,
                        collection_name = vector_collection_name,
                        scroll_filter = models.Filter(
                            must = [
                                models.FieldCondition(
                                    key = 'chunk_hash',
                                    match = models.MatchValue(
                                        value = chunk_hash
                                    )
                                )
                            ]
                        ),
                        limit = 1
                    )
                    # Removes duplicates
                    if len(existing_chunks[0]) == 0:
                        if not chunk_hash in added_hashes:
                            given_vector = document_embeddings[vector_index]

                            chunk_point = PointStruct(
                                id = vector_uuid, 
                                vector = given_vector,
                                payload = {
                                    'database': database,
                                    'collection': collection,
                                    'document': document_id,
                                    'type': document_type,
                                    'chunk': chunk,
                                    'chunk_hash': chunk_hash
                                }
                            )
                            added_hashes.append(chunk_hash)
                            vector_points.append(chunk_point)
                    vector_index += 1

                if 0 < len(vector_points):
                    points_stored = qdrant_upsert_points(
                        qdrant_client = qdrant_client, 
                        collection_name = vector_collection_name,
                        points = vector_points
                    )

NameError: name 'generate_document_vector_packet' is not defined

## SpaCy functions

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_document_keywords(
    document: str
):
    doc = nlp(document.lower())
    
    keywords = [
        token.lemma_ for token in doc
        if not token.is_stop               
        and not token.is_punct              
        and not token.is_space              
        and len(token) > 1                  
    ]
    
    keywords = list(set(keywords))
    
    return keywords

## NLTK Functions

In [89]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt_tab')
nltk.download('stopwords')

def get_document_keywords(
    document: any
):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    text = document.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 1]
    
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = list(dict.fromkeys(tokens))

    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Meili Functions

In [14]:
import meilisearch as ms

def meili_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, ms.Connection)
    except Exception as e:
        print(e)
        return False

def meili_setup_client(
    host: str, 
    api_key: str
) -> any:
    try:
        meili_client = ms.Client(
            url = host, 
            api_key = api_key
        )
        return meili_client 
    except Exception as e:
        print(e)
        return None

def meili_get_index( 
    meili_client: any, 
    index_name: str
) -> any:
    try:
        index = meili_client.index(
            uid = index_name
        )
        return index
    except Exception as e:
        print(e)
        return None
    
def meili_check_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        meili_client.get_index(
            uid = index_name
        )
        return True
    except Exception as e:
        print(e)
        return False
    
def meili_remove_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        response = meili_client.index(
            index_name = index_name
        ).delete()
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_list_indexes(
    meili_client: any
) -> bool:
    try:
        indexes = meili_client.get_indexes()
        return indexes
    except Exception as e:
        print(e)
        return None

def meili_add_documents(
    meili_client: any, 
    index_name: str, 
    documents: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.add_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_set_filterable(
    meili_client: any, 
    index_name: str, 
    attributes: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.update_filterable_attributes(attributes)
        return response
    except Exception as e:
        print(e)
        return None

def meili_search_documents(
    meili_client: any, 
    index_name: str, 
    query: any, 
    options: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.search(
            query,
            options
        )
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_update_documents(
    meili_client, 
    index_name, 
    documents
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.update_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_delete_documents(
    meili_client: any, 
    index_name: str, 
    ids: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.delete_documents(
            document_ids = ids
        )
        return response
    except Exception as e:
        print(e)
        return None

## Keywords

In [15]:
meili_client = meili_setup_client(
    host = 'http://127.0.0.1:7700', 
    api_key = 'meili_key'
)

In [16]:
import uuid

for database, collections in storage_documents.items():
    keyword_collection_name = database + '-keywords'
    keyword_documents = []
    for collection, documents in collections.items():
        document_index = 0
        for document in documents:
            document_id = str(document['_id'])
            document_data = document['data']
            document_type = document['type']
            
            document_keywords = get_document_keywords(
                document = document_data
            )
            
            keyword_id = document_id + '-' + str(document_index + 1)
            keyword_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, keyword_id))

            payload = {
                'id': keyword_uuid,
                'database': database,
                'collection': collection,
                'document': document_id,
                'type': document_type,
                'keywords': document_keywords
            }

            keyword_documents.append(payload)
        
    stored = meili_add_documents(
        meili_client = meili_client,
        index_name = keyword_collection_name,
        documents = keyword_documents
    )

## Hybrid Search

In [75]:
def clean_prompt(
    prompt: str
) -> any:
    prompt = prompt.lower()
    prompt = re.sub(r'\s+', ' ', prompt)
    prompt = re.sub(r'[^\w\s]', '', prompt)
    return prompt.strip()

def generate_prompt_embedding_query(
    model_name: str,
    prompt: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    embedding = embedding_model.embed_documents(
        texts = [prompt]
    )
    return embedding[0]

def spacy_find_keywords(
    text: str
):
    formatted = nlp(text.lower())
    
    keywords = [
        token.lemma_ for token in formatted
        if not token.is_stop               
        and not token.is_punct              
        and not token.is_space              
        and len(token) > 1                  
    ]
    
    keywords = list(set(keywords))
    
    return keywords

def generate_prompt_keyword_query(
    prompt: any
) -> any:
    keywords = spacy_find_keywords(
        text = prompt
    )
    keyword_query = ' OR '.join([f'keywords = "{keyword}"' for keyword in keywords])
    return keyword_query

def calculate_keyword_score(
    keyword_query: str,
    keyword_list: any
) -> any:
    match = 0
    asked_keywords = keyword_query.split('OR')
    for asked_keyword in asked_keywords:
        formatted = asked_keyword.replace('keywords =', '')
        formatted = formatted.replace('"', '')
        formatted = formatted.replace(' ', '')
        
        if formatted in keyword_list:
            match += 1
            
    query_length = len(asked_keywords)
    keyword_length = len(keyword_list)

    if match == 0:
        return 0.0

    normalized = match / ((query_length * keyword_length) ** 0.5)
    return normalized
    
def vector_search_collection(
    vector_client: any,
    search_client: any,
    prompt: str,
    top_k: int
):

    cleaned_prompt = clean_prompt(
        prompt = prompt
    )

    prompt_embedding_query = generate_prompt_embedding_query(
        model_name = 'sentence-transformers/all-MiniLM-L6-v2',
        prompt = cleaned_prompt
    )

    prompt_keyword_query = generate_prompt_keyword_query(
        prompt = cleaned_prompt
    )

    qdrant_collections = [
        'llm-rag-code-embeddings',
        'llm-rag-workflows-embeddings'
    ]
    
    recommeded_cases = []
    for collection in qdrant_collections:
        results = qdrant_search_vectors(
            qdrant_client = vector_client,  
            collection_name = collection,
            query_vector = prompt_embedding_query,
            limit = top_k
        ) 
        
        for result in results:
            res_database = result.payload['database']
            res_collection = result.payload['collection']
            res_document = result.payload['document']
            res_type = result.payload['type']
            res_score = result.score
            
            res_case = {
                'source': 'vector',
                'database': res_database,
                'collection': res_collection,
                'document': res_document,
                'type': res_type,
                'score': res_score
            }
            
            recommeded_cases.append(res_case)
            
    meili_collections = [
        'llm-rag-code-keywords',
        'llm-rag-workflows-keywords'
    ]

    recommeded_keyword_cases = []
    for index in meili_collections:
        results = meili_search_documents(
            meili_client = search_client, 
            index_name = index, 
            query = "", 
            options = {
                'filter': prompt_keyword_query,
                'attributesToRetrieve': ['database','collection','document', 'keywords'],
                'limit': top_k
            }
        )

        for result in results['hits']:
            res_database = result['database']
            res_collection = result['collection']
            res_document = result['document']
            res_keywords = result['keywords']
            
            res_score = calculate_keyword_score(
                keyword_query = prompt_keyword_query,
                keyword_list = res_keywords
            )

            res_case = {
                'source': 'search',
                'database': res_database,
                'collection': res_collection,
                'document': res_document,
                'type': res_type,
                'score': res_score
            }

            recommeded_cases.append(res_case)

    return recommeded_cases

def get_top_document_metadata(
    collection: str,
    alpha: float
) -> any:
    df = pd.DataFrame(collection)
    ids_with_both = df.groupby('document')['source'].nunique()
    ids_with_both = ids_with_both[ids_with_both > 1].index
    filtered_df = df[df['document'].isin(ids_with_both)]

    matched_documents = []
    for index_i, row_i in filtered_df[filtered_df['source'] == 'vector'].iterrows():
        vector_source = row_i['source']
        vector_database = row_i['database']
        vector_collection = row_i['collection']
        vector_id = row_i['document']
        vector_type = row_i['type']
        vector_score = row_i['score']
        
        for index_j, row_j in filtered_df[filtered_df['source'] == 'search'].iterrows():
            search_source = row_j['source']
            search_database = row_j['database']
            search_collection = row_j['collection']
            search_id = row_j['document']
            search_type = row_j['type']
            search_score = row_j['score']
            
            if vector_database == search_database:
                if vector_collection == search_collection:
                    if vector_type == search_type:
                        if vector_id == search_id:
                            hybrid_score = vector_score * alpha + search_score * (1-alpha)
    
                            matched_documents.append({
                                'source': 'hybrid',
                                'database': search_database,
                                'collection': search_collection,
                                'document': search_id,
                                'score': hybrid_score
                            })
    
    match_df = pd.DataFrame(matched_documents)
    print(match_df)
    return match_df.nlargest(1, 'score').values.tolist()[0]

In [19]:
example_prompt = 'Can you generate a kfp training component for cloud-HPC pipeline?'

In [22]:
hit_collection = vector_search_collection(
    vector_client = qdrant_client,
    search_client = meili_client,
    prompt = example_prompt,
    top_k = 5
)

  from tqdm.autonotebook import tqdm, trange


In [76]:
top_document_metadata = get_top_document_metadata(
    collection = hit_collection,
    alpha = 0.5
)

   source           database           collection                  document  \
0  hybrid  llm-rag-workflows  demo-pipeline.ipynb  671b31b2f12238512f2cafd2   
1  hybrid  llm-rag-workflows  demo-pipeline.ipynb  671b31b2f12238512f2cafd2   

      score  
0  0.329265  
1  0.295031  


In [63]:
top_document_metadata

['hybrid',
 'llm-rag-workflows',
 'demo-pipeline.ipynb',
 '671b31b2f12238512f2cafd2',
 0.32926466398892446]

In [73]:
from bson.objectid import ObjectId
case_object_id = ObjectId(top_document_metadata[3])
document_query = {'_id': case_object_id}
print(document_query)
rag_document = mongo_get_document(
    mongo_client = mongo_client, 
    database_name = top_document_metadata[1], 
    collection_name = top_document_metadata[2], 
    filter_query = document_query
)

{'_id': ObjectId('671b31b2f12238512f2cafd2')}


In [74]:
rag_document

{'_id': ObjectId('671b31b2f12238512f2cafd2'),
 'index': 8,
 'sub-index': 0,
 'type': 'code',
 'data': 'import kfp\ncode dependencies\nget_istio_auth_session\nkfp.Client\nglobal code\nimport kfp\nKUBEFLOW_ENDPOINT = "http://localhost:8080"\nKUBEFLOW_USERNAME = "user@example.com"\nKUBEFLOW_PASSWORD = "12341234"\nauth_session = get_istio_auth_session(\n    url=KUBEFLOW_ENDPOINT,\n    username=KUBEFLOW_USERNAME,\n    password=KUBEFLOW_PASSWORD\n)\nclient = kfp.Client(host=f"{KUBEFLOW_ENDPOINT}/pipeline", cookies=auth_session["session_cookie"])\n'}

In [25]:
import pandas as pd

df = pd.DataFrame(hit_collection)

In [26]:
df

Unnamed: 0,source,database,collection,document,type,score
0,vector,llm-rag-code,celery.py,671b31b2f12238512f2cafe4,code,0.196113
1,vector,llm-rag-code,celery.py,671b31b2f12238512f2cafe2,code,0.183395
2,vector,llm-rag-code,celery.py,671b31b2f12238512f2cafe5,code,0.181468
3,vector,llm-rag-code,celery.py,671b31b2f12238512f2cafe4,code,0.16365
4,vector,llm-rag-code,celery.py,671b31b2f12238512f2cafe2,code,0.14555
5,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b1f12238512f2cafbc,markdown,0.684614
6,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.571491
7,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafcb,markdown,0.522507
8,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.503023
9,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafdc,code,0.501056


In [34]:
ids_with_both = df.groupby('document')['source'].nunique()
ids_with_both = ids_with_both[ids_with_both > 1].index
filtered_df = df[df['document'].isin(ids_with_both)]

In [39]:
filtered_df

Unnamed: 0,source,database,collection,document,type,score
5,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b1f12238512f2cafbc,markdown,0.684614
6,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.571491
8,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.503023
10,search,llm-rag-workflows,demo-pipeline.ipynb,671b31b1f12238512f2cafbc,code,0.471405
12,search,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.087039


In [48]:
alpha = 0.5

matched_documents = []
for index_i, row_i in filtered_df[filtered_df['source'] == 'vector'].iterrows():
    vector_source = row_i['source']
    vector_database = row_i['database']
    vector_collection = row_i['collection']
    vector_id = row_i['document']
    vector_type = row_i['type']
    vector_score = row_i['score']
    
    for index_j, row_j in filtered_df[filtered_df['source'] == 'search'].iterrows():
        search_source = row_j['source']
        search_database = row_j['database']
        search_collection = row_j['collection']
        search_id = row_j['document']
        search_type = row_j['type']
        search_score = row_j['score']
        
        if vector_database == search_database:
            if vector_collection == search_collection:
                if vector_type == search_type:
                    if vector_id == search_id:
                        hybrid_score = vector_score * alpha + search_score * (1-alpha)

                        matched_documents.append({
                            'source': 'hybrid',
                            'database': search_database,
                            'collection': search_collection,
                            'document': search_id,
                            'score': hybrid_score
                        })
                        

0.32926466398892446
0.29503091398892445


In [38]:
#pivot_df = filtered_df.pivot(index = 'document', columns = 'source', values = 'score')

#print(pivot_df)

#pivot_df = pivot_df.fillna(0)
#pivot_df['hybrid_score'] = (pivot_df['vector'] * 0.7) + (pivot_df['search'] * 0.3)
#pivot_df[['document','vector']]

ValueError: Index contains duplicate entries, cannot reshape

In [35]:
filtered_df

Unnamed: 0,source,database,collection,document,type,score
5,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b1f12238512f2cafbc,markdown,0.684614
6,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.571491
8,vector,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.503023
10,search,llm-rag-workflows,demo-pipeline.ipynb,671b31b1f12238512f2cafbc,code,0.471405
12,search,llm-rag-workflows,demo-pipeline.ipynb,671b31b2f12238512f2cafd2,code,0.087039


In [24]:
hit_collection

[{'source': 'vector',
  'database': 'llm-rag-code',
  'collection': 'celery.py',
  'document': '671b31b2f12238512f2cafe4',
  'type': 'code',
  'score': 0.19611284},
 {'source': 'vector',
  'database': 'llm-rag-code',
  'collection': 'celery.py',
  'document': '671b31b2f12238512f2cafe2',
  'type': 'code',
  'score': 0.18339549},
 {'source': 'vector',
  'database': 'llm-rag-code',
  'collection': 'celery.py',
  'document': '671b31b2f12238512f2cafe5',
  'type': 'code',
  'score': 0.18146801},
 {'source': 'vector',
  'database': 'llm-rag-code',
  'collection': 'celery.py',
  'document': '671b31b2f12238512f2cafe4',
  'type': 'code',
  'score': 0.16364963},
 {'source': 'vector',
  'database': 'llm-rag-code',
  'collection': 'celery.py',
  'document': '671b31b2f12238512f2cafe2',
  'type': 'code',
  'score': 0.14554955},
 {'source': 'vector',
  'database': 'llm-rag-workflows',
  'collection': 'demo-pipeline.ipynb',
  'document': '671b31b1f12238512f2cafbc',
  'type': 'markdown',
  'score': 0.68

In [None]:
def search_by_multiple_keywords(keywords):
    # Create a filter for multiple keywords using OR
    filter_expression = ' OR '.join([f'keywords = "{keyword}"' for keyword in keywords])
    results = index.search("", {
        'filters': filter_expression,
        'attributesToRetrieve': ['id', 'title', 'keywords']
    })
    return results

In [82]:
meili_set_filterable(
    meili_client = meili_client, 
    index_name = 'llm-rag-workflows-keywords', 
    attributes = ['keywords']
)

TaskInfo(task_uid=28, index_uid='llm-rag-workflows-keywords', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2024, 10, 28, 10, 27, 49, 585572))

In [None]:
    '''
    
    
    # Step 1: Perform Vector Search in Qdrant
    qdrant_results = qdrant_client.search(
        collection_name=qdrant_index,
        query_vector=query_embedding,
        limit=top_k
    )

    # Extract Qdrant document IDs and vector scores
    qdrant_docs = [
        {
            "id": result.payload["id"],
            "vector_score": result.score
        } for result in qdrant_results
    ]
    qdrant_ids = [doc["id"] for doc in qdrant_docs]

    # Step 2: Perform Keyword Search in MeiliSearch with Filter on Qdrant Results
    meili_results = meili_client.index(meili_index).search(
        prompt,
        {
            "filter": f"id IN [{', '.join(map(str, qdrant_ids))}]",
            "limit": top_k
        }
    )

    # Extract MeiliSearch document IDs and keyword scores
    meili_docs = [
        {
            "id": hit["id"],
            "content": hit,  # Retrieve the full document for final output
            "keyword_score": hit["_rankingScore"]  # MeiliSearch-specific score field
        } for hit in meili_results["hits"]
    ]

    # Step 3: Merge and Score Results Using Weighted Scoring
    # Create a dictionary for combined scoring, merging both Qdrant and MeiliSearch results
    combined_results = {}
    for doc in qdrant_docs:
        combined_results[doc["id"]] = {
            "vector_score": doc["vector_score"],
            "keyword_score": 0,  # Will be updated if present in MeiliSearch
            "content": None
        }
    for doc in meili_docs:
        if doc["id"] in combined_results:
            combined_results[doc["id"]]["keyword_score"] = doc["keyword_score"]
            combined_results[doc["id"]]["content"] = doc["content"]

    # Calculate the final score using the weighted sum of vector and keyword scores
    for doc_id, scores in combined_results.items():
        scores["combined_score"] = alpha * scores["vector_score"] + (1 - alpha) * scores["keyword_score"]

    # Step 4: Sort by Combined Score and Return Results
    sorted_results = sorted(combined_results.values(), key=lambda x: x["combined_score"], reverse=True)

    # Return the top results with their content
    return [doc["content"] for doc in sorted_results if doc["content"]]
    '''

In [106]:
index = meili_client.index('test')

In [107]:
index.add_documents(keyword_documents)

TaskInfo(task_uid=7, index_uid='test', status='enqueued', type='documentAdditionOrUpdate', enqueued_at=datetime.datetime(2024, 10, 25, 12, 4, 54, 993824))

In [108]:
meili_client.index('test').delete()

TaskInfo(task_uid=8, index_uid='test', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 25, 12, 10, 17, 391692))

In [18]:
meili_client.index('llm-rag-workflows-keywords').delete()

TaskInfo(task_uid=23, index_uid='llm-rag-workflows-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 28, 9, 4, 12, 919528))

In [19]:
meili_client.index('llm-rag-code-keywords').delete()

TaskInfo(task_uid=24, index_uid='llm-rag-code-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 28, 9, 4, 15, 88227))