## Document Functions

In [1]:
import requests
import json
import time

def get_document_data(
    document_url: str,
    document_type: str
) -> any:
    data = None
    response = requests.get(
        url = document_url
    )
    if response.status_code == 200:
        if document_type == 'text':
            data = response.text
        if document_type == 'json':
            data = json.loads(response.text)
        # handle html later
    return data

def scrape_documents(
    url_list: any,
    timeout: int
) -> any:
    documents = []

    text_files = [
        'py',
        'md',
        'yaml',
        'sh'
    ]

    json_files = [
        'ipynb'
    ]
    index = 0
    for url in url_list:
        document = {
            'name': '',
            'data': ''
        }
        url_split = url.split('/')
        if 'github' in url_split[2]:
            if 'raw' in url_split[2]:
                file_end = url_split[-1].split('.')[-1]
                document['name'] = url_split[-1]
                if file_end in text_files:
                    document['data'] = get_document_data(
                        document_url = url,
                        document_type = 'text' 
                    )
                if file_end in json_files:
                    document['data'] = get_document_data(
                        document_url = url,
                        document_type = 'json' 
                    )
        documents.append(document)
        index = index + 1
        if index < len(url_list):
            time.sleep(timeout)
    return documents

# Scraping

In [2]:
wanted_urls = [
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/experiments/article/cloud-hpc/Cloud-HPC-FMNIST-Experiment.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/frontend/functions/platforms/redis.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/deployment/monitoring/kustomization.yaml',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/deployment/production/stack.yaml'
]

In [2]:
wanted_urls = [
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/deployment/production/stack.yaml'
]

In [3]:
scraped_documents = scrape_documents(
    url_list = wanted_urls,
    timeout = 5
)

## Parsing Functions

In [4]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import re

def tree_extract_imports(
    node: any, 
    code_text: str
) -> any:
    imports = []
    if node.type == 'import_statement' or node.type == 'import_from_statement':
        start_byte = node.start_byte
        end_byte = node.end_byte
        imports.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        imports.extend(tree_extract_imports(child, code_text))
    return imports

def tree_extract_dependencies(
    node: any, 
    code_text: str
) -> any:
    dependencies = []
    for child in node.children:
        if child.type == 'call':
            dependency_name = child.child_by_field_name('function').text.decode('utf8')
            dependencies.append(dependency_name)
        dependencies.extend(tree_extract_dependencies(child, code_text))
    return dependencies

def tree_extract_code_and_dependencies(
    node: any,
    code_text: str
) -> any:
    codes = []
    if not node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name')
        if name is None:
            code = code_text[start_byte:end_byte].decode('utf8')
            if not 'def' in code:
                dependencies = tree_extract_dependencies(node, code_text)
                codes.append({
                    'name': 'global',
                    'code': code,
                    'dependencies': dependencies
                })
    return codes

def tree_extract_functions_and_dependencies(
    node: any, 
    code_text: str
) -> any:
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name').text.decode('utf8')
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = tree_extract_dependencies(node, code_text)
        functions.append({
            'name': name,
            'code': code,
            'dependencies': dependencies
        })
    for child in node.children:
        functions.extend(tree_extract_functions_and_dependencies(child, code_text))
    return functions

def tree_get_used_imports(
    general_imports: any,
    function_dependencies: any
) -> any:
    parsed_imports = {}
    for code_import in general_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in function_dependencies:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

def tree_get_used_functions(
    general_functions: any,
    function_dependencies: any
): 
    used_functions = []
    for related_function_name in function_dependencies:
        for function in general_functions:
            if function['name'] == related_function_name:
                used_functions.append('from ice import ' + function['name'])
    return used_functions

def tree_create_code_document(
    code_imports: any,
    code_functions: any,
    function_item: any
) -> any:
    used_imports = tree_get_used_imports(
        general_imports = code_imports,
        function_dependencies = function_item['dependencies']
    )

    used_functions = tree_get_used_functions(
        general_functions = code_functions,
        function_dependencies = function_item['dependencies']
    )
    
    document = {
        'imports': used_imports,
        'functions': used_functions,
        'name': function_item['name'],
        'dependencies': function_item['dependencies'],
        'code': function_item['code']
    }
    
    return document
     
def tree_format_code_document(
    code_document: any
) -> any:
    formatted_document = ''
    for doc_import in code_document['imports']:
        formatted_document += doc_import + '\n'

    for doc_functions in code_document['functions']:
        formatted_document += doc_functions + '\n'

    if 0 < len(code_document['dependencies']):
        formatted_document += 'code dependencies\n'

        for doc_dependency in code_document['dependencies']:
            formatted_document += doc_dependency + '\n'

    if code_document['name'] == 'global':
        formatted_document += code_document['name'] + ' code\n'
    else:
        formatted_document += 'function ' + code_document['name'] + ' code\n'
    
    for line in code_document['code'].splitlines():
        if not bool(line.strip()):
            continue
        doc_code = re.sub(r'#.*','', line)
        if not bool(doc_code.strip()):
            continue
        formatted_document += doc_code + '\n'    
    return formatted_document

def tree_create_python_code_and_function_documents(
    code_document: any
):
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
   
    tree = parser.parse(
        bytes(
            code_document,
            "utf8"
        )
    )

    root_node = tree.root_node
    code_imports = tree_extract_imports(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_global = tree_extract_code_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_functions = tree_extract_functions_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )
    
    initial_documents = []
    for item in code_global:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    for item in code_functions:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    formatted_documents = []
    seen_functions = []
    for document in initial_documents:
        if not document['name'] == 'global':
            if document['name'] in seen_functions:
                continue
        
        formatted_document = tree_format_code_document(
            code_document = document
        )

        formatted_documents.append(formatted_document)
        seen_functions.append(document['name'])
    return formatted_documents

## Document Functions

In [5]:
import nbformat
from bs4 import BeautifulSoup
import markdown

def extract_jupyter_notebook_markdown_and_code(
    notebook_document: any
): 
    notebook_documents = {
        'markdown': [],
        'code': []
    }

    notebook = nbformat.from_dict(notebook_document)

    index = 1
    for cell in notebook.cells:
        if cell.cell_type == 'markdown':
            notebook_documents['markdown'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
        if cell.cell_type == 'code':
            notebook_documents['code'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
    
    return notebook_documents
    
def parse_markdown_into_text(
    markdown_text: any
) -> any:
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, features='html.parser')
    text = soup.get_text()
    code_block_pattern = re.compile(r"```")
    text = re.sub(code_block_pattern, '', text)
    text = text.rstrip('\n')
    text = text.replace('\nsh', '\n')
    text = text.replace('\nbash', '\n')
    return text

def create_python_documents(
    python_document: any
): 
    joined_code = ''.join(python_document)
    block_code_documents = tree_create_python_code_and_function_documents(
        code_document = joined_code
    )

    code_documents = []
    seen_function_names = []
    code_doc_index = 0
    for code_doc in block_code_documents:
        row_split = code_doc.split('\n')
        for row in row_split:
            if 'function' in row and 'code' in row:
                function_name = row.split(' ')[1]
                if not function_name in seen_function_names:
                    seen_function_names.append(function_name)
                else:
                    del block_code_documents[code_doc_index]
        code_doc_index += 1

    if 0 < len(block_code_documents):
        index = 1
        for code_doc in block_code_documents:
            code_documents.append({
                'index': index,
                'data': code_doc
            })
            index += 1
        
    formatted_documents = {
        'code': code_documents
    }
    return formatted_documents

def create_notebook_documents(
    notebook_document: any
):
    notebook_documents = extract_jupyter_notebook_markdown_and_code(
        notebook_document = notebook_document
    )

    markdown_documents = []
    for block in notebook_documents['markdown']:
        joined_text = ''.join(block['data'])
        markdown_text = parse_markdown_into_text(
            markdown_text = joined_text
        )
        markdown_documents.append({
            'index': block['id'],
            'data': markdown_text
        })
        
    code_documents = []
    seen_function_names = []
    for block in notebook_documents['code']:
        joined_code = ''.join(block['data'])
        block_code_documents = tree_create_python_code_and_function_documents(
            code_document = joined_code
        )

        code_doc_index = 0
        for code_doc in block_code_documents:
            row_split = code_doc.split('\n')
            for row in row_split:
                if 'function' in row and 'code' in row:
                    function_name = row.split(' ')[1]
                    if not function_name in seen_function_names:
                        seen_function_names.append(function_name)
                    else:
                        del block_code_documents[code_doc_index]
            code_doc_index += 1
        
        if 0 < len(block_code_documents):
            sub_indexes = False
            if 1 < len(block_code_documents):
                sub_indexes = True
            index = 1
            for code_doc in block_code_documents:
                if sub_indexes:
                    code_documents.append({
                        'sub-index': index, 
                        'index': block['id'],
                        'data': code_doc
                    })
                else:
                    code_documents.append({ 
                        'index': block['id'],
                        'data': code_doc
                    })
                index += 1
            
    formatted_documents = {
        'markdown': markdown_documents,
        'code': code_documents
    }
    
    return formatted_documents

## Mongo Functions

In [6]:
from pymongo import MongoClient as mc

def mongo_is_client(
    storage_client: any
) -> any:
    return isinstance(storage_client, mc.Connection)

def mongo_setup_client(
    username: str,
    password: str,
    address: str,
    port: str
) -> any:
    connection_prefix = 'mongodb://(username):(password)@(address):(port)/'
    connection_address = connection_prefix.replace('(username)', username)
    connection_address = connection_address.replace('(password)', password)
    connection_address = connection_address.replace('(address)', address)
    connection_address = connection_address.replace('(port)', port)
    mongo_client = mc(
        host = connection_address
    )
    return mongo_client

def mongo_get_database(
    mongo_client: any,
    database_name: str
) -> any:
    try:
        database = mongo_client[database_name]
        return database
    except Exception as e:
        return None

def mongo_check_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database_exists = database_name in mongo_client.list_database_names()
        return database_exists
    except Exception as e:
        return False

def mongo_list_databases(
    mongo_client: any
) -> any:
    try:
        databases = mongo_client.list_database_names()
        return databases
    except Exception as e:
        return []

def mongo_remove_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        mongo_client.drop_database(database_name)
        return True
    except Exception as e:
        return False

def mongo_get_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collection = database[collection_name]
        return collection
    except Exception as e:
        return None
    
def mongo_check_collection(
    mongo_client: any, 
    database_name: any, 
    collection_name: any
) -> bool:
    try:
        database = mongo_client[database_name]
        collection_exists = collection_name in database.list_collection_names()
        return collection_exists
    except Exception as e:
        return False

def mongo_update_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any, 
    update_query: any
) -> any:
    try:
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_many(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_list_collections(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collections = database.list_collection_names()
        return collections
    except Exception as e:
        return []

def mongo_remove_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try: 
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        database.drop_collection(collection_name)
        return True
    except Exception as e:
        return False

def mongo_create_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    document: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.insert_one(document)
        return result
    except Exception as e:
        return None

def mongo_get_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
):
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        document = collection.find_one(filter_query)
        return document
    except Exception as e:
        return None 

def mongo_list_documents(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any,
    sorting_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        documents = list(collection.find(filter_query).sort(sorting_query))
        return documents
    except Exception as e:
        return []

def mongo_update_document(
    mongo_client: any, 
    database_name: any, 
    collection_name: any, 
    filter_query: any, 
    update_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_one(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_remove_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
) -> bool:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.delete_one(filter_query)
        return result
    except Exception as e:
        return None

# Storage

In [7]:
def store_scraped_documents(
    mongo_client: any,
    database_prefix: str,
    documents: any
):
    for document in documents:
        file_name = document['name']
        file_data = document['data']
        
        formatted_documents = {}
        if '.ipynb' in file_name:
            formatted_documents = create_notebook_documents(
                notebook_document = file_data
            )
            document_database_name = database_prefix + '-workflows'
        if '.py' in file_name:
            formatted_documents = create_python_documents(
                python_document = file_data
            )
            document_database_name = database_prefix + '-code'
        
        for doc_type, doc_data in formatted_documents.items():
            for document in doc_data:
                document_data = document['data']
                document_index = document['index']
                document_sub_index = 0

                if 'sub-index' in document:
                    document_sub_index = document['sub-index']
                
                result = mongo_create_document(
                    mongo_client = mongo_client,
                    database_name = document_database_name,
                    collection_name = file_name,
                    document = {
                        'index': int(document_index),
                        'sub-index': int(document_sub_index),
                        'type': doc_type,
                        'data': document_data
                    }
                )

In [8]:
mongo_client = mongo_setup_client(
    username = 'mongo123',
    password = 'mongo456',
    address = '127.0.0.1',
    port = '27017'
)

In [9]:
store_scraped_documents(
    mongo_client = mongo_client,
    database_prefix = 'llm-rag',
    documents = scraped_documents
)

## LangChain Functions

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

def langchain_generate_code_document_chunks(
    language: any,
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter.from_language(
        language = language,
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def langchain_generate_document_chunk_embeddings(
    model_name: str,
    document_chunks: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    chunk_embeddings = embedding_model.embed_documents(
        texts = document_chunks
    )
    return chunk_embeddings

## Qdrant Functions

In [57]:
from qdrant_client import QdrantClient as qc
from qdrant_client import models

def qdrant_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, qc.Connection)
    except Exception as e:
        return False

def qdrant_setup_client(
    api_key: str,
    address: str, 
    port: str
) -> any:
    try:
        qdrant_client = qc(
            host = address,
            port = int(port),
            api_key = api_key,
            https = False
        ) 
        return qdrant_client
    except Exception as e:
        return None

def qdrant_create_collection(
    qdrant_client: any, 
    collection_name: str,
    configuration: any
) -> any:
    try:
        result = qdrant_client.create_collection(
            collection_name = collection_name,
            vectors_config = configuration
        )
        return result
    except Exception as e:
        return None

def qdrant_get_collection(
    qdrant_client: any, 
    collection_name: str
) -> any:
    try:
        collection = qdrant_client.get_collection(
            collection_name = collection_name
        )
        return collection
    except Exception as e:
        return None

def qdrant_list_collections(
    qdrant_client: any
) -> any:
    try:
        collections = qdrant_client.get_collections()
        collection_list = []
        for description in collections.collections:
            collection_list.append(description.name)
        return collection_list
    except Exception as e:
        return []

def qdrant_remove_collection(
    qdrant_client: any, 
    collection_name: str
) -> bool:
    try:
        qdrant_client.delete_collection(collection_name)
        return True
    except Exception as e:
        return False

def qdrant_upsert_points(
    qdrant_client: qc, 
    collection_name: str,
    points: any
) -> any:
    try:
        results = qdrant_client.upsert(
            collection_name = collection_name, 
            points = points
        )
        return results
    except Exception as e:
        print(e)
        return None

def qdrant_search_data(
    qdrant_client: qc,  
    collection_name: str,
    scroll_filter: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.scroll(
            collection_name = collection_name,
            scroll_filter = scroll_filter,
            limit = limit
        )
        return hits
    except Exception as e:
        print(e)
        return []

def qdrant_search_vectors(
    qdrant_client: qc,  
    collection_name: str,
    query_vector: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            limit = limit
        )
        return hits
    except Exception as e:
        return []

def qdrant_remove_vectors(
    qdrant_client: qc,  
    collection_name: str, 
    vectors: str
) -> bool:
    try:
        results = qdrant_client.delete_vectors(
            collection_name = collection_name,
            vectors = vectors
        )
        return results
    except Exception as e:
        print(f"Error removing document: {e}")
        return None


# Vector Embeddings

In [13]:
database_prefix = 'llm-rag'
storage_structure = {}
database_list = mongo_list_databases(
    mongo_client = mongo_client
)
for database in database_list:
    if database_prefix in database:
        collection_list = mongo_list_collections(
            mongo_client = mongo_client,
            database_name = database
        )
        storage_structure[database] = collection_list

In [14]:
storage_structure

{'llm-rag-code': ['celery.py'], 'llm-rag-workflows': ['demo-pipeline.ipynb']}

In [15]:
from pymongo import ASCENDING, DESCENDING

storage_documents = {}
for database, collections in storage_structure.items():
    if not database in storage_documents:
        storage_documents[database] = {}
    for collection in collections:
        collection_documents = mongo_list_documents(
            mongo_client = mongo_client,
            database_name = database,
            collection_name = collection,
            filter_query = {},
            sorting_query = [
                ('index', ASCENDING),
                ('sub-index', ASCENDING)
            ]
        )
        storage_documents[database][collection] = collection_documents

In [16]:
def langchain_generate_code_document_chunks(
    language: any,
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter.from_language(
        language = language,
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def lanchain_generate_text_document_chunks(
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def langchain_generate_document_chunk_embeddings(
    model_name: str,
    document_chunks: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    chunk_embeddings = embedding_model.embed_documents(
        texts = document_chunks
    )
    return chunk_embeddings

In [17]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

def generate_document_vector_packet(
    document: any,
    configuration: any,
) -> any:
    document_type = document['type']
    used_configuration = configuration[document_type]
    
    document_chunks = []
    if document_type == 'code':
        document_chunks = langchain_generate_code_document_chunks(
            language = Language.PYTHON,
            chunk_size = used_configuration['chunk-size'],
            chunk_overlap = used_configuration['chunk-overlap'],
            document = document['data']
        )
    if document_type == 'markdown':
        document_chunks = lanchain_generate_text_document_chunks(
            chunk_size = used_configuration['chunk-size'],
            chunk_overlap = used_configuration['chunk-overlap'],
            document = document['data']
        )
        
    vector_embedding = langchain_generate_document_chunk_embeddings(
        model_name = used_configuration['model-name'],
        document_chunks = document_chunks
    )

    packet = {
        'chunks': document_chunks,
        'embeddings': vector_embedding
    }
    
    return packet

In [114]:
example_code_document = storage_documents['llm-rag-code']['celery.py'][0]

In [115]:
example_text_document = storage_documents['llm-rag-workflows']['demo-pipeline.ipynb'][0]

In [118]:
example_code_vector_embedding = generate_document_vector_embedding(
    document = example_code_document,
    chunk_size = 50,
    chunk_overlap = 0,
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)

  from tqdm.autonotebook import tqdm, trange


In [119]:
example_text_vector_embedding = generate_document_vector_embedding(
    document = example_text_document,
    chunk_size = 50,
    chunk_overlap = 0,
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)



In [18]:
qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '6333'
)



In [37]:
import uuid
vector_id = '671b31b2f12238512f2cafdd' + '-1'
vector_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, vector_id)
print(str(vector_uuid))

bcf76e9e-3863-5e32-89b0-3a23cb3e80b0


In [55]:
existing_chunk = qdrant_search_data(
    qdrant_client = qdrant_client, 
    collection_name = 'llm-rag-workflows-embeddings',
    query_filter = {
        "must": [
            {
                "key": "chunk_hash", 
                "match": {
                    "value": '3c177a2a09134a89280e93580f24d2d9'
                }
            }
        ]
    },
    limit = 5
)

TypeError: qdrant_search_data() got an unexpected keyword argument 'query_filter'

In [63]:
existing_chunks = qdrant_search_data(
    qdrant_client = qdrant_client,
    collection_name = 'llm-rag-workflows-embeddings',
    scroll_filter = models.Filter(
        must = [
            models.FieldCondition(
                key='chunk_hash',
                match = models.MatchValue(value="3c177a2a09134a89280e93580f24d2d9")
            )
        ]
    ),
    limit = 5
)
len(existing_chunks[0])

4

In [73]:
sample_vector = [-0.0008806156,0.07525727,-0.039082013,0.016676342,0.019496996,-0.0183225,0.10499067,0.007775761,-0.009872934,-0.03422199,-0.052432895,-0.0108636115,0.078131415,-0.0073763705,-0.024622837,0.0040045134,0.029577108,-0.060306843,-0.023875061,-0.05675382,0.02992669,0.037502546,0.008587845,0.1237185,-0.08706975,0.011235719,0.034271877,0.008478244,-0.02619887,0.004774774,0.018081691,-0.062485863,-0.01694757,-0.017412446,0.09253343,0.034455594,0.037253767,-0.07208341,0.057675894,-0.061799947,-0.018354114,-0.046897687,-0.023256231,0.003918217,-0.011575811,0.04200087,-0.05075061,-0.042651433,0.06726092,0.010764077,-0.047839403,-0.032385066,0.0016082315,-0.027480885,0.050767735,0.014470795,-0.01207069,0.002131377,-0.029967006,0.016777797,0.04471435,-0.06823574,0.045421507,-0.026454143,-0.009379231,-0.0102507975,-0.086166084,-0.06292877,-0.02551088,-0.055481803,-0.01607503,0.056653056,-0.00003854536,0.015639594,0.01604736,-0.08630768,0.020671349,-0.0911355,-0.030291177,-0.020271132,-0.07643146,-0.028326187,-0.040162094,0.006570532,0.11752413,-0.08517199,-0.027548267,-0.000120679266,0.06321616,-0.07607285,-0.047586102,-0.05989854,-0.004406365,0.031803537,0.0628275,-0.019911483,0.04521456,0.028922873,-0.041679494,0.040077474,-0.032429293,-0.034646623,-0.019596737,0.017813457,-0.017751787,0.053845163,-0.017280385,0.028979478,-0.051279325,-0.09619954,-0.0799265,-0.00690425,0.0343595,0.06598981,0.012640191,-0.03366004,0.012655348,0.11719685,0.0028017166,-0.023670634,0.14810836,0.008867465,-0.020808151,0.02270183,-0.08633888,-0.05448335,0.06953763,-4.0176204e-33,0.04469456,-0.011172801,0.0124054905,0.022486169,0.011005086,-0.008940501,-0.036023315,0.06895857,0.03385835,0.023042355,-0.036956806,-0.036285393,-0.019124702,0.037292037,-0.016623108,0.01791072,-0.0021501917,0.06982369,-0.0078103733,0.011628215,-0.025783157,-0.014041182,-0.094312415,0.024015969,0.04719983,0.0202716,0.0059256833,-0.023461312,-0.032074198,0.0030213667,-0.05630584,0.016784912,-0.041350484,-0.010510119,-0.032780495,0.10691323,0.015510786,-0.022785041,-0.003027085,-0.01625067,0.06831153,0.07440746,0.023756888,-0.018152632,0.012086324,-0.038340032,0.0063405824,-0.00086272124,0.0042086556,-0.025579445,-0.0050925,-0.024872497,0.026971173,-0.05092251,0.00025128145,-0.08662072,0.0539211,0.053984,0.040535644,-0.068476476,0.026984764,-0.06850057,-0.051462118,0.042254355,0.03951172,0.055016063,0.060717493,0.06802623,0.10957967,0.02805403,-0.06449128,-0.022178398,-0.00060761836,0.024709838,-0.016398955,-0.013924856,-0.003000167,-0.12039671,-0.099169396,0.065107524,-0.042745173,-0.0132074375,-0.109735966,-0.079504326,-0.013722966,0.07654049,-0.037829973,0.003370593,-0.050290912,0.03221142,-0.058589935,0.04097386,-0.04631355,0.03644445,-0.057122815,5.1131777e-34,-0.036972407,0.00331556,-0.05163985,-0.08187577,0.08633707,-0.025570264,0.15140791,-0.07599442,-0.09741537,0.041227363,-0.0020351557,-0.15771613,-0.06736124,0.041812494,0.12581694,0.03375963,-0.030657541,0.04516917,-0.061553475,0.018996006,-0.12033124,0.14018124,-0.019660208,0.008262802,0.058510397,-0.005143964,-0.0030865462,-0.018254856,-0.0015348716,-0.0881829,-0.00154272,-0.077873126,-0.08744284,0.09301118,0.0075481553,0.012195563,-0.044611398,-0.066665746,-0.038523663,-0.0030063111,0.075059116,0.009463261,0.02251678,0.13548115,-0.0037547234,0.045695756,0.022946576,-0.00007905387,-0.099864155,0.037794977,0.0062554525,-0.03725619,-0.064578496,-0.023896668,0.07721146,-0.10511356,-0.0059987954,-0.070528,-0.036697168,-0.022170186,0.054968473,0.06914329,-0.019348279,0.07726558,0.022413678,-0.047802236,-0.04092273,0.014742518,0.06048497,-0.0032601384,0.0014216868,0.024004899,0.030322358,0.02016151,0.0041131,-0.021624237,-0.03957565,0.07552214,-0.06119625,0.01242295,-0.005579958,-0.05356618,0.0481288,-0.015847042,-0.1067542,-0.0038143944,0.098899946,0.10055497,-0.1070229,-0.015711667,0.050163187,-0.015227208,0.0012801245,0.019454584,-0.014298509,-1.6927226e-8,0.021207215,0.09786308,-0.008242949,-0.0599221,-0.011646559,-0.01923452,-0.037953626,-0.003184353,-0.0009942062,-0.028584054,0.024394933,-0.10110541,0.058031734,0.045483343,0.0056083733,-0.030457597,-0.06479304,0.121118,0.04755318,0.0004189592,0.042921025,0.02015465,-0.06455849,-0.058625087,0.045303024,-0.034165505,-0.046124157,0.05671656,0.051498257,0.047485758,0.02200334,0.018044686,-0.020871876,0.050238602,0.07088494,-0.03550859,0.03834522,0.027851177,0.0061599715,-0.0051157954,0.029486923,0.057594728,0.02301489,0.033285838,0.0062656887,-0.010056981,-0.031857565,-0.029003937,0.039308313,0.006076029,-0.06574017,-0.053721618,0.076442,0.0115394825,-0.011749002,0.028453192,0.09319499,-0.033753525,-0.061961908,0.0005717238,0.08093922,0.09229381,-0.013735305,0.020243453]

In [81]:
vectors = qdrant_search_vectors(
    qdrant_client = qdrant_client,
    collection_name = 'llm-rag-workflows-embeddings',
    query_vector = sample_vector,
    limit = 10
)
#print(vectors)
for case in vectors:
    print(case.score)
    print(repr(case.payload['chunk']))

0.7621589
'run_id=train_task.outputs["run_id"],'
0.6500319
'preprocess_task ='
0.64411044
'train_task = train('
0.6295434
'run_id = run.info.run_id'
0.5800556
')\n        inference_task = inference('
0.5717762
'run_id)'
0.5699499
'test_set=preprocess_task.outputs["test_set"],'
0.55575526
'train_set=preprocess_task.outputs["train_set"],'
0.5521839
'):\n    pull_task = pull_data(url=url)'
0.5379623
'deploy_model_task = deploy_model('


In [65]:
from qdrant_client import QdrantClient, models
qdrant_client.scroll(
    collection_name = 'llm-rag-workflows-embeddings',
    scroll_filter = models.Filter(
        must = [
            models.FieldCondition(
                key='chunk_hash',
                match = models.MatchValue(value="3c177a2a09134a89280e93580f24d2d9")
            )
        ]
    ),
    limit = 5
)

([Record(id='005732c9-4ce0-5230-bb85-96de0b7b6c1c', payload={'database': 'llm-rag-workflows', 'collection': 'demo-pipeline.ipynb', 'id': '671b31b2f12238512f2cafd1', 'chunk': 'if len(resp.history) == 0:', 'chunk_hash': '3c177a2a09134a89280e93580f24d2d9'}, vector=None, shard_key=None, order_value=None),
  Record(id='b7416704-cf40-521d-865d-a4ba79fc56d4', payload={'database': 'llm-rag-workflows', 'collection': 'demo-pipeline.ipynb', 'id': '671b31b2f12238512f2cafd1', 'chunk': 'if len(resp.history) == 0:', 'chunk_hash': '3c177a2a09134a89280e93580f24d2d9'}, vector=None, shard_key=None, order_value=None)],
 None)

In [46]:
existing_chunk

[]

In [120]:
import hashlib
import numpy as np
import uuid
import re
from qdrant_client.models import VectorParams, Distance
from qdrant_client.models import PointStruct

# for preventing duplicates
def format_chunk(
    document_chunk: any
) -> any:
    chunk = re.sub(r'[^\w\s]', '', document_chunk)
    chunk = re.sub(r'\s+', ' ', chunk) 
    chunk = chunk.strip()
    chunk = chunk.lower()
    # This helps to remove unique hashes for duplicates such as:
    # task_id = task_id )
    # task_id = task_id 
    # task_id = task_id )
    return chunk

def generate_chunk_hash(
    document_chunk: any
) -> any:
    cleaned_chunk = format_chunk(
        document_chunk = document_chunk
    )
    return hashlib.md5(cleaned_chunk.encode('utf-8')).hexdigest()

vector_configuration = {
    'code': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    },
    'markdown': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    }
}

# 
for database, collections in storage_documents.items():
    vector_collection_name = database + '-embeddings'
    for collection, documents in collections.items():
        for document in documents:
            document_id = str(document['_id'])
            document_type = document['type']
            
            vector_packet = generate_document_vector_packet(
                document = document,
                configuration = vector_configuration
            )
            
            document_chunks = vector_packet['chunks']
            document_embeddings = vector_packet['embeddings']
            if 0 < len(document_embeddings):
                vector_collections = qdrant_list_collections(
                    qdrant_client = qdrant_client
                )

                if not vector_collection_name in vector_collections:
                    vector_collection_configuration = VectorParams(
                          size = len(document_embeddings[0]), 
                          distance = Distance.COSINE
                    )
                    collection_created = qdrant_create_collection(
                        qdrant_client = qdrant_client,
                        collection_name = vector_collection_name,
                        configuration = vector_collection_configuration
                    )

                vector_points = []
                vector_index = 0
                added_hashes = []
                for chunk in document_chunks:
                    vector_id = document_id + '-' + str(vector_index + 1)
                    vector_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, vector_id))

                    chunk_hash = generate_chunk_hash(
                        document_chunk = chunk
                    )
                   
                    existing_chunks = qdrant_search_data(
                        qdrant_client = qdrant_client,
                        collection_name = vector_collection_name,
                        scroll_filter = models.Filter(
                            must = [
                                models.FieldCondition(
                                    key = 'chunk_hash',
                                    match = models.MatchValue(
                                        value = chunk_hash
                                    )
                                )
                            ]
                        ),
                        limit = 1
                    )
                    # Removes duplicates
                    if len(existing_chunks[0]) == 0:
                        if not chunk_hash in added_hashes:
                            given_vector = document_embeddings[vector_index]

                            chunk_point = PointStruct(
                                id = vector_uuid, 
                                vector = given_vector,
                                payload = {
                                    'database': database,
                                    'collection': collection,
                                    'document': document_id,
                                    'type': document_type,
                                    'chunk': chunk,
                                    'chunk_hash': chunk_hash
                                }
                            )
                            added_hashes.append(chunk_hash)
                            vector_points.append(chunk_point)
                    vector_index += 1

                if 0 < len(vector_points):
                    points_stored = qdrant_upsert_points(
                        qdrant_client = qdrant_client, 
                        collection_name = vector_collection_name,
                        points = vector_points
                    )



## NLTK Functions

In [89]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt_tab')
nltk.download('stopwords')

def get_document_keywords(
    document: any
):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    text = document.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 1]
    
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = list(dict.fromkeys(tokens))

    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Meili Functions

In [87]:
import meilisearch as ms

def meili_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, ms.Connection)
    except Exception as e:
        print(e)
        return False

def meili_setup_client(
    host: str, 
    api_key: str
) -> any:
    try:
        meili_client = ms.Client(
            url = host, 
            api_key = api_key
        )
        return meili_client 
    except Exception as e:
        print(e)
        return None

def meili_get_index( 
    meili_client: any, 
    index_name: str
) -> any:
    try:
        index = meili_client.index(
            uid = index_name
        )
        return index
    except Exception as e:
        print(e)
        return None
    
def meili_check_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        meili_client.get_index(
            uid = index_name
        )
        return True
    except Exception as e:
        print(e)
        return False
    
def meili_remove_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        response = meili_client.index(
            index_name = index_name
        ).delete()
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_list_indexes(
    meili_client: any
) -> bool:
    try:
        indexes = meili_client.get_indexes()
        return indexes
    except Exception as e:
        print(e)
        return None

def meili_add_documents(
    meili_client: any, 
    index_name: str, 
    documents: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.add_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_search_documents(
    meili_client: any, 
    index_name: str, 
    query: any, 
    options: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.search(
            query = query, 
            options = options
        )
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_update_documents(
    meili_client, 
    index_name, 
    documents
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.update_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_delete_documents(
    meili_client: any, 
    index_name: str, 
    ids: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.delete_documents(
            document_ids = ids
        )
        return response
    except Exception as e:
        print(e)
        return None

## Keyword Generation

In [94]:
meili_client = meili_setup_client(
    host = 'http://127.0.0.1:7700', 
    api_key = 'meili_key'
)

In [119]:
for database, collections in storage_documents.items():
    keyword_collection_name = database + '-keywords'
    keyword_documents = []
    for collection, documents in collections.items():
        document_index = 0
        for document in documents:
            document_id = str(document['_id'])
            document_data = document['data']
            document_type = document['type']
            
            document_keywords = get_document_keywords(
                document = document_data
            )
            
            keyword_id = document_id + '-' + str(document_index + 1)
            keyword_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, keyword_id))

            payload = {
                'id': keyword_uuid,
                'database': database,
                'collection': collection,
                'document': document_id,
                'type': document_type,
                'keywords': document_keywords
            }

            keyword_documents.append(payload)
        
    stored = meili_add_documents(
        meili_client = meili_client,
        index_name = keyword_collection_name,
        documents = keyword_documents
    )

In [112]:
keyword_documents

[{'id': 'dd06dcd8-c9c5-51bd-9d78-d8f5725af93c',
  'database': 'llm-rag-workflows',
  'collection': 'demo-pipeline.ipynb',
  'document_id': '671b31b1f12238512f2cafbc',
  'type': 'markdown',
  'keywords': ['demo', 'kfp', 'pipelin']},
 {'id': '41fa29aa-7fce-5ef7-8bd0-4a2a6428e5ad',
  'database': 'llm-rag-workflows',
  'collection': 'demo-pipeline.ipynb',
  'document_id': '671b31b2f12238512f2cafbd',
  'type': 'markdown',
  'keywords': ['instal', 'requir']},
 {'id': '2fe312e6-e4cf-5a33-9c02-c4e99ff582bc',
  'database': 'llm-rag-workflows',
  'collection': 'demo-pipeline.ipynb',
  'document_id': '671b31b2f12238512f2cafcf',
  'type': 'code',
  'keywords': ['global', 'code', 'bash', 'pip', 'instal', 'kfp~=1.8.14']},
 {'id': '30731e7b-17c4-51b7-8359-d8c47a0fb041',
  'database': 'llm-rag-workflows',
  'collection': 'demo-pipeline.ipynb',
  'document_id': '671b31b2f12238512f2cafbe',
  'type': 'markdown',
  'keywords': ['import']},
 {'id': '7ac09ad2-7723-5857-a392-124513aa7dfc',
  'database': 'llm

In [106]:
index = meili_client.index('test')

In [107]:
index.add_documents(keyword_documents)

TaskInfo(task_uid=7, index_uid='test', status='enqueued', type='documentAdditionOrUpdate', enqueued_at=datetime.datetime(2024, 10, 25, 12, 4, 54, 993824))

In [108]:
meili_client.index('test').delete()

TaskInfo(task_uid=8, index_uid='test', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 25, 12, 10, 17, 391692))

In [116]:
meili_client.index('llm-rag-workflows-keywords').delete()

TaskInfo(task_uid=19, index_uid='llm-rag-workflows-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 25, 12, 16, 26, 461341))

In [117]:
meili_client.index('llm-rag-code-keywords').delete()

TaskInfo(task_uid=20, index_uid='llm-rag-code-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 10, 25, 12, 16, 28, 891786))