## General Functions

In [5]:
import re
# Created and works
def set_formatted_user(
    user: str   
) -> any:
    return re.sub(r'[^a-z0-9]+', '-', user)

## SWIFT Functions

In [6]:
from decouple import Config,RepositoryEnv

from keystoneauth1 import loading, session
from keystoneauth1.identity import v3
from keystoneclient.v3 import client as keystone_client

import swiftclient as sc
import pickle

In [7]:
# Works
def is_swift_client(
    storage_client: any
) -> any:
    return isinstance(storage_client, sc.Connection)
# Works
def swift_setup_client(
    pre_auth_url: str,
    pre_auth_token: str,
    user_domain_name: str,
    project_domain_name: str,
    project_name: str,
    auth_version: str
) -> any:
    swift_client = sc.Connection(
        preauthurl = pre_auth_url,
        preauthtoken = pre_auth_token,
        os_options = {
            'user_domain_name': user_domain_name,
            'project_domain_name': project_domain_name,
            'project_name': project_name
        },
        auth_version = auth_version
    )
    return swift_client
# Works
def swift_create_bucket(
    swift_client: any,
    bucket_name: str
) -> bool:
    try:
        swift_client.put_container(
            container = bucket_name
        )
        return True
    except Exception as e:
        return False
# Works
def swift_check_bucket(
    swift_client: any,
    bucket_name:str
) -> any:
    try:
        bucket_info = swift_client.get_container(
            container = bucket_name
        )
        bucket_metadata = bucket_info[0]
        list_of_objects = bucket_info[1]
        return {'metadata': bucket_metadata, 'objects': list_of_objects}
    except Exception as e:
        return {} 
# Refactored
def swift_delete_bucket(
    swift_client: any,
    bucket_name: str
) -> bool:
    try:
        swift_client.delete_container(
            container = bucket_name
        )
        return True
    except Exception as e:
        return False
# Created
def swift_list_buckets(
    swift_client: any
) -> any:
    try:
        account_buckets = swift_client.get_account()[1]
        buckets = {}
        for bucket in account_buckets:
            bucket_name = bucket['name']
            bucket_count = bucket['count']
            bucket_size = bucket['bytes']
            buckets[bucket_name] = {
                'amount': bucket_count,
                'size': bucket_size
            }
        return buckets
    except Exception as e:
        return {}
# Works
def swift_create_object(
    swift_client: any,
    bucket_name: str, 
    object_path: str, 
    object_data: any,
    object_metadata: any
) -> bool: 
    # This should be updated to handle 5 GB objects
    # It also should handle metadata
    try:
        swift_client.put_object(
            container = bucket_name,
            obj = object_path,
            contents = object_data,
            headers = object_metadata
        )
        return True
    except Exception as e:
        return False
# Works
def swift_check_object(
    swift_client: any,
    bucket_name: str, 
    object_path: str
) -> any: 
    try:
        object_metadata = swift_client.head_object(
            container = bucket_name,
            obj = object_path
        )       
        return object_metadata
    except Exception as e:
        return {} 
# Refactored
def swift_get_object(
    swift_client:any,
    bucket_name: str,
    object_path: str
) -> any:
    # This should handle metadata
    try:
        response = swift_client.get_object(
            container = bucket_name,
            obj = object_path 
        )
        object_info = response[0]
        object_data = response[1]
        return {'data': object_data, 'info': object_info}
    except Exception as e:
        return {}     
# Refactored   
def swift_remove_object(
    swift_client: any,
    bucket_name: str, 
    object_path: str
) -> bool: 
    try:
        swift_client.delete_object(
            container = bucket_name, 
            obj = object_path
        )
        return True
    except Exception as e:
        return False
# Works
def swift_update_object(
    swift_client: any,
    bucket_name: str, 
    object_path: str, 
    object_data: any,
    object_metadata: any
) -> bool:  
    remove = swift_remove_object(
        swift_client = swift_client, 
        bucket_name = bucket_name, 
        object_path = object_path
    )
    if not remove:
        return False
    create = swift_create_object(
        swift_client = swift_client, 
        bucket_name = bucket_name, 
        object_path = object_path, 
        object_data = object_data,
        object_metadata = object_metadata
    )
    return create
# Works
def swift_create_or_update_object(
    swift_client: any,
    bucket_name: str, 
    object_path: str, 
    object_data: any,
    object_metadata: any
) -> any:
    bucket_info = swift_check_bucket(
        swift_client = swift_client, 
        bucket_name = bucket_name
    )
    
    if len(bucket_info) == 0:
        creation_status = swift_create_bucket(
            swift_client = swift_client, 
            bucket_name = bucket_name
        )
        if not creation_status:
            return False
    
    object_info = swift_check_object(
        swift_client = swift_client, 
        bucket_name = bucket_name, 
        object_path = object_path
    )
    
    if len(object_info) == 0:
        return swift_create_object(
            swift_client = swift_client, 
            bucket_name = bucket_name, 
            object_path = object_path, 
            object_data = object_data,
            object_metadata = object_metadata
        )
    else:
        return swift_update_object(
            swift_client = swift_client, 
            bucket_name = bucket_name, 
            object_path = object_path, 
            object_data = object_data,
            object_metadata = object_metadata
        )

## Storage Functions

In [8]:
# 3-2-2

# Refactored and Works
def set_encoded_metadata(
    used_client: str,
    object_metadata: any
) -> any:
    encoded_metadata = {}
    if used_client == 'swift':
        key_initial = 'x-object-meta'
        for key, value in object_metadata.items():
            encoded_key = key_initial + '-' + key
            if isinstance(value, list):
                encoded_metadata[encoded_key] = 'list=' + ','.join(map(str, value))
                continue
            encoded_metadata[encoded_key] = str(value)
    return encoded_metadata
# Refactored and works
def get_general_metadata(
    used_client: str,
    object_metadata: any
) -> any:
    general_metadata = {}
    if used_client == 'swift':
        key_initial = 'x-object-meta'
        for key, value in object_metadata.items():
            if not key_initial == key[:len(key_initial)]:
                general_metadata[key] = value
    return general_metadata
# Refactored and works
def get_decoded_metadata(
    used_client: str,
    object_metadata: any
) -> any: 
    decoded_metadata = {}
    if used_client == 'swift':
        key_initial = 'x-object-meta'
        for key, value in object_metadata.items():
            if key_initial == key[:len(key_initial)]:
                decoded_key = key[len(key_initial) + 1:]
                if 'list=' in value:
                    string_integers = value.split('=')[1]
                    values = string_integers.split(',')
                    if len(values) == 1 and values[0] == '':
                        decoded_metadata[decoded_key] = []
                    else:
                        try:
                            decoded_metadata[decoded_key] = list(map(int, values))
                        except:
                            decoded_metadata[decoded_key] = list(map(str, values))
                    continue
                if value.isnumeric():
                    decoded_metadata[decoded_key] = int(value)
                    continue
                decoded_metadata[decoded_key] = value
    return decoded_metadata
# Refactored and works
def set_bucket_names(
    storage_parameters: any
) -> any:
    storage_names = []
    bucket_prefix = storage_parameters['bucket-prefix']
    ice_id = storage_parameters['ice-id']
    user = storage_parameters['user']
    storage_names.append(bucket_prefix + '-forwarder-' + ice_id)
    storage_names.append(bucket_prefix + '-preprocessor-' + ice_id)
    storage_names.append(bucket_prefix + '-submitter-' + ice_id + '-' + set_formatted_user(user = user))
    storage_names.append(bucket_prefix + '-pipeline-' + ice_id + '-' + set_formatted_user(user = user))
    storage_names.append(bucket_prefix + '-experiment-' + ice_id + '-' + set_formatted_user(user = user))
    return storage_names
# Refactored and works
def setup_storage_client(
    storage_parameters: any
) -> any:
    storage_client = None
    if storage_parameters['used-client'] == 'swift':
        storage_client = swift_setup_client(
            pre_auth_url = storage_parameters['pre-auth-url'],
            pre_auth_token = storage_parameters['pre-auth-token'],
            user_domain_name = storage_parameters['user-domain-name'],
            project_domain_name = storage_parameters['project-domain-name'],
            project_name = storage_parameters['project-name'],
            auth_version = storage_parameters['auth-version']
        )
    return storage_client
# Refactored and works
def check_object_metadata(
    storage_client: any,
    bucket_name: str, 
    object_path: str
) -> any: 
    object_metadata = {
        'general-meta': {},
        'custom-meta': {}
    }
    if is_swift_client(storage_client = storage_client):
        all_metadata = swift_check_object(
           swift_client = storage_client,
           bucket_name = bucket_name,
           object_path = object_path
        ) 

        general_metadata = {}
        custom_metadata = {}
        if not len(all_metadata) == 0:
            general_metadata = get_general_metadata(
                used_client = 'swift',
                object_metadata = all_metadata
            )
            custom_metadata = get_decoded_metadata(
                used_client = 'swift',
                object_metadata = all_metadata
            )

        object_metadata['general-meta'] = general_metadata
        object_metadata['custom-meta'] = custom_metadata

    return object_metadata
# Refactored and works
def get_object_content(
    storage_client: any,
    bucket_name: str,
    object_path: str
) -> any:
    object_content = {}
    if is_swift_client(storage_client = storage_client):
        fetched_object = swift_get_object(
            swift_client = storage_client,
            bucket_name = bucket_name,
            object_path = object_path
        )
        object_content['data'] = pickle.loads(fetched_object['data'])
        object_content['general-meta'] = get_general_metadata(
            used_client = 'swift',
            object_metadata = fetched_object['info']
        )
        object_content['custom-meta'] = get_decoded_metadata(
            used_client = 'swift',
            object_metadata = fetched_object['info']
        )
    return object_content
# Refactored    
def remove_object(
    storage_client: any,
    bucket_name: str, 
    object_path: str
) -> bool: 
    removed = False
    if is_swift_client(storage_client = storage_client):
        removed = swift_remove_object(
            swift_client = storage_client,
            bucket_name = bucket_name,
            object_path = object_path
        )
    return removed
# Refactored and works
def create_or_update_object(
    storage_client: any,
    bucket_name: str, 
    object_path: str, 
    object_data: any,
    object_metadata: any
) -> any:
    success = False
    if is_swift_client(storage_client = storage_client):
        formatted_data = pickle.dumps(object_data)
        formatted_metadata = set_encoded_metadata(
            used_client = 'swift',
            object_metadata = object_metadata
        )

        success = swift_create_or_update_object(
            swift_client = storage_client,
            bucket_name = bucket_name,
            object_path = object_path,
            object_data = formatted_data,
            object_metadata = formatted_metadata
        )
    return success
# Created and works
def format_bucket_metadata(
    used_client: str,
    bucket_metadata: any
) -> any:
    formatted_metadata = {}
    if used_client == 'swift':
        relevant_values = {
            'x-container-object-count': 'object-count',
            'x-container-bytes-used-actual': 'used-bytes',
            'last-modified': 'date',
            'content-type': 'type'
        }
        formatted_metadata = {}
        for key,value in bucket_metadata.items():
            if key in relevant_values:
                formatted_key = relevant_values[key]
                formatted_metadata[formatted_key] = value
    return formatted_metadata
# Created and works
def format_bucket_objects(
    used_client: str,
    bucket_objects: any
) -> any:
    formatted_objects = {}
    if used_client == 'swift':
        for bucket_object in bucket_objects:
            formatted_object_metadata = {
                'hash': 'id',
                'bytes': 'used-bytes',
                'last_modified': 'date'
            }
            object_key = None
            object_metadata = {}
            for key, value in bucket_object.items():
                if key == 'name':
                    object_key = value
                if key in formatted_object_metadata:
                    formatted_key = formatted_object_metadata[key]
                    object_metadata[formatted_key] = value
            formatted_objects[object_key] = object_metadata
    return formatted_objects
# Created and works
def format_bucket_info(
    used_client: str,
    bucket_info: any
) -> any:
    bucket_metadata = {}
    bucket_objects = {}
    if used_client == 'swift':
        bucket_metadata = format_bucket_metadata(
            used_client = used_client,
            bucket_metadata = bucket_info['metadata']
        )
        bucket_objects = format_bucket_objects(
            used_client = used_client,
            bucket_objects = bucket_info['objects']
        )
    return {'metadata': bucket_metadata, 'objects': bucket_objects} 
# Created and works
def get_bucket_info(
    storage_client: any,
    bucket_name: str
) -> any:
    bucket_info = {}
    if is_swift_client(storage_client = storage_client):
        unformatted_bucket_info = swift_check_bucket(
            swift_client = storage_client,
            bucket_name = bucket_name
        )
        bucket_info = format_bucket_info(
            used_client = 'swift',
            bucket_info = unformatted_bucket_info
        )
    return bucket_info
# Created and works
def format_container_info(
    used_client: str,
    container_info: any
) -> any:
    formatted_container_info = {}
    if used_client == 'swift':
        for bucket in container_info:
            bucket_name = bucket['name']
            bucket_count = bucket['count']
            bucket_size = bucket['bytes']
            formatted_container_info[bucket_name] = {
                'amount': bucket_count,
                'size': bucket_size
            }
    return formatted_container_info
# Created and works
def get_container_info( 
    storage_client: any
) -> any:
    container_info = {}
    if is_swift_client(storage_client = storage_client):
        unformatted_container_info = swift_list_buckets(
            swift_client = storage_client 
        )
        container_info = format_container_info(
            used_client = 'swift',
            container_info = unformatted_container_info
        )
    return container_info

## Object Functions

In [9]:
# 4-2-3

# Created and works
def set_object_path(
    object_name: str,
    path_replacers: any,
    path_names: any
):
    object_paths = {
        'root': 'name',
        'code': 'CODE/name',
        'slurm': 'CODE/SLURM/name',
        'ray': 'CODE/RAY/name',
        'data': 'DATA/name',
        'artifacts': 'ARTIFACTS/name',
        'time': 'TIMES/name'
    }

    i = 0
    path_split = object_paths[object_name].split('/')
    for name in path_split:
        if name in path_replacers:
            replacer = path_replacers[name]
            if 0 < len(replacer):
                path_split[i] = replacer
        i = i + 1
    
    if not len(path_names) == 0:
        path_split.extend(path_names)

    object_path = '/'.join(path_split)
    print('Used object path:' + str(object_path))
    return object_path
# created and works
def setup_storage(
    storage_parameters: any
) -> any:
    storage_client = setup_storage_client(
        storage_parameters = storage_parameters
    ) 
    
    storage_name = set_bucket_names(
       storage_parameters = storage_parameters
    )
    
    return storage_client, storage_name
# Created and works
def check_object(
    storage_client: any,
    bucket_name: str,
    object_name: str,
    path_replacers: any,
    path_names: any
) -> bool:
    object_path = set_object_path(
        object_name = object_name,
        path_replacers = path_replacers,
        path_names = path_names
    )
    # Consider making these functions object storage agnostic
    object_metadata = check_object_metadata(
        storage_client = storage_client,
        bucket_name = bucket_name,
        object_path = object_path
    )
    object_metadata['path'] = object_path
    return object_metadata
# Created and works
def get_object(
    storage_client: any,
    bucket_name: str,
    object_name: str,
    path_replacers: any,
    path_names: any
) -> any:
    checked_object = check_object(
        storage_client = storage_client,
        bucket_name = bucket_name,
        object_name = object_name,
        path_replacers = path_replacers,
        path_names = path_names
    )

    object_data = None
    if not len(checked_object['general-meta']) == 0:
        # Consider making these functions object storage agnostic
        object_data = get_object_content(
            storage_client = storage_client,
            bucket_name = bucket_name,
            object_path = checked_object['path']
        )

    return object_data
# Created and Works
def set_object(
    storage_client: any,
    bucket_name: str,
    object_name: str,
    path_replacers: any,
    path_names: any,
    overwrite: bool,
    object_data: any,
    object_metadata: any
):
    checked_object = check_object(
        storage_client = storage_client,
        bucket_name = bucket_name,
        object_name = object_name,
        path_replacers = path_replacers,
        path_names = path_names
    )
    
    perform = True
    if not len(checked_object['general-meta']) == 0 and not overwrite:
        perform = False
    
    if perform:
        create_or_update_object(
            storage_client = storage_client,
            bucket_name = bucket_name,
            object_path = checked_object['path'],
            object_data = object_data,
            object_metadata = object_metadata
        )
# Created and works
def check_bucket(
    storage_client: any,
    bucket_name: str
) -> any:
    return get_bucket_info(
        storage_client = storage_client,
        bucket_name = bucket_name
    )
# Created and works
def check_buckets(
    storage_client: any
) -> any:
    return get_container_info( 
        storage_client = storage_client
    )

## Metadata Function

In [10]:
# Created and works
def general_object_metadata():
    general_object_metadata = {
        'version': 1
    }
    return general_object_metadata

## Access Functions

In [11]:
def get_storage_parameters(
    env_path: str,
    auth_url: str,
    pre_auth_url: str,
    auth_version: str,
    bucket_prefix: str,
    ice_id: str,
    user: str
):
    env_config = Config(RepositoryEnv(env_path))
    swift_auth_url = auth_url
    swift_user = env_config.get('CSC_USERNAME')
    swift_key = env_config.get('CSC_PASSWORD')
    swift_project_name = env_config.get('CSC_PROJECT_NAME')
    swift_user_domain_name = env_config.get('CSC_USER_DOMAIN_NAME')
    swift_project_domain_name = env_config.get('CSC_USER_DOMAIN_NAME')

    loader = loading.get_plugin_loader('password')
    auth = loader.load_from_options(
        auth_url = swift_auth_url,
        username = swift_user,
        password = swift_key,
        project_name = swift_project_name,
        user_domain_name = swift_user_domain_name,
        project_domain_name = swift_project_domain_name
    )

    keystone_session = session.Session(
        auth = auth
    )
    swift_token = keystone_session.get_token()

    swift_pre_auth_url = pre_auth_url
    swift_auth_version = auth_version

    storage_parameters = {
        'bucket-prefix': bucket_prefix,
        'ice-id': ice_id,
        'user': user,
        'used-client': 'swift',
        'pre-auth-url': str(swift_pre_auth_url),
        'pre-auth-token': str(swift_token),
        'user-domain-name': str(swift_user_domain_name),
        'project-domain-name': str(swift_project_domain_name),
        'project-name': str(swift_project_name),
        'auth-version': str(swift_auth_version)
    }

    return storage_parameters

## Gaining Storage Access

In [12]:
env_absolute_path = '/home/sfniila/.ssh/.env'

In [13]:
storage_parameters = get_storage_parameters(
    env_path = env_absolute_path,
    auth_url = 'https://pouta.csc.fi:5001/v3',
    pre_auth_url = 'https://a3s.fi:443/swift/v1/AUTH_6698ff90e6704a74930c33d6b66f1b5b',
    auth_version = '3',
    bucket_prefix = 'integration',
    ice_id = 's0-c0-u1',
    user = 'user@example.com'
)

storage_client, storage_names = setup_storage(
    storage_parameters = storage_parameters
)

In [10]:
storage_names

['integration-forwarder-s0-c0-u1',
 'integration-preprocessor-s0-c0-u1',
 'integration-submitter-s0-c0-u1-user-example-com',
 'integration-pipeline-s0-c0-u1-user-example-com',
 'integration-experiment-s0-c0-u1-user-example-com']

## Tree Functions

In [103]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import re

def tree_extract_imports(
    node: any, 
    code_text: str
) -> any:
    imports = []
    if node.type == 'import_statement' or node.type == 'import_from_statement':
        start_byte = node.start_byte
        end_byte = node.end_byte
        imports.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        imports.extend(tree_extract_imports(child, code_text))
    return imports

def tree_extract_dependencies(
    node: any, 
    code_text: str
) -> any:
    dependencies = []
    for child in node.children:
        if child.type == 'call':
            dependency_name = child.child_by_field_name('function').text.decode('utf8')
            dependencies.append(dependency_name)
        dependencies.extend(tree_extract_dependencies(child, code_text))
    return dependencies

def tree_extract_code_and_dependencies(
    node: any,
    code_text: str
) -> any:
    code = {}
    if not node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = tree_extract_dependencies(node, code_text)
        code = {
            'name': 'global',
            'code': code,
            'dependencies': dependencies
        }
    return code

def tree_extract_functions_and_dependencies(
    node: any, 
    code_text: str
) -> any:
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name').text.decode('utf8')
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = tree_extract_dependencies(node, code_text)
        functions.append({
            'name': name,
            'code': code,
            'dependencies': dependencies
        })
    for child in node.children:
        functions.extend(tree_extract_functions_and_dependencies(child, code_text))
    return functions

def tree_get_used_imports(
    general_imports: any,
    function_dependencies: any
) -> any:
    parsed_imports = {}
    for code_import in general_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in function_dependencies:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

def tree_get_used_functions(
    general_functions: any,
    function_dependencies: any
): 
    used_functions = []
    for related_function_name in function_dependencies:
        for function in general_functions:
            if function['name'] == related_function_name:
                used_functions.append('from ice import ' + function['name'])
    return used_functions

def tree_create_code_document(
    code_imports: any,
    code_functions: any,
    function_item: any
) -> any:
    used_imports = tree_get_used_imports(
        general_imports = code_imports,
        function_dependencies = function_item['dependencies']
    )

    used_functions = tree_get_used_functions(
        general_functions = code_functions,
        function_dependencies = function_item['dependencies']
    )
    
    document = {
        'imports': used_imports,
        'functions': used_functions,
        'name': function_item['name'],
        'dependencies': function_item['dependencies'],
        'code': function_item['code']
    }
    
    return document
     
def tree_format_code_document(
    code_document: any
) -> any:
    formatted_document = ''
    for doc_import in code_document['imports']:
        formatted_document += doc_import + '\n'

    for doc_functions in code_document['functions']:
        formatted_document += doc_functions + '\n'

    formatted_document += 'code dependencies\n'

    for doc_dependency in code_document['dependencies']:
        formatted_document += doc_dependency + '\n'
    
    for line in code_document['code'].splitlines():
        if not bool(line.strip()):
            continue
        doc_code = re.sub(r'#.*','', line)
        if not bool(doc_code.strip()):
            continue
        formatted_document += doc_code + '\n'    
    return formatted_document

def tree_create_python_code_and_function_documents(
    code_document: any
):
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
   
    tree = parser.parse(
        bytes(
            code_document,
            "utf8"
        )
    )

    root_node = tree.root_node
    code_imports = tree_extract_imports(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_global = tree_extract_code_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    #print(code_global)

    code_functions = tree_extract_functions_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )
    
    initial_documents = []

    for item in code_global:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    for item in code_functions:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    formatted_documents = []
    for document in initial_documents:
        formatted_document = tree_format_code_document(
            code_document = document
        )
        formatted_documents.append(formatted_document)
    return formatted_documents


## Mongo Functions

In [2]:
from pymongo import MongoClient as mc

def mongo_is_client(
    storage_client: any
) -> any:
    return isinstance(storage_client, mc.Connection)

def mongo_setup_client(
    username: str,
    password: str,
    address: str,
    port: str
) -> any:
    connection_prefix = 'mongodb://(username):(password)@(address):(port)/'
    connection_address = connection_prefix.replace('(username)', username)
    connection_address = connection_address.replace('(password)', password)
    connection_address = connection_address.replace('(address)', address)
    connection_address = connection_address.replace('(port)', port)
    mongo_client = mc(
        host = connection_address
    )
    return mongo_client

def mongo_get_database(
    mongo_client: any,
    database_name: str
) -> any:
    try:
        database = mongo_client[database_name]
        return database
    except Exception as e:
        return None

def mongo_check_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database_exists = database_name in mongo_client.list_database_names()
        return database_exists
    except Exception as e:
        return False

def mongo_list_databases(
    mongo_client: any
) -> any:
    try:
        databases = mongo_client.list_database_names()
        return databases
    except Exception as e:
        return []

def mongo_remove_database(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        mongo_client.drop_database(database_name)
        return True
    except Exception as e:
        return False

def mongo_get_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collection = database[collection_name]
        return collection
    except Exception as e:
        return None
    
def mongo_check_collection(
    mongo_client: any, 
    database_name: any, 
    collection_name: any
) -> bool:
    try:
        database = mongo_client[database_name]
        collection_exists = collection_name in database.list_collection_names()
        return collection_exists
    except Exception as e:
        return False

def mongo_update_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any, 
    update_query: any
) -> any:
    try:
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_many(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_list_collections(
    mongo_client: any, 
    database_name: str
) -> bool:
    try:
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        collections = database.list_collection_names()
        return collections
    except Exception as e:
        return []

def mongo_remove_collection(
    mongo_client: any, 
    database_name: str, 
    collection_name: str
) -> bool:
    try: 
        database = mongo_get_database(
            mongo_client = mongo_client,
            database_name = database_name
        )
        database.drop_collection(collection_name)
        return True
    except Exception as e:
        return False

def mongo_create_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    document: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.insert_one(document)
        return result
    except Exception as e:
        return None

def mongo_get_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
):
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        document = collection.find_one(filter_query)
        return document
    except Exception as e:
        return None 
    
def mongo_list_documents(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        documents = list(collection.find(filter_query))
        return documents
    except Exception as e:
        return []

def mongo_update_document(
    mongo_client: any, 
    database_name: any, 
    collection_name: any, 
    filter_query: any, 
    update_query: any
) -> any:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.update_one(filter_query, update_query)
        return result
    except Exception as e:
        return None

def mongo_remove_document(
    mongo_client: any, 
    database_name: str, 
    collection_name: str, 
    filter_query: any
) -> bool:
    try: 
        collection = mongo_get_collection(
            mongo_client = mongo_client, 
            database_name = database_name, 
            collection_name = collection_name
        )
        result = collection.delete_one(filter_query)
        return result
    except Exception as e:
        return None

## Qdrant Functions

In [3]:
from qdrant_client import QdrantClient as qc

def qdrant_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, qc.Connection)
    except Exception as e:
        return False

def qdrant_setup_client(
    api_key: str,
    address: str, 
    port: str
) -> any:
    try:
        qdrant_client = qc(
            host = address,
            port = int(port),
            api_key = api_key,
            https = False
        ) 
        return qdrant_client
    except Exception as e:
        return None

def qdrant_create_collection(
    qdrant_client: any, 
    collection_name: str,
    configuration: any
) -> any:
    try:
        result = qdrant_client.create_collection(
            collection_name = collection_name,
            vectors_config = configuration
        )
        return result
    except Exception as e:
        return None

def qdrant_get_collections(
    qdrant_client: any, 
    collection_name: str
) -> any:
    try:
        collection = qdrant_client.get_collection(
            collection_name = collection_name
        )
        return collection
    except Exception as e:
        return None

def qdrant_list_collections(
    qdrant_client: any, 
    database_name: str
) -> any:
    try:
        collections = qdrant_client.get_collections()
        return collections
    except Exception as e:
        return []

def qdrant_remove_collection(
    qdrant_client: any, 
    collection_name: str
) -> bool:
    try:
        qdrant_client.delete_collection(collection_name)
        return True
    except Exception as e:
        return False

def qdrant_upsert_points(
    qdrant_client: qc, 
    collection_name: str,
    points: any
) -> any:
    try:
        results = qdrant_client.upsert(
            collection_name = collection_name, 
            points = points
        )
        return results
    except Exception as e:
        return None

def qdrant_search_vectors(
    qdrant_client: qc,  
    collection_name: str,
    query_vector: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            limit = limit
        )
        return hits
    except Exception as e:
        return []

def qdrant_remove_vectors(
    qdrant_client: qc,  
    collection_name: str, 
    vectors: str
) -> bool:
    try:
        results = qdrant_client.delete_vectors(
            collection_name = collection_name,
            vectors = vectors
        )
        return results
    except Exception as e:
        print(f"Error removing document: {e}")
        return None


## Meili Functions

In [4]:
import meilisearch as ms

def meili_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, ms.Connection)
    except Exception as e:
        print(e)
        return False

def meili_setup_client(
    host: str, 
    api_key: str
) -> any:
    try:
        meili_client = ms.Client(
            url = host, 
            api_key = api_key
        )
        return meili_client 
    except Exception as e:
        print(e)
        return None

def meili_get_index( 
    meili_client: any, 
    index_name: str
) -> any:
    try:
        index = meili_client.index(
            uid = index_name
        )
        return index
    except Exception as e:
        print(e)
        return None
    
def meili_check_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        meili_client.get_index(
            uid = index_name
        )
        return True
    except Exception as e:
        print(e)
        return False
    
def meili_remove_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        response = meili_client.index(
            index_name = index_name
        ).delete()
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_list_indexes(
    meili_client: any
) -> bool:
    try:
        indexes = meili_client.get_indexes()
        return indexes
    except Exception as e:
        print(e)
        return None

def meili_add_documents(
    meili_client: any, 
    index_name: str, 
    documents: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.add_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_search_documents(
    meili_client: any, 
    index_name: str, 
    query: any, 
    options: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.search(
            query = query, 
            options = options
        )
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_update_documents(
    meili_client, 
    index_name, 
    documents
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.update_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_delete_documents(
    meili_client: any, 
    index_name: str, 
    ids: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.delete_documents(
            document_ids = ids
        )
        return response
    except Exception as e:
        print(e)
        return None

## Langchain functions

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

def langchain_generate_code_document_chunks(
    language: any,
    chunk_size: int,
    chunk_overlap: int,
    document: any
) -> any:
    splitter = RecursiveCharacterTextSplitter.from_language(
        language = language,
        chunk_size = chunk_size, 
        chunk_overlap = chunk_overlap
    )

    document_chunks = splitter.create_documents([document])
    document_chunks = [doc.page_content for doc in document_chunks]
    return document_chunks

def langchain_generate_document_chunk_embeddings(
    model_name: str,
    document_chunks: any
) -> any:
    embedding_model = HuggingFaceEmbeddings(
        model_name = model_name
    )
    chunk_embeddings = embedding_model.embed_documents(
        texts = document_chunks
    )
    return chunk_embeddings

## NLTK Functions

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt_tab')
nltk.download('stopwords')

def get_code_document_keywords(
    code_document: any
):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    code_text = code_document.lower()
    tokens = word_tokenize(code_text)
    tokens = [token for token in tokens if len(token) > 1]
    
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = list(dict.fromkeys(tokens))

    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Document Functions

In [59]:
from bs4 import BeautifulSoup
import markdown
import nbformat
import requests
import requests
import time
import json
import re

def get_document(
    document_url: str,
    document_type: str
) -> any:
    document = None
    response = requests.get(
        url = document_url
    )
    if response.status_code == 200:
        if document_type == 'text':
            document = response.text
        if document_type == 'json':
            document = json.loads(response.text)
        # handle html later
    return document

def scrape_documents(
    url_list: any,
    timeout: int
) -> any:
    documents = []

    text_files = [
        'py',
        'md',
        'yaml',
        'sh'
    ]

    json_files = [
        'ipynb'
    ]
    index = 0
    for url in url_list:
        document = None
        url_split = url.split('/')
        if 'github' in url_split[2]:
            if 'raw' in url_split[2]:
                file_end = url_split[-1].split('.')[-1]
                if file_end in text_files:
                    document = get_document(
                        document_url = url,
                        document_type = 'text' 
                    )
                if file_end in json_files:
                    document = get_document(
                        document_url = url,
                        document_type = 'json' 
                    )
        documents.append(document)
        index = index + 1
        if index < len(url_list):
            time.sleep(timeout)
    return documents

def extract_jupyter_notebook_markdown_and_code(
    notebook_document: any
): 
    notebook_documents = {
        'markdown': [],
        'code': []
    }

    notebook = nbformat.from_dict(notebook_document)

    index = 0
    for cell in notebook.cells:
        if cell.cell_type == 'markdown':
            notebook_documents['markdown'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
        if cell.cell_type == 'code':
            notebook_documents['code'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
    
    return notebook_documents

def parse_markdown_into_text(
    markdown_text: any
) -> any:
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, features='html.parser')
    text = soup.get_text()
    text = text.rstrip('\n')
    return text

In [8]:
wanted_urls = [
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/tutorials/demo_notebooks/demo_pipeline/demo-pipeline.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/experiments/article/cloud-hpc/Cloud-HPC-FMNIST-Experiment.ipynb',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/deployment/monitoring/kustomization.yaml',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/development/GPUs/gpu-125-test.yaml',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/development/LLMs/deployment/compose/cpu-stack.yaml',
    'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/gpu-setup.sh'
]

In [9]:
scraped_documents = scrape_documents(
    url_list = wanted_urls,
    timeout = 5
)

In [136]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import re

def tree_extract_imports(
    node: any, 
    code_text: str
) -> any:
    imports = []
    if node.type == 'import_statement' or node.type == 'import_from_statement':
        start_byte = node.start_byte
        end_byte = node.end_byte
        imports.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        imports.extend(tree_extract_imports(child, code_text))
    return imports

def tree_extract_dependencies(
    node: any, 
    code_text: str
) -> any:
    dependencies = []
    for child in node.children:
        if child.type == 'call':
            dependency_name = child.child_by_field_name('function').text.decode('utf8')
            dependencies.append(dependency_name)
        dependencies.extend(tree_extract_dependencies(child, code_text))
    return dependencies

def tree_extract_code_and_dependencies(
    node: any,
    code_text: str
) -> any:
    codes = []
    if not node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name')
        if name is None:
            code = code_text[start_byte:end_byte].decode('utf8')
            if not 'def' in code:
                dependencies = tree_extract_dependencies(node, code_text)
                codes.append({
                    'name': 'global',
                    'code': code,
                    'dependencies': dependencies
                })
    return codes

def tree_extract_functions_and_dependencies(
    node: any, 
    code_text: str
) -> any:
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name').text.decode('utf8')
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = tree_extract_dependencies(node, code_text)
        functions.append({
            'name': name,
            'code': code,
            'dependencies': dependencies
        })
    for child in node.children:
        functions.extend(tree_extract_functions_and_dependencies(child, code_text))
    return functions

def tree_get_used_imports(
    general_imports: any,
    function_dependencies: any
) -> any:
    parsed_imports = {}
    for code_import in general_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in function_dependencies:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

def tree_get_used_functions(
    general_functions: any,
    function_dependencies: any
): 
    used_functions = []
    for related_function_name in function_dependencies:
        for function in general_functions:
            if function['name'] == related_function_name:
                used_functions.append('from ice import ' + function['name'])
    return used_functions

def tree_create_code_document(
    code_imports: any,
    code_functions: any,
    function_item: any
) -> any:
    used_imports = tree_get_used_imports(
        general_imports = code_imports,
        function_dependencies = function_item['dependencies']
    )

    used_functions = tree_get_used_functions(
        general_functions = code_functions,
        function_dependencies = function_item['dependencies']
    )
    
    document = {
        'imports': used_imports,
        'functions': used_functions,
        'name': function_item['name'],
        'dependencies': function_item['dependencies'],
        'code': function_item['code']
    }
    
    return document
     
def tree_format_code_document(
    code_document: any
) -> any:
    formatted_document = ''
    for doc_import in code_document['imports']:
        formatted_document += doc_import + '\n'

    for doc_functions in code_document['functions']:
        formatted_document += doc_functions + '\n'

    if 0 < len(code_document['dependencies']):
        formatted_document += 'code dependencies\n'

        for doc_dependency in code_document['dependencies']:
            formatted_document += doc_dependency + '\n'

    if code_document['name'] == 'global':
        formatted_document += code_document['name'] + ' code\n'
    else:
        formatted_document += 'function ' + code_document['name'] + ' code\n'
    
    for line in code_document['code'].splitlines():
        if not bool(line.strip()):
            continue
        doc_code = re.sub(r'#.*','', line)
        if not bool(doc_code.strip()):
            continue
        formatted_document += doc_code + '\n'    
    return formatted_document

def tree_create_python_code_and_function_documents(
    code_document: any
):
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser(PY_LANGUAGE)
   
    tree = parser.parse(
        bytes(
            code_document,
            "utf8"
        )
    )

    root_node = tree.root_node
    code_imports = tree_extract_imports(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_global = tree_extract_code_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )

    code_functions = tree_extract_functions_and_dependencies(
        root_node, 
        bytes(
            code_document, 
            'utf8'
        )
    )
    
    initial_documents = []
    for item in code_global:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    for item in code_functions:
        document = tree_create_code_document(
            code_imports = code_imports,
            code_functions = code_functions,
            function_item = item
        )  
        initial_documents.append(document)

    formatted_documents = []
    seen_functions = []
    for document in initial_documents:
        if not document['name'] == 'global':
            if document['name'] in seen_functions:
                continue
        
        print(document['name'])
        formatted_document = tree_format_code_document(
            code_document = document
        )

        formatted_documents.append(formatted_document)
        seen_functions.append(document['name'])
    return formatted_documents


In [162]:
def create_notebook_documents(
    notebook_document: any
):
    notebook_documents = extract_jupyter_notebook_markdown_and_code(
        notebook_document = notebook_document
    )

    markdown_document = ''
    markdown_ids = []
    for block in notebook_documents['markdown']:
        joined_text = ''.join(block['data'])
        markdown_text = parse_markdown_into_text(
            markdown_document = joined_text
        )
        markdown_document += markdown_text + '\n\n'
        markdown_ids.append(block['id'])
    
    code_documents = []
    code_ids = []
    for block in notebook_documents['code']:
        joined_code = ''.join(block['data'])
        block_code_documents = tree_create_python_code_and_function_documents(
            code_document = joined_code
        )
        
        if len(block_code_documents) == 0:
            #print(block['data'])
            code_documents.extend(block['data'])
        else:
            code_documents.extend(block_code_documents)
        code_ids.append(block['id'])
    
    formatted_documents = {
        'markdown': markdown_document,
        'code': code_documents
    }
    
    return formatted_documents

In [179]:
from bs4 import BeautifulSoup
import markdown
import nbformat
import requests
import requests
import time
import json
import re

def get_document(
    document_url: str,
    document_type: str
) -> any:
    document = None
    response = requests.get(
        url = document_url
    )
    if response.status_code == 200:
        if document_type == 'text':
            document = response.text
        if document_type == 'json':
            document = json.loads(response.text)
        # handle html later
    return document

def scrape_documents(
    url_list: any,
    timeout: int
) -> any:
    documents = []

    text_files = [
        'py',
        'md',
        'yaml',
        'sh'
    ]

    json_files = [
        'ipynb'
    ]
    index = 0
    for url in url_list:
        document = None
        url_split = url.split('/')
        if 'github' in url_split[2]:
            if 'raw' in url_split[2]:
                file_end = url_split[-1].split('.')[-1]
                if file_end in text_files:
                    document = get_document(
                        document_url = url,
                        document_type = 'text' 
                    )
                if file_end in json_files:
                    document = get_document(
                        document_url = url,
                        document_type = 'json' 
                    )
        documents.append(document)
        index = index + 1
        if index < len(url_list):
            time.sleep(timeout)
    return documents

def extract_jupyter_notebook_markdown_and_code(
    notebook_document: any
): 
    notebook_documents = {
        'markdown': [],
        'code': []
    }

    notebook = nbformat.from_dict(notebook_document)

    index = 0
    for cell in notebook.cells:
        if cell.cell_type == 'markdown':
            notebook_documents['markdown'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
        if cell.cell_type == 'code':
            notebook_documents['code'].append({
                'id': index,
                'data': cell.source
            })
            index += 1
    
    return notebook_documents
    
def parse_markdown_into_text(
    markdown_text: any
) -> any:
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, features='html.parser')
    text = soup.get_text()
    text = text.rstrip('\n')
    return text

def create_notebook_documents(
    notebook_document: any
):
    notebook_documents = extract_jupyter_notebook_markdown_and_code(
        notebook_document = notebook_document
    )

    markdown_document = ''
    markdown_ids = []
    for block in notebook_documents['markdown']:
        joined_text = ''.join(block['data'])
        markdown_text = parse_markdown_into_text(
            markdown_text = joined_text
        )
        markdown_document += markdown_text + '\n\n'
        markdown_ids.append(block['id'])
    # There can be duplicate functions
    # These needs to be removed by preprocessing
    code_documents = []
    seen_function_names = []
    code_ids = []
    for block in notebook_documents['code']:
        joined_code = ''.join(block['data'])
        block_code_documents = tree_create_python_code_and_function_documents(
            code_document = joined_code
        )

        code_doc_index = 0
        for code_doc in block_code_documents:
            row_split = code_doc.split('\n')
            for row in row_split:
                if 'function' in row and 'code' in row:
                    function_name = row.split(' ')[1]
                    if not function_name in seen_function_names:
                        seen_function_names.append(function_name)
                    else:
                        print(function_name)
                        del block_code_documents[code_doc_index]
            code_doc_index += 1
        
        if 0 < len(block_code_documents):
            code_documents.extend(block_code_documents)
            code_ids.append(block['id'])
    
    formatted_documents = {
        'markdown': markdown_document,
        'code': code_documents
    }
    
    return formatted_documents

In [180]:
formatted_notebook_documents = create_notebook_documents(
    notebook_document = scraped_documents[0]
)

get_istio_auth_session


In [115]:
print(repr(formatted_notebook_documents['code'][0]))

'code dependencies\n%%bash\npip install kfp~=1.8.14\n'


In [91]:
print(repr(formatted_notebook_documents['code'][4]))



In [181]:
for code in formatted_notebook_documents['code']:
    print(code)
    print('')

global code
%%bash
pip install kfp~=1.8.14


code dependencies
global code
import kfp
import kfp.dsl as dsl
from kfp.aws import use_aws_secret
from kfp.v2.dsl import (
    component,
    Input,
    Output,
    Dataset,
    Metrics,
    Artifact,
    Model
)


import requests
from urllib.parse import urlsplit
import re
code dependencies
requests.Session
s.get
RuntimeError
len
urlsplit
re.search
redirect_url_obj._replace
re.sub
re.search
redirect_url_obj.geturl
s.get
redirect_url_obj.geturl
RuntimeError
redirect_url_obj.geturl
s.post
len
RuntimeError
"; ".join
function get_istio_auth_session code
def get_istio_auth_session(url: str, username: str, password: str) -> dict:
    """
    Determine if the specified URL is secured by Dex and try to obtain a session cookie.
             (we default default to using `staticPasswords` if both are enabled)
    :param url: Kubeflow server URL, including protocol
    :param username: Dex `staticPasswords` or `LDAP` username
    :param password: Dex `

In [12]:
notebook_documents = extract_jupyter_notebook_markdown_and_code(
    notebook_document = scraped_documents[0]
)

In [13]:
notebook_documents

{'markdown': [{'id': 0, 'data': ['# Demo KFP pipeline']},
  {'id': 1, 'data': ['Install requirements:']},
  {'id': 3, 'data': ['Imports:']},
  {'id': 5,
   'data': ['## 1. Connect to client\n',
    '\n',
    "The default way of accessing Kubeflow is via port-forward. This enables you to get started quickly without imposing any requirements on your environment. Run the following to port-forward Istio's Ingress-Gateway to local port `8080`:\n",
    '\n',
    '```sh\n',
    'kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80\n',
    '```']},
  {'id': 8,
   'data': ['## 2. Components\n',
    '\n',
    'There are different ways to define components in KFP. Here, we use the **@component** decorator to define the components as Python function-based components.\n',
    '\n',
    'The **@component** annotation converts the function into a factory function that creates pipeline steps that execute this function. This example also specifies the base container image to run you co

In [15]:
notebook_documents['markdown'][0]['data']

['# Demo KFP pipeline']

In [37]:
parsed_markdown = parse_markdown_into_text(
    markdown_document = notebook_documents['markdown'][4]['data']
)

In [38]:
print(parsed_markdown) 

2. Components
There are different ways to define components in KFP. Here, we use the @component decorator to define the components as Python function-based components.
The @component annotation converts the function into a factory function that creates pipeline steps that execute this function. This example also specifies the base container image to run you component in.


## Document storage

In [37]:
used_document_url = 'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py'
document_content = get_document(
    document_url = used_document_url
)
code_documents = parse_python_code_chunks(
    document_code = document_content
)

In [11]:
mongo_client = mongo_setup_client(
    username = 'mongo123',
    password = 'mongo456',
    address = '127.0.0.1',
    port = '27017'
)

In [53]:
document_database_name = 'llm-rag-code-functions'
document_collection_name = 'celery-py'

In [59]:
created_collection = []
collection_ids = {}
index = 1
for code_document in code_documents:
    result = mongo_create_document(
        mongo_client = mongo_client,
        database_name = document_database_name,
        collection_name = document_collection_name,
        document = {
            'data': code_document
        }
    )
    collection_ids[str(index)] = str(result.inserted_id)
    index = index + 1

In [67]:
collection_ids

{'1': '66d184973fc211d7909f628d',
 '2': '66d184973fc211d7909f628e',
 '3': '66d184973fc211d7909f628f',
 '4': '66d184973fc211d7909f6290',
 '5': '66d184973fc211d7909f6291',
 '6': '66d184973fc211d7909f6292',
 '7': '66d184973fc211d7909f6293',
 '8': '66d184973fc211d7909f6294',
 '9': '66d184973fc211d7909f6295'}

## RAG Preparation

In [68]:
fetched_documents = mongo_list_documents(
    mongo_client = mongo_client,
    database_name = document_database_name,
    collection_name = document_collection_name,
    filter_query = {}
)

In [69]:
example_document_id = str(fetched_documents[0]['_id'])
example_document = fetched_documents[0]['data']

In [70]:
print(example_document_id)

66d184973fc211d7909f628d


In [71]:
print(example_document)

import os
import shutil
code dependencies
os.path.abspath
os.path.exists
shutil.rmtree
os.makedirs
open
def setup_celery_logging():
    log_directory = os.path.abspath('logs')
    if os.path.exists(log_directory):
        shutil.rmtree(log_directory)
    os.makedirs(log_directory, exist_ok=True)
    log_path = log_directory + '/backend.log'
    with open(log_path, 'w') as f:
        pass
    return log_path



## Generating Document Vector Embeddings

In [44]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [45]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language = Language.PYTHON, 
    chunk_size = 50, 
    chunk_overlap = 0
)
document_chunks = python_splitter.create_documents([example_document])
document_chunks = [doc.page_content for doc in document_chunks]

In [46]:
document_chunks

['import os\nimport shutil\ncode dependencies',
 'os.path.abspath\nos.path.exists\nshutil.rmtree',
 'os.makedirs\nopen',
 'def setup_celery_logging():',
 "log_directory = os.path.abspath('logs')",
 'if os.path.exists(log_directory):',
 'shutil.rmtree(log_directory)',
 'os.makedirs(log_directory, exist_ok=True)',
 "log_path = log_directory + '/backend.log'",
 "with open(log_path, 'w') as f:\n        pass",
 'return log_path']

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)



In [48]:
example_embeddings = embedding_model.embed_documents(document_chunks)

In [83]:
len(example_embeddings[0])

384

## Storing Document Vector Embeddings

In [84]:
vector_collection_name = document_database_name + '-vectors'

In [98]:
qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '6333'
)



In [99]:
from qdrant_client.models import VectorParams, Distance

created = qdrant_create_collection(
    qdrant_client = qdrant_client,
    collection_name = vector_collection_name,
    configuration = VectorParams(
          size = len(example_embeddings[0]), 
          distance = Distance.COSINE
    )
)

In [100]:
created

True

In [101]:
import numpy as np
from qdrant_client.models import PointStruct
example_document_points = []
index = 0
for doc_chunk in document_chunks:
    chunk_point = PointStruct(
        id = index, 
        vector = example_embeddings[index],
        payload = {
            'database': document_database_name,
            'collection': document_collection_name,
            'id': example_document_id, 
            'data': doc_chunk
        }
    )
    example_document_points.append(chunk_point)
    index = index + 1

In [102]:
points_stored = qdrant_upsert_points(
    qdrant_client = qdrant_client, 
    collection_name = vector_collection_name,
    points = example_document_points
)

In [103]:
points_stored

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Generating Document Key Words

In [152]:
print(example_document_id)

66d184973fc211d7909f628d


In [104]:
example_document

"import os\nimport shutil\ncode dependencies\nos.path.abspath\nos.path.exists\nshutil.rmtree\nos.makedirs\nopen\ndef setup_celery_logging():\n    log_directory = os.path.abspath('logs')\n    if os.path.exists(log_directory):\n        shutil.rmtree(log_directory)\n    os.makedirs(log_directory, exist_ok=True)\n    log_path = log_directory + '/backend.log'\n    with open(log_path, 'w') as f:\n        pass\n    return log_path\n"

In [119]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sfniila/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [116]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [126]:
# Lower, tokenize and remove '('
text = example_document.lower()
tokens = word_tokenize(example_document)
tokens = [token for token in tokens if len(token) > 1]
print(tokens)

['import', 'os', 'import', 'shutil', 'code', 'dependencies', 'os.path.abspath', 'os.path.exists', 'shutil.rmtree', 'os.makedirs', 'open', 'def', 'setup_celery_logging', 'log_directory', 'os.path.abspath', "'logs", 'if', 'os.path.exists', 'log_directory', 'shutil.rmtree', 'log_directory', 'os.makedirs', 'log_directory', 'exist_ok=True', 'log_path', 'log_directory', "'/backend.log", 'with', 'open', 'log_path', 'as', 'pass', 'return', 'log_path']


In [128]:
# Stem tokens and remove stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]
tokens = [stemmer.stem(token) for token in tokens]
print(tokens)

['import', 'os', 'import', 'shutil', 'code', 'depend', 'os.path.abspath', 'os.path.exist', 'shutil.rmtre', 'os.makedir', 'open', 'def', 'setup_celery_log', 'log_directori', 'os.path.abspath', "'log", 'os.path.exist', 'log_directori', 'shutil.rmtre', 'log_directori', 'os.makedir', 'log_directori', 'exist_ok=tru', 'log_path', 'log_directori', "'/backend.log", 'open', 'log_path', 'pass', 'return', 'log_path']


In [129]:
# Remove duplicates while keeping the order
tokens = list(dict.fromkeys(tokens))
print(tokens)

['import', 'os', 'shutil', 'code', 'depend', 'os.path.abspath', 'os.path.exist', 'shutil.rmtre', 'os.makedir', 'open', 'def', 'setup_celery_log', 'log_directori', "'log", 'exist_ok=tru', 'log_path', "'/backend.log", 'pass', 'return']


In [153]:
document_keywords = [tokens]
keywords_stored = []
for document_tokens in document_keywords:
    keywords_stored.append({
        'database': document_database_name,
        'collection': document_collection_name,
        'id': example_document_id, 
        'data': document_tokens
    })

In [155]:
keywords_stored 

[{'database': 'llm-rag-code-functions',
  'collection': 'celery-py',
  'id': '66d184973fc211d7909f628d',
  'data': ['import',
   'os',
   'shutil',
   'code',
   'depend',
   'os.path.abspath',
   'os.path.exist',
   'shutil.rmtre',
   'os.makedir',
   'open',
   'def',
   'setup_celery_log',
   'log_directori',
   "'log",
   'exist_ok=tru',
   'log_path',
   "'/backend.log",
   'pass',
   'return']}]

## Storing Document Key Words

In [164]:
meili_client = meili_setup_client(
    host = 'http://127.0.0.1:7700', 
    api_key = 'meili_key'
)

In [169]:
keyword_index_name = document_database_name + '-keywords'
#keyword_index_name = keyword_index_name.replace('-', '')

In [170]:
stored = meili_add_documents(
    meili_client = meili_client,
    index_name = keyword_index_name,
    documents = keywords_stored
)

In [171]:
stored

TaskInfo(task_uid=2, index_uid='llm-rag-code-functions-keywords', status='enqueued', type='documentAdditionOrUpdate', enqueued_at=datetime.datetime(2024, 8, 30, 10, 1, 28, 842892))

In [88]:
docs_text[0]

'import os\nimport shutil\ncode dependencies'

In [98]:
hits = qdrant_search_vectors(
    qdrant_client = qdrant_client,  
    collection_name = 'llm-rag-vectors',
    query_vector = example_embeddings[0],
    limit = 20
)

In [99]:
hits

[ScoredPoint(id=1, version=0, score=1.0, payload={'data': 'import os\nimport shutil\ncode dependencies', 'order': 1}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=2, version=0, score=0.32488102, payload={'data': 'os.path.abspath\nos.path.exists\nshutil.rmtree', 'order': 2}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3, version=0, score=0.2667464, payload={'data': 'os.makedirs\nopen', 'order': 3}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=7, version=0, score=0.22600092, payload={'data': 'shutil.rmtree(log_directory)', 'order': 7}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=8, version=0, score=0.20714475, payload={'data': 'os.makedirs(log_directory, exist_ok=True)', 'order': 8}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=6, version=0, score=0.16657859, payload={'data': 'if os.path.exists(log_directory):', 'order': 6}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=4, v

In [89]:
docs_text[-1]

'return log_path'

In [90]:
hits = qdrant_search_vectors(
    qdrant_client = qdrant_client,  
    collection_name = 'llm-rag-vectors',
    query_vector = example_embeddings[-1],
    limit = 3
)

In [91]:
hits

[ScoredPoint(id=11, version=0, score=1.0000001, payload={'data': 'return log_path'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9, version=0, score=0.77037394, payload={'data': "log_path = log_directory + '/backend.log'"}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=10, version=0, score=0.69867, payload={'data': "with open(log_path, 'w') as f:\n        pass"}, vector=None, shard_key=None, order_value=None)]

## Misc

In [4]:
fetched_chunks = mongo_list_documents(
    mongo_client = mongo_client,
    database_name = 'llm-rag-chunks',
    collection_name = 'celery-py',
    filter_query = {}
)

In [5]:
example_chunk = fetched_chunks[0]['data']

In [6]:
print(example_chunk)

import os
import shutil
code dependencies
os.path.abspath
os.path.exists
shutil.rmtree
os.makedirs
open
def setup_celery_logging():
    log_directory = os.path.abspath('logs')
    if os.path.exists(log_directory):
        shutil.rmtree(log_directory)
    os.makedirs(log_directory, exist_ok=True)
    log_path = log_directory + '/backend.log'
    with open(log_path, 'w') as f:
        pass
    return log_path



In [31]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [32]:
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language = Language.PYTHON, 
    chunk_size = 50, 
    chunk_overlap = 0
)
smaller_chunks = python_splitter.create_documents([example_chunk])

In [33]:
smaller_chunks

[Document(page_content='import os\nimport shutil\ncode dependencies'),
 Document(page_content='os.path.abspath\nos.path.exists\nshutil.rmtree'),
 Document(page_content='os.makedirs\nopen'),
 Document(page_content='def setup_celery_logging():'),
 Document(page_content="log_directory = os.path.abspath('logs')"),
 Document(page_content='if os.path.exists(log_directory):'),
 Document(page_content='shutil.rmtree(log_directory)'),
 Document(page_content='os.makedirs(log_directory, exist_ok=True)'),
 Document(page_content="log_path = log_directory + '/backend.log'"),
 Document(page_content="with open(log_path, 'w') as f:\n        pass"),
 Document(page_content='return log_path')]

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

In [34]:
embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)

In [12]:
from qdrant_client import QdrantClient

qdrant_client = QdrantClient(
    host = "127.0.0.1", 
    port = 6333
)

In [14]:
qdrant_client.get_collections()

CollectionsResponse(collections=[])

In [17]:
from qdrant_client.models import VectorParams, Distance

if not qdrant_client.collection_exists("llm-rag-vectors"):
   qdrant_client.create_collection(
      collection_name="llm-rag-vectors",
      vectors_config=VectorParams(
          size=100, 
          distance=Distance.COSINE
      ),
   )

In [18]:
import numpy as np
from qdrant_client.models import PointStruct

In [20]:
vectors = np.random.rand(100, 100)
qdrant_client.upsert(
   collection_name="llm-rag-vectors",
   points=[
      PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={"color": "red", "rand_number": idx % 10}
      )
      for idx, vector in enumerate(vectors)
   ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
query_vector = np.random.rand(100)
hits = qdrant_client.search(
   collection_name="llm-rag-vectors",
   query_vector=query_vector,
   limit=5  # Return 5 closest points
)

In [23]:
hits

[ScoredPoint(id=41, version=0, score=0.81860214, payload={'color': 'red', 'rand_number': 1}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=0, version=0, score=0.8181431, payload={'color': 'red', 'rand_number': 0}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=33, version=0, score=0.81126696, payload={'color': 'red', 'rand_number': 3}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=13, version=0, score=0.8024623, payload={'color': 'red', 'rand_number': 3}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=67, version=0, score=0.7998194, payload={'color': 'red', 'rand_number': 7}, vector=None, shard_key=None, order_value=None)]

In [26]:
from qdrant_client.models import Filter, FieldCondition, Range
hits = qdrant_client.search(
   collection_name="llm-rag-vectors",
   query_vector=query_vector,
   query_filter=Filter(
      must=[  # These conditions are required for search results
            FieldCondition(
               key='rand_number',  # Condition based on values of `rand_number` field.
               range=Range(
                  gte=3  # Select only those results where `rand_number` >= 3
               )
            )
      ]
   ),
   limit=5  # Return 5 closest points
)


In [27]:
hits

[ScoredPoint(id=33, version=0, score=0.81126696, payload={'color': 'red', 'rand_number': 3}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=13, version=0, score=0.8024623, payload={'color': 'red', 'rand_number': 3}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=67, version=0, score=0.7998194, payload={'color': 'red', 'rand_number': 7}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=34, version=0, score=0.79686064, payload={'color': 'red', 'rand_number': 4}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=36, version=0, score=0.7856212, payload={'color': 'red', 'rand_number': 6}, vector=None, shard_key=None, order_value=None)]

In [175]:
import pymongo

In [180]:
document_storage_client = pymongo.MongoClient(
    'mongodb://mongo123:mongo456@127.0.0.1:27017/'
)

In [181]:
print(document_storage_client.list_database_names())

['admin', 'config', 'local']


In [185]:
database = document_storage_client['llm-rag-chunks']

In [186]:
collection = database['celery-py'] 

In [190]:
collection_index = 1
created_collection = []
for code_chunk in code_chunks:
    created_collection.append(
        {'data': code_chunk}
    )
    collection_index += 1

In [191]:
collection.insert_many(created_collection)

InsertManyResult([ObjectId('66d06efb48c12d9454c2bbdc'), ObjectId('66d06efb48c12d9454c2bbdd'), ObjectId('66d06efb48c12d9454c2bbde'), ObjectId('66d06efb48c12d9454c2bbdf'), ObjectId('66d06efb48c12d9454c2bbe0'), ObjectId('66d06efb48c12d9454c2bbe1'), ObjectId('66d06efb48c12d9454c2bbe2'), ObjectId('66d06efb48c12d9454c2bbe3'), ObjectId('66d06efb48c12d9454c2bbe4')], acknowledged=True)

In [193]:
for chunk in collection.find():
    print(chunk['data'])

import os
import shutil
code dependencies
os.path.abspath
os.path.exists
shutil.rmtree
os.makedirs
open
def setup_celery_logging():
    log_directory = os.path.abspath('logs')
    if os.path.exists(log_directory):
        shutil.rmtree(log_directory)
    os.makedirs(log_directory, exist_ok=True)
    log_path = log_directory + '/backend.log'
    with open(log_path, 'w') as f:
        pass
    return log_path

import os
code dependencies
os.path.abspath
open
listed_logs['logs'].append
line.strip
def get_celery_logs(): 
    log_path = os.path.abspath('logs/backend.log')
    listed_logs = {'logs':[]}
    with open(log_path, 'r') as f:
        for line in f:
            listed_logs['logs'].append(line.strip())
    return listed_logs

import os
from celery import Celery
code dependencies
os.environ.get
os.environ.get
os.environ.get
str
str
Celery
def get_celery_instance():
    redis_endpoint = os.environ.get('REDIS_ENDPOINT')
    redis_port = os.environ.get('REDIS_PORT')
    redis_db = os.en

In [81]:
test_url = 'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/applications/article/submitter/backend/functions/platforms/celery.py'
response = requests.get(test_url)
response.raise_for_status()  

In [84]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
import re

PY_LANGUAGE = Language(tspython.language())
parser = Parser(PY_LANGUAGE)
code = response.text
tree = parser.parse(
    bytes(code,"utf8")
)

In [14]:
root_node = tree.root_node

def extract_functions(node, code_text):
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        functions.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        functions.extend(extract_functions(child, code_text))
    return functions

functions = extract_functions(root_node, bytes(code, 'utf8'))

for func in functions:
    print(func)
    print('')

def setup_celery_logging():
    log_directory = os.path.abspath('logs')
    
    if os.path.exists(log_directory):
        shutil.rmtree(log_directory)
    
    os.makedirs(log_directory, exist_ok=True)
    log_path = log_directory + '/backend.log'
    with open(log_path, 'w') as f:
        pass

    return log_path

def get_celery_logs(): 
    log_path = os.path.abspath('logs/backend.log')
    listed_logs = {'logs':[]}
    with open(log_path, 'r') as f:
        for line in f:
            listed_logs['logs'].append(line.strip())
    return listed_logs

def get_celery_instance():
    redis_endpoint = os.environ.get('REDIS_ENDPOINT')
    redis_port = os.environ.get('REDIS_PORT')
    redis_db = os.environ.get('REDIS_DB')
    
    name = 'tasks'
    redis_connection = 'redis://' + redis_endpoint + ':' + str(redis_port) + '/' + str(redis_db)

    celery_app = Celery(
        main = name,
        broker = redis_connection,
        backend = redis_connection
    )

    celery_app.conf.broker_

In [85]:
def extract_imports(node, code_text):
    imports = []
    if node.type == 'import_statement' or node.type == 'import_from_statement':
        start_byte = node.start_byte
        end_byte = node.end_byte
        imports.append(code_text[start_byte:end_byte].decode('utf8'))
    for child in node.children:
        imports.extend(extract_imports(child, code_text))
    return imports

def extract_dependencies(node, code_text):
    dependencies = []
    for child in node.children:
        if child.type == 'call':
            dependency_name = child.child_by_field_name('function').text.decode('utf8')
            dependencies.append(dependency_name)
        dependencies.extend(extract_dependencies(child, code_text))
    return dependencies
    
def extract_functions_and_dependencies(node, code_text):
    functions = []
    if node.type == 'function_definition':
        start_byte = node.start_byte
        end_byte = node.end_byte
        name = node.child_by_field_name('name').text.decode('utf8')
        code = code_text[start_byte:end_byte].decode('utf8')
        dependencies = extract_dependencies(node, code_text)
        functions.append({
            'name': name,
            'code': code,
            'dependencies': dependencies
        })
    for child in node.children:
        functions.extend(extract_functions_and_dependencies(child, code_text))
    return functions

In [101]:
def get_used_imports(
    general_imports: any,
    function_dependencies: any
) -> any:
    parsed_imports = {}
    for code_import in general_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in function_dependencies:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

def get_used_functions(
    general_functions: any,
    function_dependencies: any
): 
    used_functions = []
    for related_function_name in function_dependencies:
        for function in general_functions:
            if function['name'] == related_function_name:
                used_functions.append('from ice import ' + function['name'])
    return used_functions

In [104]:
code_imports = extract_imports(root_node, bytes(code, 'utf8'))
code_functions = extract_functions_and_dependencies(root_node, bytes(code, 'utf8'))

document_chunks = []
for item in code_functions:

    used_imports = get_used_imports(
        general_imports = code_imports,
        function_dependencies = item['dependencies']
    )

    used_functions = get_used_functions(
        general_functions = code_functions,
        function_dependencies = item['dependencies']
    )
    
    document_chunk = {
        'imports': used_imports,
        'functions': used_functions,
        'name': item['name'],
        'dependencies': item['dependencies'],
        'code': item['code']
    }
    
    document_chunks.append(document_chunk)  

In [157]:
for chunk in document_chunks:
    formatted_chunk = ''

    for chunk_import in chunk['imports']:
        formatted_chunk += chunk_import + '\n'

    for chunk_functions in chunk['functions']:
        formatted_chunk += chunk_functions + '\n'

    formatted_chunk += 'code dependencies\n'

    for chunk_dependency in chunk['dependencies']:
        formatted_chunk += chunk_dependency + '\n'
    
    for line in chunk['code'].splitlines():
        if not bool(line.strip()):
            continue
        parsed_code = re.sub(r'#.*','', line)
        if not bool(parsed_code.strip()):
            continue
        formatted_chunk += parsed_code + '\n'
        
    print(formatted_chunk)

import os
import shutil
code dependencies
os.path.abspath
os.path.exists
shutil.rmtree
os.makedirs
open
def setup_celery_logging():
    log_directory = os.path.abspath('logs')
    if os.path.exists(log_directory):
        shutil.rmtree(log_directory)
    os.makedirs(log_directory, exist_ok=True)
    log_path = log_directory + '/backend.log'
    with open(log_path, 'w') as f:
        pass
    return log_path

import os
code dependencies
os.path.abspath
open
listed_logs['logs'].append
line.strip
def get_celery_logs(): 
    log_path = os.path.abspath('logs/backend.log')
    listed_logs = {'logs':[]}
    with open(log_path, 'r') as f:
        for line in f:
            listed_logs['logs'].append(line.strip())
    return listed_logs

import os
from celery import Celery
code dependencies
os.environ.get
os.environ.get
os.environ.get
str
str
Celery
def get_celery_instance():
    redis_endpoint = os.environ.get('REDIS_ENDPOINT')
    redis_port = os.environ.get('REDIS_PORT')
    redis_db = os.en

In [77]:
def get_used_imports(
    code_imports: any,
    relevant_imports: any
) -> any:
    parsed_imports = {}
    for code_import in code_imports:
        import_factors = code_import.split('import')[-1].replace(' ', '')
        import_factors = import_factors.split(',')
    
        for factor in import_factors:
            if not factor in parsed_imports:
                parsed_imports[factor] = code_import.split('import')[0] + 'import ' + factor
            
    relevant_imports = {}
    for dependency in code_functions[0]['dependencies']:
        initial_term = dependency.split('.')[0]
    
        if not initial_term in relevant_imports:
            if initial_term in parsed_imports:
                relevant_imports[initial_term] = parsed_imports[initial_term]
    
    used_imports = []
    for name, code in relevant_imports.items():
        used_imports.append(code)

    return used_imports

In [24]:
test_url = 'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/README.md'
response = requests.get(test_url)
response.raise_for_status()  

In [26]:
print(response.text)

# Cloud-HPC integreated OSS MLOps Platform

Welcome to the OSS MLOps Platform, a comprehensive suite designed to streamline your machine learning operations from experimentation to deployment. 

![logos.png](resources/img/logos.png)

This fork provides documentation, applications and notebooks on how to enable the OSS platform run in a cloud virtual machine to utilize the Ray computing framework run in a supercomputer. 

The utilized and intended use enviroment is the CSC infrastructure ecosystem with the tested platforms being [CPouta](https://docs.csc.fi/cloud/pouta/) cloud platform, [Allas](https://docs.csc.fi/data/Allas/) object storage platform and [Mahti](https://docs.csc.fi/computing/) supercomputer platform.

A more indepth explanation for the implemented thesis code and the initial ideas of the article code are found in a master's thesis ['On Integrating Cloud and High Performance Computing Enviroments in Machine Learning Operations'](https://helda.helsinki.fi/items/8b6cc75b-4

In [13]:
test_url = 'https://raw.githubusercontent.com/K123AsJ0k1/cloud-hpc-oss-mlops-platform/main/experiments/article/cloud-hpc/Cloud-HPC-FMNIST-Experiment.ipynb'
response = requests.get(test_url)
response.raise_for_status()  

In [19]:
notebook_node = nbformat.reads(
    response.text, 
    as_version = 4
)

In [23]:
notebook_cells = notebook_node.cells
for cell in notebook_cells:
    if cell.cell_type == 'markdown':
        markdown_content = ''.join(cell.source)
        print(markdown_content)
    if cell.cell_type == 'code':
        code_content = ''.join(cell.source)
        print(code_content)

# Cloud-HPC FMNIST Experiment

In this notebook we go over the necessery manual actions for submitting a training job in Mahti using CPouta, Allas and Mahti integreated OSS. If you want to try this notebook yourself, you need to already have the following:

- MyCSC account
- Project with billing units
- Access to CPouta
- Suitable network rules
- SSH key for CPouta for local and bridge
- Setup Cloud-HPC OSS
- Access to Allas
- Access to Mahti
- SSH setup to Mahti

To begin, please create a python virtual enviroment, install the packages and open this notebook with the following:

```
python3 -m venv exp_venv
source exp_venv/bin/activate
pip install -r exp_req.txt
jupyter notebook
```

This notebook uses the following packages:
- notebook
- matplotlib
- torch
- torchvision
- torchmetrics
- python-decouple
- keystoneauth1
- python-swiftclient
- kfp~=1.8.14
## Data Analysis
import torch
from torchvision import datasets
import torchvision.transforms as T

image_labels = {
    0: 'Top',
   