# Demonstration RAG Pipeline

```
helm install raycluster kuberay/ray-cluster --version 1.0.0 -f ray-values.yaml

ssh -L 7200:localhost:7200 GPU-cpouta
kubectl port-forward svc/express-service 7200:7200 -n llm-pro

ssh -L 7201:localhost:7201 GPU-cpouta
kubectl port-forward svc/qdrant-service 7201:7201 -n llm-pro

ssh -L 7202:localhost:7202 GPU-cpouta
kubectl port-forward svc/meili-service 7202:7202 -n llm-pro

ssh -L 8265:localhost:8265 GPU-cpouta
kubectl port-forward svc/raycluster-kuberay-head-svc 8265:8265

ssh -L 7101:localhost:7101 GPU-cpouta
kubectl port-forward svc/webui-service 7101:7101 -n llm-inf

ssh -L 9001:localhost:9001 GPU-cpouta
kubectl port-forward svc/mlflow-minio-service 9001:9001 -n mlflow
```

## Ray

In [3]:
from ray.job_submission import JobSubmissionClient
from ray.job_submission import JobStatus
import time as t
import json
import requests

def test_url(
    target_url: str,
    timeout: int
) -> bool:
    try:
        response = requests.head(
            url = target_url, 
            timeout = timeout
        )
        if response.status_code == 200:
            return True
        return False
    except requests.ConnectionError:
        return False

def setup_ray(
    services: any,
    timeout: int
):
    start = t.time()
    ray_client = None
    if 0 < len(services):
        ray_dashboard_url = 'http://' + services['ray-dashboard']
        ray_exists = None
        while t.time() - start <= timeout:
            ray_exists = test_url(
                target_url = ray_dashboard_url,
                timeout = 5
            )
            if ray_exists:
                break
            t.sleep(5)
        if ray_exists:
            ray_client = JobSubmissionClient(
                address = ray_dashboard_url
            )
    return ray_client

def submit_ray_job(
    ray_client: any,
    ray_parameters: any,
    ray_job_file: any,
    working_directory: str,
    ray_job_envs: any,
    ray_job_packages: any
) -> any:
    command = "python " + str(ray_job_file)
    if 0 < len(ray_parameters):
        command = command + " '" + json.dumps(ray_parameters) + "'"
    job_id = ray_client.submit_job(
        entrypoint = command,
        runtime_env = {
            'working_dir': str(working_directory),
            'env_vars': ray_job_envs,
            'pip': ray_job_packages
        }
    )
    return job_id

def wait_ray_job(
    ray_client: any,
    ray_job_id: int, 
    waited_status: any,
    timeout: int
) -> any:
    start = t.time()
    job_status = None
    while t.time() - start <= timeout:
        status = ray_client.get_job_status(ray_job_id)
        print(f"status: {status}")
        if status in waited_status:
            job_status = status
            break
        t.sleep(5)
    job_logs = ray_client.get_job_logs(ray_job_id)
    return job_status, job_logs

def ray_job_handler(
    ray_client: any,
    ray_parameters: any,
    ray_job_file: str,
    ray_directory: str,
    ray_job_envs: any,
    ray_job_packages: any,
    timeout: int
) -> bool:
    ray_job_id = submit_ray_job(
        ray_client = ray_client,
        ray_parameters = ray_parameters,
        ray_job_file = ray_job_file,
        working_directory = ray_directory,
        ray_job_envs = ray_job_envs,
        ray_job_packages = ray_job_packages
    )

    print('Ray batch job id: ' + str(ray_job_id))
    
    ray_job_status, ray_job_logs = wait_ray_job(
        ray_client = ray_client,
        ray_job_id = ray_job_id,
        waited_status = {
            JobStatus.SUCCEEDED, 
            JobStatus.STOPPED, 
            JobStatus.FAILED
        }, 
        timeout = timeout
    )
    print('Ray batch job ended:')
    success = True
    if not ray_job_status == JobStatus.SUCCEEDED:
        print('Ray batch job failed')
        success = False
    else:
        print('Ray batch job succeeded')
    print(ray_job_logs)
    return success

  from .autonotebook import tqdm as notebook_tqdm
2024-11-20 14:54:18,329	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Job Functions

## Fetching and Storing Data

In [None]:
from decouple import Config,RepositoryEnv
env_path = '/home/sfniila/.ssh/.env'
env_config = Config(RepositoryEnv(env_path))
github_token = env_config.get('GITHUB_TOKEN')
repository_owner = 'K123AsJ0k1'
repository_name = 'cloud-hpc-oss-mlops-platform'
object_bucket = 'llm-rag'

In [8]:
ray_client = setup_ray(
    services = {
        'ray-dashboard': '127.0.0.1:8265'
    },
    timeout = 10
)

In [18]:
storage_parameters = {
    'mongo-username': 'mongo123',
    'mongo-password': 'mongo456',
    'mongo-address': 'mongodb-service.llm-pro.svc.cluster.local',
    'mongo-port': '27017',
    'minio-endpoint': 'mlflow-minio-service.mlflow.svc.cluster.local:9000',
    'minio-username': 'minioadmin',
    'minio-password': 'minioadmin',
    'object-bucket': object_bucket,
    'object-path': repository_name,
    'fetch-path-prefix': 'paths'
}

In [19]:
process_parameters = {
    'worker-number': 6
}

In [20]:
data_parameters = {
    'github-token': github_token,
    'repository-owner': repository_owner,
    'repository-name': repository_name,
    'replace': 'false',
    'relevant-files': [
        'md',
        'yaml',
        'py',
        'ipynb'
    ],
    'document-type-priority': {
        'md': 1,
        'py': 2,
        'ipynb': 3,
        'yaml': 4
    },
    'batch-size': 20
}

In [21]:
fetch_store_parameters = {
    'process-parameters': process_parameters,
    'storage-parameters': storage_parameters,
    'data-parameters': data_parameters
}

In [23]:
job_status = ray_job_handler(
    ray_client = ray_client,
    ray_parameters = fetch_store_parameters,
    ray_job_file = 'store-rag-data.py',
    ray_directory = '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/ray_step_1',
    ray_job_envs = {},
    ray_job_packages = [
        'pymongo',
        'minio',
        'PyGithub',
        'Markdown',
        'tree-sitter==0.23.0',
        'tree-sitter-python==0.23.0',
        'beautifulsoup4',
        'nbformat'
    ],
    timeout = 20
)

2024-11-18 11:52:09,943	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_188cd878cb8cdcef.zip.
2024-11-18 11:52:09,945	INFO packaging.py:530 -- Creating a file package for local directory '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/ray_step_1'.


Ray batch job id: raysubmit_UbQjwqAB6knp9vF7
status: PENDING
status: RUNNING
status: RUNNING
status: RUNNING
Ray batch job ended:
Ray batch job failed
2024-11-18 01:52:10,038	INFO job_manager.py:529 -- Runtime env is setting up.
Starting ray job
Python version is:3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]
Ray version is:2.38.0
PyGithub version is:2.5.0
PyMongo version is:4.10.1
Markdown version is:3.7
Tree-sitter version is:0.23.0
Tree-sitter-python version is:0.23.0
BeautifulSoup version is:4.12.3
NBformat version is:5.10.4
Running store data
Creating minio client
Minio client created
Getting repository paths
Dividing paths for 6 workers
Fetching paths
Filtering paths
Amount of paths: 2204
Referencing paths
2024-11-18 01:52:12,664	INFO worker.py:1491 -- Using address 10.244.0.68:6379 set in the environment variable RAY_ADDRESS
2024-11-18 01:52:12,664	INFO worker.py:1631 -- Connecting to existing Ray cluster at address: 10.244.0.68:6379...
2024-11-

## CPU Preprocess

In [2]:
ray_client = setup_ray(
    services = {
        'ray-dashboard': '127.0.0.1:8265'
    },
    timeout = 10
)

In [3]:
process_parameters = {
    'worker-number': 6
}

In [4]:
repository_owner = 'K123AsJ0k1'
repository_name = 'cloud-hpc-oss-mlops-platform'
object_bucket = 'llm-rag'

In [5]:
storage_parameters = {
    'object-bucket': object_bucket,
    'object-path': repository_name,
    'vector-collection-prefix': 'embeddings',
    'search-collection-prefix': 'keywords',
    'vector-identity-prefix': 'vector-identities',
    'vector-hash-prefix': 'vector-hashes',
    'search-identity-prefix': 'search-identities',
    'mongo-username': 'mongo123',
    'mongo-password': 'mongo456',
    'mongo-address': 'mongodb-service.llm.svc.cluster.local',
    'mongo-port': '27017',
    'minio-endpoint': 'mlflow-minio-service.mlflow.svc.cluster.local:9000',
    'minio-username': 'minioadmin',
    'minio-password': 'minioadmin',
    'qdrant-key': 'qdrant_key',
    'qdrant-address': 'qdrant-service.llm.svc.cluster.local', 
    'qdrant-port': '7201',
    'meili-key': 'meili_key', 
    'meili-host': 'http://meili-service.llm.svc.cluster.local:7202'
}

In [6]:
data_parameters = {
    'repository-owner': repository_owner,
    'repository-name': repository_name,
    'document-type-priority': {
        'md': 1,
        'py': 2,
        'ipynb': 3,
        'yaml': 4
    },
    'vector-collection-print': 2,
    'search-collection-print': 2,
    'python': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    },
    'markdown': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    },
    'yaml': {
        'chunk-size': 50,
        'chunk-overlap': 0,
        'model-name': 'sentence-transformers/all-MiniLM-L6-v2'
    }
}

In [7]:
preprocess_parameters = {
    'process-parameters': process_parameters,
    'storage-parameters': storage_parameters,
    'data-parameters': data_parameters
}

In [8]:
job_status = ray_job_handler(
    ray_client = ray_client,
    ray_parameters = preprocess_parameters,
    ray_job_file = 'preprocess-rag-data.py',
    ray_directory = '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/ray_step_2',
    ray_job_envs = {},
    ray_job_packages = [
        'pymongo',
        'minio',
        'qdrant-client',
        'meilisearch',
        'langchain',
        'langchain-huggingface',
        'spacy',
        'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl'
    ],
    timeout = 30
)

2024-11-14 10:53:30,764	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_898f3e281f14cab8.zip.
2024-11-14 10:53:30,766	INFO packaging.py:530 -- Creating a file package for local directory '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/ray_step_2'.


Ray batch job id: raysubmit_kURFDiULrFYiEbq3
status: PENDING
status: PENDING
status: PENDING
status: PENDING
status: PENDING
status: PENDING
Ray batch job ended:
Ray batch job failed



## GPU Preprocess

In [30]:
repository_owner = 'K123AsJ0k1'
repository_name = 'cloud-hpc-oss-mlops-platform'
object_bucket = 'llm-rag'

In [72]:
ray_client = setup_ray(
    services = {
        'ray-dashboard': '127.0.0.1:8265'
    },
    timeout = 10
)

In [73]:
process_parameters = {
    'actor-number': 1,
    'embedding-model': 'sentence-transformers/all-MiniLM-L6-v2',
    'keyword-model': 'en_core_web_sm',
    'worker-number': 6,
}

In [74]:
storage_parameters = {
    'object-bucket': object_bucket,
    'object-path': repository_name,
    'vector-collection-prefix': 'embeddings',
    'search-collection-prefix': 'keywords',
    'vector-identity-prefix': 'vector-identities',
    'vector-hash-prefix': 'vector-hashes',
    'search-identity-prefix': 'search-identities',
    'mongo-username': 'mongo123',
    'mongo-password': 'mongo456',
    'mongo-address': 'mongodb-service.llm-pro.svc.cluster.local',
    'mongo-port': '27017',
    'minio-endpoint': 'mlflow-minio-service.mlflow.svc.cluster.local:9000',
    'minio-username': 'minioadmin',
    'minio-password': 'minioadmin',
    'qdrant-key': 'qdrant_key',
    'qdrant-address': 'qdrant-service.llm-pro.svc.cluster.local', 
    'qdrant-port': '7201',
    'meili-key': 'meili_key', 
    'meili-host': 'http://meili-service.llm-pro.svc.cluster.local:7202'
}

In [75]:
data_parameters = {
    'repository-owner': repository_owner,
    'repository-name': repository_name,
    'document-type-priority': {
        'md': 1,
        'py': 2,
        'ipynb': 3,
        'yaml': 4
    },
    'vector-collection-print': 6,
    'search-collection-print': 6,
    'markdown': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    },
    'python': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    },
    'yaml': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    }
}

In [76]:
preprocess_parameters = {
    'process-parameters': process_parameters,
    'storage-parameters': storage_parameters,
    'data-parameters': data_parameters
}

In [78]:
job_status = ray_job_handler(
    ray_client = ray_client,
    ray_parameters = preprocess_parameters,
    ray_job_file = 'preprocess-rag-data.py',
    ray_directory = '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/cpu_gpu_preprocess_step',
    ray_job_envs = {},
    ray_job_packages = [
        'pymongo',
        'minio',
        'qdrant-client',
        'meilisearch',
        'langchain',
        'langchain-huggingface',
        'spacy[cuda122]',
        'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl'
    ],
    timeout = 5
)

2024-11-18 15:34:36,530	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_ea61199e51e0297d.zip.
2024-11-18 15:34:36,531	INFO packaging.py:530 -- Creating a file package for local directory '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/cpu_gpu_preprocess_step'.


Ray batch job id: raysubmit_hHjgt8c3pqtAgieq
status: PENDING
Ray batch job ended:
Ray batch job failed
2024-11-18 05:34:36,635	INFO job_manager.py:529 -- Runtime env is setting up.
Starting ray job
Python version is:3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]
Ray version is:2.38.0
PyMongo version is:4.10.1
Qdrant version is:1.12.1
Meilisearch version is:0.31.6
Langchain version is:0.3.7
Langchain huggingface version is:0.1.2
Spacy version is:3.7.5
Running preprocess
Creating mongo client
Mongo client created
Creating minio client
Minio client created
Creating qdrant client
Qdrant client created
Creating 1 generator actors
2024-11-18 05:34:41,165	INFO worker.py:1491 -- Using address 10.244.0.83:6379 set in the environment variable RAY_ADDRESS
2024-11-18 05:34:41,165	INFO worker.py:1631 -- Connecting to existing Ray cluster at address: 10.244.0.83:6379...
2024-11-18 05:34:41,190	INFO worker.py:1807 -- Connected to Ray cluster. View the dashboard at [

## Batch GPU

In [18]:
repository_owner = 'K123AsJ0k1'
repository_name = 'cloud-hpc-oss-mlops-platform'
object_bucket = 'llm-rag'

In [19]:
ray_client = setup_ray(
    services = {
        'ray-dashboard': '127.0.0.1:8265'
    },
    timeout = 10
)

In [20]:
process_parameters = {
    'actor-number': 1,
    'embedding-model': 'sentence-transformers/all-MiniLM-L6-v2',
    'keyword-model': 'en_core_web_sm',
    'worker-number': 6,
}

In [21]:
storage_parameters = {
    'object-bucket': object_bucket,
    'object-path': repository_name,
    'vector-collection-prefix': 'embeddings',
    'search-collection-prefix': 'keywords',
    'vector-identity-prefix': 'vector-identities',
    'search-identity-prefix': 'search-identities',
    'mongo-username': 'mongo123',
    'mongo-password': 'mongo456',
    'mongo-address': 'mongodb-service.llm-pro.svc.cluster.local',
    'mongo-port': '27017',
    'minio-endpoint': 'mlflow-minio-service.mlflow.svc.cluster.local:9000',
    'minio-username': 'minioadmin',
    'minio-password': 'minioadmin',
    'qdrant-key': 'qdrant_key',
    'qdrant-address': 'qdrant-service.llm-pro.svc.cluster.local', 
    'qdrant-port': '7201',
    'meili-key': 'meili_key', 
    'meili-host': 'http://meili-service.llm-pro.svc.cluster.local:7202'
}

In [22]:
data_parameters = {
    'repository-owner': repository_owner,
    'repository-name': repository_name,
    'document-type-priority': {
        'md': 1,
        'py': 2,
        'ipynb': 3,
        'yaml': 4
    },
    'vector-collection-print': 10,
    'embedding-length': 384,
    'embedding-batch-size': 20,
    'markdown': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    },
    'python': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    },
    'yaml': {
        'chunk-size': 50,
        'chunk-overlap': 0,
    },
    'points-batch-size': 40,
    'search-collection-print': 10,
    'keyword-batch-size': 20
}

In [23]:
preprocess_parameters = {
    'process-parameters': process_parameters,
    'storage-parameters': storage_parameters,
    'data-parameters': data_parameters
}

In [27]:
job_status = ray_job_handler(
    ray_client = ray_client,
    ray_parameters = preprocess_parameters,
    ray_job_file = 'preprocess-data.py',
    ray_directory = '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/batch_cpu_gpu_preprocess_step',
    ray_job_envs = {},
    ray_job_packages = [
        'pymongo',
        'minio',
        'qdrant-client',
        'meilisearch',
        'langchain',
        'langchain-huggingface',
        'spacy[cuda122]',
        'https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl'
    ],
    timeout = 5
)

2024-11-20 15:41:51,414	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_336c2a26bf83d3eb.zip.
2024-11-20 15:41:51,421	INFO packaging.py:530 -- Creating a file package for local directory '/home/sfniila/Project/cloud-hpc-oss-mlops-platform/applications/development/LLMs/pipeline/preprocessing/batch_cpu_gpu_preprocess_step'.


Ray batch job id: raysubmit_V7MkbeMrrV4iyTeX
status: PENDING
Ray batch job ended:
Ray batch job failed
2024-11-20 05:41:51,529	INFO job_manager.py:529 -- Runtime env is setting up.
Starting ray job
Python version is:3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]
Ray version is:2.38.0
PyMongo version is:4.10.1
Qdrant version is:1.12.1
Meilisearch version is:0.31.6
Langchain version is:0.3.7
Langchain huggingface version is:0.1.2
Spacy version is:3.7.5
Running preprocess
Creating mongo client
Mongo client created
Creating minio client
Minio client created
Creating qdrant client
Qdrant client created
Creating 1 generator actors
2024-11-20 05:41:56,159	INFO worker.py:1491 -- Using address 10.244.0.71:6379 set in the environment variable RAY_ADDRESS
2024-11-20 05:41:56,159	INFO worker.py:1631 -- Connecting to existing Ray cluster at address: 10.244.0.71:6379...
2024-11-20 05:41:56,174	INFO worker.py:1807 -- Connected to Ray cluster. View the dashboard at [

In [65]:
ray_client.stop_job('raysubmit_XewC5d51N1N2Qg1M')

True

In [3]:
def batch_list(
    target: any, 
    size: int
):
    return [target[i:i + size] for i in range(0, len(target), size)]

In [5]:
example_list = [1,2,3,4,5]

In [12]:
batches = batch_list(
    target = example_list, 
    size = 3
)

In [13]:
batches

[[1, 2, 3], [4, 5]]

In [None]:
'github-token': github_token,
'repository-owner': repository_owner,
'repository-name': repository_name,

In [26]:
from github import Github

def pygithub_get_repo_paths(
    token: str,
    owner: str, 
    name: str
) -> any:
    g = Github(token)
    repo = g.get_repo(f"{owner}/{name}")
    contents = repo.get_contents("")
    paths = []
    print(len(contents))
    #while len(contents) > 0:
    #  file_content = contents.pop(0)
    #  if file_content.type == 'dir':
    #    contents.extend(repo.get_contents(file_content.path))
    #  else:
    #    paths.append(file_content.path)
    g.close()
    return paths

In [27]:
pygithub_get_repo_paths(
    token = github_token,
    owner = repository_owner, 
    name = repository_name
)

22


[]

In [65]:
from qdrant_client import QdrantClient as qc

def qdrant_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, qc.Connection)
    except Exception as e:
        return False

def qdrant_setup_client(
    api_key: str,
    address: str, 
    port: str
) -> any:
    try:
        qdrant_client = qc(
            host = address,
            port = int(port),
            api_key = api_key,
            https = False
        ) 
        return qdrant_client
    except Exception as e:
        return None

def qdrant_create_collection(
    qdrant_client: any, 
    collection_name: str,
    configuration: any
) -> any:
    try:
        result = qdrant_client.create_collection(
            collection_name = collection_name,
            vectors_config = configuration
        )
        return result
    except Exception as e:
        print(e)
        return None

def qdrant_get_collection(
    qdrant_client: any, 
    collection_name: str
) -> any:
    try:
        collection = qdrant_client.get_collection(
            collection_name = collection_name
        )
        return collection
    except Exception as e:
        return None

def qdrant_collection_number(
    qdrant_client: any, 
    collection_name: str,
    count_filter: any
) -> any:
    try:
        result = qdrant_client.count(
            collection_name = collection_name,
            count_filter = count_filter,
            exact =  True
        )
        return result.count
    except Exception as e:
        print(e)
        return None

def qdrant_list_collections(
    qdrant_client: any
) -> any:
    try:
        collections = qdrant_client.get_collections()
        collection_list = []
        for description in collections.collections:
            collection_list.append(description.name)
        return collection_list
    except Exception as e:
        return []
    
def qdrant_remove_collection(
    qdrant_client: any, 
    collection_name: str
) -> bool:
    try:
        qdrant_client.delete_collection(collection_name)
        return True
    except Exception as e:
        return False

def qdrant_upsert_points(
    qdrant_client: qc, 
    collection_name: str,
    points: any
) -> any:
    try:
        results = qdrant_client.upsert(
            collection_name = collection_name, 
            points = points
        )
        return results
    except Exception as e:
        print(e)
        return None

def qdrant_search_data(
    qdrant_client: qc,  
    collection_name: str,
    scroll_filter: any,
    limit: int,
    offset: any
) -> any:
    try:
        hits = qdrant_client.scroll(
            collection_name = collection_name,
            scroll_filter = scroll_filter,
            limit = limit,
            with_payload = True,
            offset = offset
        )
        return hits
    except Exception as e:
        print(e)
        return []

def qdrant_search_vectors(
    qdrant_client: qc,  
    collection_name: str,
    query_vector: any,
    limit: str
) -> any:
    try:
        hits = qdrant_client.search(
            collection_name = collection_name,
            query_vector = query_vector,
            limit = limit
        )
        return hits
    except Exception as e:
        return []

def qdrant_remove_points(
    qdrant_client: qc,  
    collection_name: str, 
    points_selector: any
) -> bool:
    try:
        results = qdrant_client.delete(
            collection_name = collection_name,
            points_selector = points_selector
        )
        return results
    except Exception as e:
        print(f"Error removing document: {e}")
        return None

In [7]:
qdrant_client = qdrant_setup_client(
    api_key = 'qdrant_key',
    address = '127.0.0.1', 
    port = '7201'
)

In [16]:
example_number = qdrant_collection_number(
    qdrant_client = qdrant_client, 
    collection_name = 'K123AsJ0k1-cloud-hpc-oss-mlops-platform-ipynb-embeddings',
    count_filter = {}
)

In [17]:
example_number

25862

In [79]:
from qdrant_client.models import PointIdsList

def remove_duplicate_vectors(
    qdrant_client: any,
    collection_name: str
):
    print('Cleaning collection ' + str(collection_name))
    collection_number = qdrant_collection_number(
        qdrant_client = qdrant_client, 
        collection_name = collection_name,
        count_filter = {}
    )

    print('Collection vectors: ' + str(collection_number))

    batch_size = 200
    scroll_offset = None

    unique_point_ids = set()
    unique_chunk_hashes = set()
    duplicate_vectors = []
    while True:
        vectors = qdrant_search_data(
            qdrant_client = qdrant_client,  
            collection_name = collection_name,
            scroll_filter = {},
            limit = batch_size,
            offset = scroll_offset
        )
        
        for vector in vectors[0]:
            chunk_hash = vector.payload['chunk_hash']
            vector_id = vector.id
            # Scroll can cause double count
            # so id check is needed
            if not vector_id in unique_point_ids:
                unique_point_ids.add(vector_id)
                if not chunk_hash in unique_chunk_hashes:
                    unique_chunk_hashes.add(chunk_hash)
                else:
                    duplicate_vectors.append(vector_id)

        if len(vectors[0]) < batch_size:
            break

        scroll_offset = vectors[0][-1].id

    print('Found unique vectors: ' + str(len(unique_chunk_hashes)))
    print('Found duplicate vectors: ' + str(len(duplicate_vectors)))
    if 0 < len(duplicate_vectors):
        status = qdrant_remove_points(
            qdrant_client = qdrant_client,  
            collection_name = collection_name, 
            points_selector = PointIdsList(
                points = duplicate_vectors
            )
        ) 

In [80]:
remove_duplicate_vectors(
    qdrant_client = qdrant_client,
    collection_name = 'K123AsJ0k1-cloud-hpc-oss-mlops-platform-yaml-embeddings'
)

Cleaning collection K123AsJ0k1-cloud-hpc-oss-mlops-platform-yaml-embeddings
Collection vectors: 63954
Found unique vectors: 31740
Found duplicate vectors: 32214


In [None]:
example_tuples = [
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|ipynb', '3-1', 3, 10),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|ipynb', '3-2', 3, 20),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|md', '1-1', 1, 10),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|md', '1-2', 1, 5),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|py', '2-1', 2, 7),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|py', '2-2', 2, 2),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml', '4-1', 4, 8),
    ('K123AsJ0k1|cloud-hpc-oss-mlops-platform|yaml', '4-2', 4, 3)
]

In [None]:
def round_robin_division(
    target_list: any, 
    number: int
) -> any:
    lists = [[] for _ in range(number)]
    i = 0
    sorted_list = sorted(target_list, key = lambda x: (x[-2], x[-1]))
    for elem in sorted_list:
        lists[i].append(elem)
        i = (i + 1) % number
    return lists

In [4]:
import meilisearch as ms

def meili_is_client(
    storage_client: any
) -> any:
    try:
        return isinstance(storage_client, ms.Connection)
    except Exception as e:
        print(e)
        return False

def meili_setup_client(
    api_key: str,
    host: str
) -> any:
    try:
        meili_client = ms.Client(
            url = host, 
            api_key = api_key
        )
        return meili_client 
    except Exception as e:
        print(e)
        return None

def meili_get_index( 
    meili_client: any, 
    index_name: str
) -> any:
    try:
        index = meili_client.index(
            uid = index_name
        )
        return index
    except Exception as e:
        print(e)
        return None
    
def meili_check_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        meili_client.get_index(
            uid = index_name
        )
        return True
    except Exception as e:
        print(e)
        return False
    
def meili_remove_index(
    meili_client: any, 
    index_name: str
) -> bool:
    try:
        response = meili_client.index(
            index_name = index_name
        ).delete()
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_list_indexes(
    meili_client: any
) -> bool:
    try:
        names = []
        indexes = meili_client.get_indexes()
        for index in indexes['results']:
            names.append(index.uid)
        return names
    except Exception as e:
        print(e)
        return None

def meili_add_documents(
    meili_client: any, 
    index_name: str, 
    documents: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.add_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_set_filterable(
    meili_client: any, 
    index_name: str, 
    attributes: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.update_filterable_attributes(attributes)
        return response
    except Exception as e:
        print(e)
        return None

def meili_search_documents(
    meili_client: any, 
    index_name: str, 
    query: any, 
    options: any
) -> any:
    try:
        index = meili_get_index(
            meili_client = meili_client,
            index_name = index_name
        )
        response = index.search(
            query,
            options
        )
        return response
    except Exception as e:
        print(e)
        return None
    
def meili_update_documents(
    meili_client, 
    index_name, 
    documents
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.update_documents(
            documents = documents
        )
        return response
    except Exception as e:
        print(e)
        return None

def meili_delete_documents(
    meili_client: any, 
    index_name: str, 
    ids: any
) -> any:
    try:
        index = meili_client.index(
            index_name = index_name
        )
        response = index.delete_documents(
            document_ids = ids
        )
        return response
    except Exception as e:
        print(e)
        return None

In [5]:
meili_client = meili_setup_client(
    host = 'http://127.0.0.1:7202', 
    api_key = 'meili_key'
)

In [6]:
meili_client.index('K123AsJ0k1-cloud-hpc-oss-mlops-platform-ipynb-keywords').delete()

TaskInfo(task_uid=24124, index_uid='K123AsJ0k1-cloud-hpc-oss-mlops-platform-ipynb-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 11, 20, 13, 2, 31, 189034))

In [7]:
meili_client.index('K123AsJ0k1-cloud-hpc-oss-mlops-platform-md-keywords').delete()

TaskInfo(task_uid=24125, index_uid='K123AsJ0k1-cloud-hpc-oss-mlops-platform-md-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 11, 20, 13, 2, 34, 395815))

In [8]:
meili_client.index('K123AsJ0k1-cloud-hpc-oss-mlops-platform-py-keywords').delete()

TaskInfo(task_uid=24126, index_uid='K123AsJ0k1-cloud-hpc-oss-mlops-platform-py-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 11, 20, 13, 2, 37, 48013))

In [9]:
meili_client.index('K123AsJ0k1-cloud-hpc-oss-mlops-platform-yaml-keywords').delete()

TaskInfo(task_uid=24127, index_uid='K123AsJ0k1-cloud-hpc-oss-mlops-platform-yaml-keywords', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2024, 11, 20, 13, 2, 39, 737196))