From a5f042de8c84c1ae4c5226e532221a44c36ba0e3 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 16:35:06 -0400 Subject: [PATCH 01/10] Replace data-prep-lab etc with data-prep-kit in prep for repo rename. Signed-off-by: David Wood --- .make.defaults | 8 ++++---- .make.versions | 4 ++-- CONTRIBUTING.md | 4 ++-- Makefile | 2 +- README.md | 8 ++++---- data-processing-lib/Makefile | 4 ++-- data-processing-lib/doc/testing-transforms.md | 8 ++++---- data-processing-lib/pyproject.toml | 2 +- .../src/data_processing/utils/__init__.py | 2 +- .../src/data_processing/utils/config.py | 10 +++++----- data-processing-lib/src/data_processing/utils/log.py | 8 ++++---- doc/repo.md | 10 +++++----- kfp/doc/simple_transform_pipeline.md | 2 +- kfp/kfp_ray_components/Dockerfile | 4 ++-- kfp/kfp_ray_components/Makefile | 4 ++-- kfp/kfp_ray_components/cleanupRayComponent.yaml | 2 +- kfp/kfp_ray_components/createRayComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- kfp/kfp_ray_components/requirements.txt | 2 +- kfp/kfp_support_lib/Makefile | 4 ++-- kfp/kfp_support_lib/README.md | 4 ++-- kfp/kfp_support_lib/pyproject.toml | 4 ++-- kfp/transform_workflows/.make.transforms_workflows | 2 +- .../code/code_quality/code_quality_wf.py | 4 ++-- kfp/transform_workflows/code/malware/malware_wf.py | 4 ++-- .../code/proglang_select/proglang_select_wf.py | 4 ++-- .../superworkflows/superworkflow_dedups_sample_wf.py | 6 +++--- kfp/transform_workflows/universal/doc_id/doc_id_wf.py | 4 ++-- kfp/transform_workflows/universal/ededup/ededup_wf.py | 4 ++-- kfp/transform_workflows/universal/fdedup/fdedup_wf.py | 4 ++-- kfp/transform_workflows/universal/filter/filter_wf.py | 4 ++-- kfp/transform_workflows/universal/noop/noop_wf.py | 4 ++-- .../universal/tokenization/tokenization_wf.py | 4 ++-- kind/README.md | 4 ++-- mkdocs.yml | 2 +- tools/ingest2parquet/README.md | 2 +- transforms/code/code_quality/requirements.txt | 4 ++-- .../code/code_quality/src/code_quality_transform.py | 2 +- .../code/code_quality/test-data/expected/metadata.json | 4 ++-- transforms/code/malware/requirements.txt | 4 ++-- transforms/code/proglang_select/requirements.txt | 2 +- transforms/universal/doc_id/requirements.txt | 2 +- transforms/universal/ededup/requirements.txt | 2 +- transforms/universal/fdedup/requirements.txt | 2 +- transforms/universal/filter/README.md | 2 +- transforms/universal/filter/requirements.txt | 2 +- transforms/universal/noop/requirements.txt | 2 +- transforms/universal/tokenization/requirements.txt | 2 +- 50 files changed, 93 insertions(+), 93 deletions(-) diff --git a/.make.defaults b/.make.defaults index 76bdb0643..311f6a13a 100644 --- a/.make.defaults +++ b/.make.defaults @@ -37,8 +37,8 @@ DOCKER_FILE?=Dockerfile DOCKER?=docker DOCKER_HOSTNAME?=quay.io DOCKER_NAMESPACE ?= dataprep1 -DOCKER_REGISTRY_USER?=$(DPL_DOCKER_REGISTRY_USER) -DOCKER_REGISTRY_KEY?=$(DPL_DOCKER_REGISTRY_KEY) +DOCKER_REGISTRY_USER?=$(DPK_DOCKER_REGISTRY_USER) +DOCKER_REGISTRY_KEY?=$(DPK_DOCKER_REGISTRY_KEY) DOCKER_REGISTRY_ENDPOINT?=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE) DOCKER_IMAGE?=${DOCKER_REGISTRY_ENDPOINT}/$(DOCKER_NAME):$(DOCKER_IMAGE_VERSION) include $(REPOROOT)/.make.versions @@ -186,7 +186,7 @@ __check_defined = \ @echo Installing source from data processing library for venv source venv/bin/activate; \ pip install pytest; \ - pip uninstall -y data-prep-lab; \ + pip uninstall -y data-prep-kit; \ if [ ! 
-z "$(EXTRA_INDEX_URL)" ]; then \ extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \ fi; \ @@ -265,7 +265,7 @@ __check_defined = \ $(MAKE) CHECK_RUNNABLE=minio .defaults.check.installed $(MAKE) CHECK_RUNNABLE=mc .defaults.check.installed -MINIO_DIR=/tmp/data-prep-lab +MINIO_DIR=/tmp/data-prep-kit MINIO_ALIAS=local # These are the credentials used by samples. MINIO_ADMIN_USER=localminioaccesskey diff --git a/.make.versions b/.make.versions index 7a52b16c4..ff2eee01c 100644 --- a/.make.versions +++ b/.make.versions @@ -4,8 +4,8 @@ ################################################################################ # Data prep lab wheel version -DPL_LIB_VERSION=0.1.6 -DPL_LIB_KFP_VERSION=0.1.8 +DPK_LIB_VERSION=0.1.6 +DPK_LIB_KFP_VERSION=0.1.8 # Begin transform versions/tags BLOCKLIST_VERSION=0.2.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d822e48f..f338e457a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,8 +72,8 @@ git commit -s Please install Python 3.10 or 3.11, then ``` -git clone git@github.ibm.com:IBM/data-prep-lab.git -cd data-prep-lab +git clone git@github.ibm.com:IBM/data-prep-kit.git +cd data-prep-kit pip install pre-commit pip install twine pre-commit install diff --git a/Makefile b/Makefile index b9629f64f..3ccf78271 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ test:: @$(MAKE) RULE=$@ .recurse lib-release: - @# Help: Publish data-prep-lab $(DPL_LIB_VERSION) and data-prep-lab-kfp $(DPL_LIB_KFP_VERSION) libraries to pypi + @# Help: Publish data-prep-kit $(DPK_LIB_VERSION) and data-prep-kit-kfp $(DPK_LIB_KFP_VERSION) libraries to pypi @$(MAKE) -C data-processing-lib build publish @$(MAKE) -C kfp/kfp_support_lib build publish @echo "" diff --git a/README.md b/README.md index 81cf5bf99..97fe1d295 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@
--- @@ -129,8 +129,8 @@ Docker/Podman ### Installation Steps ```shell -git clone git@github.com:IBM/data-prep-lab.git -cd data-prep-lab +git clone git@github.com:IBM/data-prep-kit.git +cd data-prep-kit pip install pre-commit pip install twine pre-commit install diff --git a/data-processing-lib/Makefile b/data-processing-lib/Makefile index 795cc4e6d..51f845cde 100644 --- a/data-processing-lib/Makefile +++ b/data-processing-lib/Makefile @@ -3,7 +3,7 @@ REPOROOT=../ include ../.make.defaults include ../.make.versions -TAG := "v${DPL_LIB_VERSION}" +TAG := "v${DPK_LIB_VERSION}" clean:: @@ -16,7 +16,7 @@ clean:: update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml - sed -e 's/^version[ ]*=.*/version = "'${DPL_LIB_VERSION}'"/' pyproject.toml > tt.toml + sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml mv tt.toml pyproject.toml setup:: diff --git a/data-processing-lib/doc/testing-transforms.md b/data-processing-lib/doc/testing-transforms.md index d9f64d933..2d001f965 100644 --- a/data-processing-lib/doc/testing-transforms.md +++ b/data-processing-lib/doc/testing-transforms.md @@ -61,14 +61,14 @@ to use different models and perhaps as a result have different results. Once the test class is defined you may run the test from your IDE or from the command line... ```shell -% cd .../data-prep-lab/transforms/universal/noop/src +% cd .../data-prep-kit/transforms/universal/noop/src % make venv % source venv/bin/activate -(venv)% export PYTHONPATH=.../data-prep-lab/transforms/universal/noop/src +(venv)% export PYTHONPATH=.../data-prep-kit/transforms/universal/noop/src (venv)% pytest test/test_noop.py ================================================================================ test session starts ================================================================================ platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0 -rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop +rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop plugins: cov-4.1.0 collected 2 items @@ -85,7 +85,7 @@ source venv/bin/activate; \ cd test; pytest . ========================================================================================== test session starts ========================================================================================== platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0 -rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop/test +rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop/test collected 3 items test_noop.py .. 
[ 66%] diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index da3fcef25..0851a78c4 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "data_prep_lab" +name = "data_prep_kit" version = "0.1.6" requires-python = ">=3.10" description = "Data Preparation Laboratory Library" diff --git a/data-processing-lib/src/data_processing/utils/__init__.py b/data-processing-lib/src/data_processing/utils/__init__.py index f14d4bb8f..c2b1285b2 100644 --- a/data-processing-lib/src/data_processing/utils/__init__.py +++ b/data-processing-lib/src/data_processing/utils/__init__.py @@ -1,5 +1,5 @@ from data_processing.utils.cli_utils import GB, KB, MB, CLIArgumentProvider, str2bool from data_processing.utils.params_utils import ParamsUtils -from data_processing.utils.config import DPLConfig, add_if_missing +from data_processing.utils.config import DPKConfig, add_if_missing from data_processing.utils.log import get_logger from data_processing.utils.transform_utils import TransformUtils, RANDOM_SEED, LOCAL_TO_DISK diff --git a/data-processing-lib/src/data_processing/utils/config.py b/data-processing-lib/src/data_processing/utils/config.py index 5d0cab340..6183fe8e4 100644 --- a/data-processing-lib/src/data_processing/utils/config.py +++ b/data-processing-lib/src/data_processing/utils/config.py @@ -14,7 +14,7 @@ from typing import Any, Union -class DPLConfig: +class DPKConfig: @staticmethod def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]: for var in env_var_list: @@ -25,10 +25,10 @@ def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]: # print(f"Did not find any of the following env vars {env_var_list}") return None - HUGGING_FACE_TOKEN = _get_first_env_var(["DPL_HUGGING_FACE_TOKEN"]) - """ Set from DPL_HUGGING_FACE_TOKEN env var(s) """ - DEFAULT_LOG_LEVEL = os.environ.get("DPL_LOG_LEVEL", "INFO") - """ Set from DPL_LOG_LEVEL env var(s) """ + HUGGING_FACE_TOKEN = _get_first_env_var(["DPK_HUGGING_FACE_TOKEN"]) + """ Set from DPK_HUGGING_FACE_TOKEN env var(s) """ + DEFAULT_LOG_LEVEL = os.environ.get("DPK_LOG_LEVEL", "INFO") + """ Set from DPK_LOG_LEVEL env var(s) """ def add_if_missing(config: dict[str, Any], key: str, dflt: Any): diff --git a/data-processing-lib/src/data_processing/utils/log.py b/data-processing-lib/src/data_processing/utils/log.py index 990491741..54cf3f4d9 100644 --- a/data-processing-lib/src/data_processing/utils/log.py +++ b/data-processing-lib/src/data_processing/utils/log.py @@ -13,16 +13,16 @@ import logging import os -from data_processing.utils import DPLConfig +from data_processing.utils import DPKConfig def get_log_level(name: str = None): if name is None: - level_name = DPLConfig.DEFAULT_LOG_LEVEL + level_name = DPKConfig.DEFAULT_LOG_LEVEL else: name = name.upper() - name = "DPL_" + name + "_LOG_LEVEL" - level_name = os.environ.get(name, DPLConfig.DEFAULT_LOG_LEVEL) + name = "DPK_" + name + "_LOG_LEVEL" + level_name = os.environ.get(name, DPKConfig.DEFAULT_LOG_LEVEL) return level_name diff --git a/doc/repo.md b/doc/repo.md index 8b8d721a0..f953ff46f 100644 --- a/doc/repo.md +++ b/doc/repo.md @@ -48,11 +48,11 @@ Target Description build Create the venv and build the transform image clean Clean up the virtual environment. conventions Check transform project conventions and make recommendations, if needed. 
-image Create the docker image quay.io/dataprep1/data-prep-lab/noop:0.7 -publish Publish the quay.io/dataprep1/data-prep-lab/noop:0.7 to quay.io container registry +image Create the docker image quay.io/dataprep1/data-prep-kit/noop:0.7 +publish Publish the quay.io/dataprep1/data-prep-kit/noop:0.7 to quay.io container registry setup Do nothing, since nothing to setup by default. test Run both source and image level tests. -test-image Test an quay.io/dataprep1/data-prep-lab/noop:0.7 use test source inside the image. +test-image Test an quay.io/dataprep1/data-prep-kit/noop:0.7 use test source inside the image. test-locals Run the *local*.py files in the src directory test-src Run the transform's tests and any '*local' .py files venv Install the source from the data processing library for python @@ -60,10 +60,10 @@ venv Install the source from the data processing library for pyt Overridable macro values include the following: DOCKER - the name of the docker executable to use. DOCKER=docker DOCKER_FILE - the name of the docker file to use. DOCKER_FILE=Dockerfile -DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-lab +DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-kit DOCKER_HOSTNAME - the name of the docker registry to use. DOCKER_HOSTNAME=quay.io DOCKER_NAMESPACE - the name space to use in the registry. DOCKER_NAMESPACE=dataprep1 -DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-lab +DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-kit DOCKER_REGISTRY_USER - the docker user to use. DOCKER_REGISTRY_USER=dataprep1 DOCKER_REGISTRY_KEY - the docker user to use. DOCKER_REGISTRY_KEY=secret PYTHON - the python executable to use. PYTHON=python diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 6dd8e15c5..43b13cc51 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -42,7 +42,7 @@ Ray cluster. For each step we have to define a component that will execute them: ```python # components - base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.2" + base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
compute_exec_params_op = comp.func_to_container_op( diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 87e859cd9..3573ca558 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -5,8 +5,8 @@ ARG GIT_COMMIT LABEL build-date=$BUILD_DATE LABEL git-commit=$GIT_COMMIT -LABEL data-prep-lab=${DPL_LIB_VERSION} -LABEL data-prep-lab-kfp=${DPL_LIB_KFP_VERSION} +LABEL data-prep-kit=${DPK_LIB_VERSION} +LABEL data-prep-kit-kfp=${DPK_LIB_KFP_VERSION} # install libraries COPY requirements.txt requirements.txt diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 91b23eff6..50637bb57 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -10,13 +10,13 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed ' include makeenv DOCKER_FILE=Dockerfile -DOCKER_NAME=data-prep-lab/kfp-data-processing +DOCKER_NAME=data-prep-kit/kfp-data-processing IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION} # Create the docker image making sure the preloaded models are available to copy into the image .kfp_comp.image:: Dockerfile requirements.txt $(call check_defined, DOCKER_HOSTNAME) - sed -i.back "s/data-prep-lab-kfp==[0-9].*/data-prep-lab-kfp==${DPL_LIB_KFP_VERSION}/" requirements.txt + sed -i.back "s/data-prep-kit-kfp==[0-9].*/data-prep-kit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt @# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt $(DOCKER) build -t ${IMG} \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ diff --git a/kfp/kfp_ray_components/cleanupRayComponent.yaml b/kfp/kfp_ray_components/cleanupRayComponent.yaml index eb08bdbb4..9c0e55725 100644 --- a/kfp/kfp_ray_components/cleanupRayComponent.yaml +++ b/kfp/kfp_ray_components/cleanupRayComponent.yaml @@ -8,7 +8,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/createRayComponent.yaml b/kfp/kfp_ray_components/createRayComponent.yaml index aa95dcf97..b88ab1aa9 100644 --- a/kfp/kfp_ray_components/createRayComponent.yaml +++ b/kfp/kfp_ray_components/createRayComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index e5fda6bd0..79186e62d 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. 
# Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 5feba97c1..bc16d17df 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index ef8cb33de..b52a98eaf 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -32,7 +32,7 @@ outputs: implementation: container: - image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/requirements.txt b/kfp/kfp_ray_components/requirements.txt index 00f8f4372..96cfc35f0 100644 --- a/kfp/kfp_ray_components/requirements.txt +++ b/kfp/kfp_ray_components/requirements.txt @@ -1,4 +1,4 @@ -data-prep-lab-kfp==0.1.7 +data-prep-kit-kfp==0.1.7 # for fdedup scipy==1.13.0 diff --git a/kfp/kfp_support_lib/Makefile b/kfp/kfp_support_lib/Makefile index 5653619ff..f8b422110 100644 --- a/kfp/kfp_support_lib/Makefile +++ b/kfp/kfp_support_lib/Makefile @@ -24,8 +24,8 @@ clean:: update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml - sed -i.back 's/^version[ ]*=.*/version = "'${DPL_LIB_KFP_VERSION}'"/' pyproject.toml - sed -i.back 's/data-prep-lab==[0-9].*/data-prep-lab==${DPL_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml + sed -i.back 's/data-prep-kit==[0-9].*/data-prep-kit==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml build:: update-toml venv diff --git a/kfp/kfp_support_lib/README.md b/kfp/kfp_support_lib/README.md index f206dde55..f42ce9808 100644 --- a/kfp/kfp_support_lib/README.md +++ b/kfp/kfp_support_lib/README.md @@ -17,7 +17,7 @@ It comprises 2 main modules ### Git Simple clone the repo and set up the pre-commit hooks. ```shell -git clone git@github.com:IBM/data-prep-lab.git +git clone git@github.com:IBM/data-prep-kit.git cd kfp/kfp_support_lib pre-commit install ``` @@ -43,7 +43,7 @@ To begin with, establish a Kind cluster and deploy all required components by ex make setup ``` -The next step is to deploy the `data-prep-lab-kfp` package locally within a Python virtual environment. +The next step is to deploy the `data-prep-kit-kfp` package locally within a Python virtual environment. 
```bash make build diff --git a/kfp/kfp_support_lib/pyproject.toml b/kfp/kfp_support_lib/pyproject.toml index 31216c7a4..70fbc8024 100644 --- a/kfp/kfp_support_lib/pyproject.toml +++ b/kfp/kfp_support_lib/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "data_prep_lab_kfp" +name = "data_prep_kit_kfp" version = "0.1.8" requires-python = ">=3.10" description = "Data Preparation Laboratory Library. KFP support" @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==1.8.22", "requests", - "data-prep-lab==0.1.6", + "data-prep-kit==0.1.6", ] [build-system] diff --git a/kfp/transform_workflows/.make.transforms_workflows b/kfp/transform_workflows/.make.transforms_workflows index fcc276ed6..e5794cc8d 100644 --- a/kfp/transform_workflows/.make.transforms_workflows +++ b/kfp/transform_workflows/.make.transforms_workflows @@ -49,7 +49,7 @@ ${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${ $(PYTHON) -m venv ${REPOROOT}/kfp/transform_workflows/venv . ${VENV_ACTIVATE}; \ pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ - pip install data_prep_lab_kfp==${DPL_LIB_KFP_VERSION} + pip install data_prep_kit_kfp==${DPK_LIB_KFP_VERSION} .PHONY: .transforms_workflows.upload-pipeline .transforms_workflows.upload-pipeline: diff --git a/kfp/transform_workflows/code/code_quality/code_quality_wf.py b/kfp/transform_workflows/code/code_quality/code_quality_wf.py index 540b66880..8017834d6 100644 --- a/kfp/transform_workflows/code/code_quality/code_quality_wf.py +++ b/kfp/transform_workflows/code/code_quality/code_quality_wf.py @@ -27,10 +27,10 @@ EXEC_SCRIPT_NAME: str = "cq_transform.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-lab/code_quality:0.2.0" +task_image = "quay.io/dataprep1/data-prep-kit/code_quality:0.2.0" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, insted of creating a component we are creating it in place here. compute_exec_params_op = comp.func_to_container_op( diff --git a/kfp/transform_workflows/code/malware/malware_wf.py b/kfp/transform_workflows/code/malware/malware_wf.py index 190ee2916..792a09c37 100644 --- a/kfp/transform_workflows/code/malware/malware_wf.py +++ b/kfp/transform_workflows/code/malware/malware_wf.py @@ -23,10 +23,10 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "malware_transform.py" -task_image = "quay.io/dataprep1/data-prep-lab/malware:0.3" +task_image = "quay.io/dataprep1/data-prep-kit/malware:0.3" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, insted of creating a component we are creating it in place here. 
diff --git a/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py b/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py index 4d75afaee..95cc97c7f 100644 --- a/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py +++ b/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py @@ -23,10 +23,10 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "proglang_select_transform.py" -task_image = "quay.io/dataprep1/data-prep-lab/proglang_select:0.2.0" +task_image = "quay.io/dataprep1/data-prep-kit/proglang_select:0.2.0" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, insted of creating a component we are creating it in place here. diff --git a/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py b/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py index 0e63b925f..afda11de9 100644 --- a/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py +++ b/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py @@ -10,9 +10,9 @@ run_exact_dedup_op = comp.load_component_from_file("../../kfp_ray_components/executeSubWorkflowComponent.yaml") run_fuzzy_dedup_op = comp.load_component_from_file("../../kfp_ray_components/executeSubWorkflowComponent.yaml") -doc_id_image = "quay.io/dataprep1/data-prep-lab/doc_id:0.2" -ededup_image = "quay.io/dataprep1/data-prep-lab/ededup:0.2.1" -fdedup_image = "quay.io/dataprep1/data-prep-lab/fdedup:0.2.1" +doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.2" +ededup_image = "quay.io/dataprep1/data-prep-kit/ededup:0.2.1" +fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.2.1" # Pipeline to invoke execution on remote resource @dsl.pipeline( diff --git a/kfp/transform_workflows/universal/doc_id/doc_id_wf.py b/kfp/transform_workflows/universal/doc_id/doc_id_wf.py index 0b5a781a8..3a1ff4b0d 100644 --- a/kfp/transform_workflows/universal/doc_id/doc_id_wf.py +++ b/kfp/transform_workflows/universal/doc_id/doc_id_wf.py @@ -20,13 +20,13 @@ ) -task_image = "quay.io/dataprep1/data-prep-lab/doc_id:0.2" +task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.2" # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
diff --git a/kfp/transform_workflows/universal/ededup/ededup_wf.py b/kfp/transform_workflows/universal/ededup/ededup_wf.py index 7b8afd331..781870d81 100644 --- a/kfp/transform_workflows/universal/ededup/ededup_wf.py +++ b/kfp/transform_workflows/universal/ededup/ededup_wf.py @@ -24,10 +24,10 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "ededup_transform.py" -task_image = "quay.io/dataprep1/data-prep-lab/ededup:0.2.1" +task_image = "quay.io/dataprep1/data-prep-kit/ededup:0.2.1" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image) diff --git a/kfp/transform_workflows/universal/fdedup/fdedup_wf.py b/kfp/transform_workflows/universal/fdedup/fdedup_wf.py index c7678abb4..d2a8d836c 100644 --- a/kfp/transform_workflows/universal/fdedup/fdedup_wf.py +++ b/kfp/transform_workflows/universal/fdedup/fdedup_wf.py @@ -24,10 +24,10 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "fdedup_transform.py" -task_image = "quay.io/dataprep1/data-prep-lab/fdedup:0.2.1" +task_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.2.1" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image) diff --git a/kfp/transform_workflows/universal/filter/filter_wf.py b/kfp/transform_workflows/universal/filter/filter_wf.py index d648c4892..6bc41e54a 100644 --- a/kfp/transform_workflows/universal/filter/filter_wf.py +++ b/kfp/transform_workflows/universal/filter/filter_wf.py @@ -26,10 +26,10 @@ EXEC_SCRIPT_NAME: str = "filter_transform.py" PREFIX: str = "" -task_image = "quay.io/dataprep1/data-prep-lab/filter:0.2.0" +task_image = "quay.io/dataprep1/data-prep-kit/filter:0.2.0" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, insted of creating a component we are creating it in place here. compute_exec_params_op = comp.func_to_container_op( diff --git a/kfp/transform_workflows/universal/noop/noop_wf.py b/kfp/transform_workflows/universal/noop/noop_wf.py index 38509fbfd..555bec547 100644 --- a/kfp/transform_workflows/universal/noop/noop_wf.py +++ b/kfp/transform_workflows/universal/noop/noop_wf.py @@ -20,13 +20,13 @@ ) -task_image = "quay.io/dataprep1/data-prep-lab/noop:0.7" +task_image = "quay.io/dataprep1/data-prep-kit/noop:0.7" # the name of the job script EXEC_SCRIPT_NAME: str = "noop_transform.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Here different tranforms might need different implementations. As # a result, instead of creating a component we are creating it in place here. 
diff --git a/kfp/transform_workflows/universal/tokenization/tokenization_wf.py b/kfp/transform_workflows/universal/tokenization/tokenization_wf.py index 8a2ff9e57..678a26349 100644 --- a/kfp/transform_workflows/universal/tokenization/tokenization_wf.py +++ b/kfp/transform_workflows/universal/tokenization/tokenization_wf.py @@ -23,10 +23,10 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "tokenization_transform.py" -task_image = "quay.io/dataprep1/data-prep-lab/tokenization:0.2.0" +task_image = "quay.io/dataprep1/data-prep-kit/tokenization:0.2.0" # components -base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8" # compute execution parameters. Use default one for now. compute_exec_params_op = comp.func_to_container_op( func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image diff --git a/kind/README.md b/kind/README.md index 3ff722361..4eaa31722 100644 --- a/kind/README.md +++ b/kind/README.md @@ -41,8 +41,8 @@ Run the following command to create the cluster: ```shell cd /tmp -git clone git@github.com:IBM/data-prep-lab.git -cd data-prep-lab +git clone git@github.com:IBM/data-prep-kit.git +cd data-prep-kit ROOT_DIR=$PWD/kind/ kind create cluster --name dataprep --config ${ROOT_DIR}/hack/kind-cluster-config.yaml ``` diff --git a/mkdocs.yml b/mkdocs.yml index bd6ded781..0c9b10105 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,7 +1,7 @@ site_name: "Data Prep LAB" docs_dir: . site_dir: ../site -repo_url: https://github.com/IBM/data-prep-lab +repo_url: https://github.com/IBM/data-prep-kit nav: - Home: README.md - Overview: data-processing-lib/doc/overview.md diff --git a/tools/ingest2parquet/README.md b/tools/ingest2parquet/README.md index 31ed6603f..6372de999 100644 --- a/tools/ingest2parquet/README.md +++ b/tools/ingest2parquet/README.md @@ -175,7 +175,7 @@ The metadata.json file contains following essential information regarding the pr ## Run using docker image ``` -docker run -it -v $(pwd)/test-data/input:/test-data/input -v $(pwd)/test-data/output:/test-data/output quay.io/dataprep1/data-prep-lab/ingest2parquet:0.1 sh -c "python ingest2parquet.py \ +docker run -it -v $(pwd)/test-data/input:/test-data/input -v $(pwd)/test-data/output:/test-data/output quay.io/dataprep1/data-prep-kit/ingest2parquet:0.1 sh -c "python ingest2parquet.py \ --detect_programming_lang True \ --snapshot github \ --domain code \ diff --git a/transforms/code/code_quality/requirements.txt b/transforms/code/code_quality/requirements.txt index e7eda1990..6917d5cd3 100644 --- a/transforms/code/code_quality/requirements.txt +++ b/transforms/code/code_quality/requirements.txt @@ -1,5 +1,5 @@ # transform runtime -#data-prep-lab==0.1.2 -# Transform requirements except data-prep-lab (see above) +#data-prep-kit==0.1.2 +# Transform requirements except data-prep-kit (see above) bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/src/code_quality_transform.py b/transforms/code/code_quality/src/code_quality_transform.py index 50823e2c7..1c5667f8e 100644 --- a/transforms/code/code_quality/src/code_quality_transform.py +++ b/transforms/code/code_quality/src/code_quality_transform.py @@ -13,7 +13,7 @@ # BigCode Dataset https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing # # Code specific heuristics like alpha numeric, char token ratio implementations & others are taken from CodeParrot and BigCode Dataset -# preprocessing scripts and modified according to 
data-prep-lab specific framework. +# preprocessing scripts and modified according to data-prep-kit specific framework. ################################################################################ diff --git a/transforms/code/code_quality/test-data/expected/metadata.json b/transforms/code/code_quality/test-data/expected/metadata.json index 448b88cf1..35a0b58a5 100644 --- a/transforms/code/code_quality/test-data/expected/metadata.json +++ b/transforms/code/code_quality/test-data/expected/metadata.json @@ -45,11 +45,11 @@ "table_processing": 0.1933128833770752 }, "source": { - "name": "/root/codellm/repos/external/forks/data-prep-lab/transforms/code/code_quality/test-data/input", + "name": "/root/codellm/repos/external/forks/data-prep-kit/transforms/code/code_quality/test-data/input", "type": "path" }, "target": { - "name": "/root/codellm/repos/external/forks/data-prep-lab/transforms/code/code_quality/output", + "name": "/root/codellm/repos/external/forks/data-prep-kit/transforms/code/code_quality/output", "type": "path" } } diff --git a/transforms/code/malware/requirements.txt b/transforms/code/malware/requirements.txt index d697dfe95..b2ef5599b 100644 --- a/transforms/code/malware/requirements.txt +++ b/transforms/code/malware/requirements.txt @@ -1,4 +1,4 @@ # transform runtime -#data-prep-lab==0.1.2 -# Transform requirements except data-prep-lab (see above) +#data-prep-kit==0.1.2 +# Transform requirements except data-prep-kit (see above) clamd==1.0.2 diff --git a/transforms/code/proglang_select/requirements.txt b/transforms/code/proglang_select/requirements.txt index 91eb4fdcb..4caaecdee 100644 --- a/transforms/code/proglang_select/requirements.txt +++ b/transforms/code/proglang_select/requirements.txt @@ -1,2 +1,2 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 diff --git a/transforms/universal/doc_id/requirements.txt b/transforms/universal/doc_id/requirements.txt index 91eb4fdcb..4caaecdee 100644 --- a/transforms/universal/doc_id/requirements.txt +++ b/transforms/universal/doc_id/requirements.txt @@ -1,2 +1,2 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 diff --git a/transforms/universal/ededup/requirements.txt b/transforms/universal/ededup/requirements.txt index 5c9b18810..85c6ad8a4 100644 --- a/transforms/universal/ededup/requirements.txt +++ b/transforms/universal/ededup/requirements.txt @@ -1,5 +1,5 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 # ededup mmh3 xxhash diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index b905d4585..69e0f27d9 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,5 +1,5 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 # fdedup mmh3 xxhash diff --git a/transforms/universal/filter/README.md b/transforms/universal/filter/README.md index 8c486ad98..55f73b045 100644 --- a/transforms/universal/filter/README.md +++ b/transforms/universal/filter/README.md @@ -180,7 +180,7 @@ You can run the [filter_local.py](src/filter_local.py) (python-only implementati ``` (venv) cma:src$ python filter_local_ray.py 12:48:01 INFO - Running locally -12:48:01 INFO - Using local configuration with: input_folder - /home/cma/de/data-prep-lab/transforms/universal/filtering/test-data/input output_folder - /home/cma/de/data-prep-lab/transforms/universal/filtering/output +12:48:01 INFO - Using local configuration with: input_folder - 
/home/cma/de/data-prep-kit/transforms/universal/filtering/test-data/input output_folder - /home/cma/de/data-prep-kit/transforms/universal/filtering/output 12:48:01 INFO - Not using data sets, checkpointing False, max files -1 12:48:01 INFO - number of workers 5 worker options {'num_cpus': 0.8} 12:48:01 INFO - pipeline id pipeline_id; number workers 5 diff --git a/transforms/universal/filter/requirements.txt b/transforms/universal/filter/requirements.txt index 7fa7f8588..e0566390e 100644 --- a/transforms/universal/filter/requirements.txt +++ b/transforms/universal/filter/requirements.txt @@ -1,4 +1,4 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 # filter duckdb==0.10.1 diff --git a/transforms/universal/noop/requirements.txt b/transforms/universal/noop/requirements.txt index 81df832ec..240029f53 100644 --- a/transforms/universal/noop/requirements.txt +++ b/transforms/universal/noop/requirements.txt @@ -1,2 +1,2 @@ # transform runtime -#data-prep-lab==0.1.2 \ No newline at end of file +#data-prep-kit==0.1.2 \ No newline at end of file diff --git a/transforms/universal/tokenization/requirements.txt b/transforms/universal/tokenization/requirements.txt index 0a86ba56d..6e36f32fb 100644 --- a/transforms/universal/tokenization/requirements.txt +++ b/transforms/universal/tokenization/requirements.txt @@ -1,4 +1,4 @@ # transform runtime -#data-prep-lab==0.1.2 +#data-prep-kit==0.1.2 # for downloading + loading tokenizers: transformers==4.38.0 From 98536a0b9a8196e4118942250c7aaa91fc53f69b Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 16:39:37 -0400 Subject: [PATCH 02/10] reset library and image version numbers to accommodate rename Signed-off-by: David Wood --- .make.versions | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/.make.versions b/.make.versions index ff2eee01c..2c79a9684 100644 --- a/.make.versions +++ b/.make.versions @@ -4,24 +4,24 @@ ################################################################################ # Data prep lab wheel version -DPK_LIB_VERSION=0.1.6 -DPK_LIB_KFP_VERSION=0.1.8 +DPK_LIB_VERSION=0.0.1 +DPK_LIB_KFP_VERSION=0.0.1 # Begin transform versions/tags -BLOCKLIST_VERSION=0.2.0 -DOC_ID_VERSION=0.2 -EDEDUP_VERSION=0.2.1 -FDEDUP_VERSION=0.2.1 -FILTER_VERSION=0.2.0 -NOOP_VERSION=0.7 -RESIZE_VERSION=0.2 -LANG_ID_VERSION=0.2 -TOKENIZER_VERSION=0.2.0 -MALWARE_VERSION=0.3 -PROGLANG_SELECT_VERSION=0.2.0 -CODE_QUALITY_VERSION=0.2.0 -DOC_QUALITY_VERSION=0.2.0 -INGEST_TO_PARQUET_VERSION=0.1.0 +BLOCKLIST_VERSION=0.3.0 +DOC_ID_VERSION=0.3.0 +EDEDUP_VERSION=0.3.0 +FDEDUP_VERSION=0.3.0 +FILTER_VERSION=0.3.0 +NOOP_VERSION=0.8.0 +RESIZE_VERSION=0.3.0 +LANG_ID_VERSION=0.3.0 +TOKENIZER_VERSION=0.3.0 +MALWARE_VERSION=0.4.0 +PROGLANG_SELECT_VERSION=0.3.0 +CODE_QUALITY_VERSION=0.3.0 +DOC_QUALITY_VERSION=0.3.0 +INGEST_TO_PARQUET_VERSION=0.3.0 -KFP_DOCKER_VERSION=0.0.8 +KFP_DOCKER_VERSION=0.1.0 From 97f7cbb81da18bb93650a9fb05a147decef95770 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 18:50:44 -0400 Subject: [PATCH 03/10] Use data-prep-toolkit instead of data-prep-kit as pypi package Signed-off-by: David Wood --- README.md | 12 ++++++------ data-processing-lib/pyproject.toml | 6 +++--- kfp/kfp_ray_components/Makefile | 2 +- kfp/kfp_ray_components/requirements.txt | 2 +- kfp/kfp_support_lib/Makefile | 2 +- kfp/kfp_support_lib/pyproject.toml | 8 ++++---- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 97fe1d295..43d476956 100644 --- 
a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -Data Prep Lab +Data Prep Kit
@@ -11,7 +11,7 @@ --- -Data Prep Lab is a community project to democratize and accelerate unstructured data preparation for LLM app developers. +Data Prep Kit is a community project to democratize and accelerate unstructured data preparation for LLM app developers. With the explosive growth of LLM-enabled use cases, developers are faced with the enormous challenge of preparing use case-specific unstructured data to fine-tune or instruct-tune the LLMs. As the variety of use cases grows, so does the need to support: @@ -19,7 +19,7 @@ As the variety of use cases grows, so does the need to support: - New ways of transforming the data to optimize the performance of the resulting LLMs for each specific use case. - Large variety in the scale of data to be processed, from laptop-scale to datacenter-scale -Data Prep Lab offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities. +Data Prep Kit offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities. The goal is to offer high-level APIs for developers to quickly get started in working with their data, without needing expertise in the underlying runtimes and frameworks. ## 📝 Table of Contents @@ -30,12 +30,12 @@ The goal is to offer high-level APIs for developers to quickly get started in wo - [Acknowledgments](#acknowledgement) ## 📖 About -Data Prep Lab is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning. -Data Prep Lab contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case. +Data Prep Kit is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning. +Data Prep Kit contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case. These modules have been tested in producing pre-training datasets for the [Granite](https://huggingface.co/instructlab/granite-7b-lab) open models. The modules are built on common frameworks (for Spark and Ray), called the *data processing library* that allows the developers to build new custom modules that readily scale across a variety of runtimes. -Eventually, Data Prep Lab will offer consistent APIs and configurations across the following underlying runtimes. +Eventually, Data Prep Kit will offer consistent APIs and configurations across the following underlying runtimes. 1. Python runtime 2. 
Ray runtime (local and distributed) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 0851a78c4..ea520e881 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,8 +1,8 @@ [project] -name = "data_prep_kit" -version = "0.1.6" +name = "data_prep_toolkit" +version = "0.0.1" requires-python = ">=3.10" -description = "Data Preparation Laboratory Library" +description = "Data Preparation Toolkit Library" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 50637bb57..55baaa7c6 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -16,7 +16,7 @@ IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION} # Create the docker image making sure the preloaded models are available to copy into the image .kfp_comp.image:: Dockerfile requirements.txt $(call check_defined, DOCKER_HOSTNAME) - sed -i.back "s/data-prep-kit-kfp==[0-9].*/data-prep-kit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt + sed -i.back "s/data-prep-toolkit-kfp==[0-9].*/data-prep-toolkit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt @# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt $(DOCKER) build -t ${IMG} \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ diff --git a/kfp/kfp_ray_components/requirements.txt b/kfp/kfp_ray_components/requirements.txt index 96cfc35f0..4bfd495d0 100644 --- a/kfp/kfp_ray_components/requirements.txt +++ b/kfp/kfp_ray_components/requirements.txt @@ -1,4 +1,4 @@ -data-prep-kit-kfp==0.1.7 +data-prep-toolkit-kfp==0.1.7 # for fdedup scipy==1.13.0 diff --git a/kfp/kfp_support_lib/Makefile b/kfp/kfp_support_lib/Makefile index f8b422110..8e5590237 100644 --- a/kfp/kfp_support_lib/Makefile +++ b/kfp/kfp_support_lib/Makefile @@ -25,7 +25,7 @@ clean:: update-toml:: .check-env @# Help: Copy the Makefile distribution version into the pyproject.toml sed -i.back 's/^version[ ]*=.*/version = "'${DPK_LIB_KFP_VERSION}'"/' pyproject.toml - sed -i.back 's/data-prep-kit==[0-9].*/data-prep-kit==${DPK_LIB_VERSION}",/' pyproject.toml + sed -i.back 's/data-prep-toolkit==[0-9].*/data-prep-toolkit==${DPK_LIB_VERSION}",/' pyproject.toml sed -i.back 's/kfp==[0-9].*/kfp==${KFP}",/' pyproject.toml build:: update-toml venv diff --git a/kfp/kfp_support_lib/pyproject.toml b/kfp/kfp_support_lib/pyproject.toml index 70fbc8024..771b30dc4 100644 --- a/kfp/kfp_support_lib/pyproject.toml +++ b/kfp/kfp_support_lib/pyproject.toml @@ -1,6 +1,6 @@ [project] -name = "data_prep_kit_kfp" -version = "0.1.8" +name = "data_prep_toolkit_kfp" +version = "0.0.1" requires-python = ">=3.10" description = "Data Preparation Laboratory Library. 
KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==1.8.22", "requests", - "data-prep-kit==0.1.6", + "data-prep-toolkit==0.0.1", ] [build-system] @@ -43,4 +43,4 @@ addopts = "--cov --cov-report term-missing --cov-fail-under 10" markers = ["unit: unit tests", "integration: integration tests"] [tool.coverage.run] -include = ["src/*"] \ No newline at end of file +include = ["src/*"] From e2c6b9336e6c29be1f3b1e5bf365ccf0331e3076 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 19:38:11 -0400 Subject: [PATCH 04/10] fix version of data-prep-toolkit-kfp in requirements.txt Signed-off-by: David Wood --- kfp/kfp_ray_components/requirements.txt | 2 +- kfp/kfp_support_lib/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kfp/kfp_ray_components/requirements.txt b/kfp/kfp_ray_components/requirements.txt index 4bfd495d0..76d970e68 100644 --- a/kfp/kfp_ray_components/requirements.txt +++ b/kfp/kfp_ray_components/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit-kfp==0.1.7 +data-prep-toolkit-kfp==0.0.1 # for fdedup scipy==1.13.0 diff --git a/kfp/kfp_support_lib/pyproject.toml b/kfp/kfp_support_lib/pyproject.toml index 771b30dc4..7f4637a50 100644 --- a/kfp/kfp_support_lib/pyproject.toml +++ b/kfp/kfp_support_lib/pyproject.toml @@ -2,7 +2,7 @@ name = "data_prep_toolkit_kfp" version = "0.0.1" requires-python = ">=3.10" -description = "Data Preparation Laboratory Library. KFP support" +description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ From c3dab82d34c0899b6fb8ccdc79024fc6324a2404 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 19:46:20 -0400 Subject: [PATCH 05/10] Fix docker file wheel reference to data-prep-[tool]kit Signed-off-by: David Wood --- kfp/kfp_ray_components/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 3573ca558..9aa9a3ef6 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -5,8 +5,8 @@ ARG GIT_COMMIT LABEL build-date=$BUILD_DATE LABEL git-commit=$GIT_COMMIT -LABEL data-prep-kit=${DPK_LIB_VERSION} -LABEL data-prep-kit-kfp=${DPK_LIB_KFP_VERSION} +LABEL data-prep-toolkit=${DPK_LIB_VERSION} +LABEL data-prep-toolkit-kfp=${DPK_LIB_KFP_VERSION} # install libraries COPY requirements.txt requirements.txt From 9024c132412b71fa8d8fdc6bb67fb9606eb9b301 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 20:10:44 -0400 Subject: [PATCH 06/10] fix .make.transform_workflows to use data-prep-toolkit Signed-off-by: David Wood --- kfp/transform_workflows/.make.transforms_workflows | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kfp/transform_workflows/.make.transforms_workflows b/kfp/transform_workflows/.make.transforms_workflows index e5794cc8d..2bc3593ad 100644 --- a/kfp/transform_workflows/.make.transforms_workflows +++ b/kfp/transform_workflows/.make.transforms_workflows @@ -49,7 +49,7 @@ ${VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/requirements.env ${ $(PYTHON) -m venv ${REPOROOT}/kfp/transform_workflows/venv . 
${VENV_ACTIVATE}; \ pip install kfp==${KFP} --extra-index-url https://pypi.org/simple; \ - pip install data_prep_kit_kfp==${DPK_LIB_KFP_VERSION} + pip install data_prep_toolkit_kfp==${DPK_LIB_KFP_VERSION} .PHONY: .transforms_workflows.upload-pipeline .transforms_workflows.upload-pipeline: From 070959873a8c6d00a4cc1d91f9d97e2d1e741b24 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 8 May 2024 20:12:13 -0400 Subject: [PATCH 07/10] have .make.defaults uninstall data-prep-toolkit Signed-off-by: David Wood --- .make.defaults | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.make.defaults b/.make.defaults index 311f6a13a..4ab77489f 100644 --- a/.make.defaults +++ b/.make.defaults @@ -186,7 +186,7 @@ __check_defined = \ @echo Installing source from data processing library for venv source venv/bin/activate; \ pip install pytest; \ - pip uninstall -y data-prep-kit; \ + pip uninstall -y data-prep-toolkit; \ if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \ extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \ fi; \ From 360a1acadde2a6ffb2406ba48142779b104b81ae Mon Sep 17 00:00:00 2001 From: David Wood Date: Thu, 9 May 2024 07:44:33 -0400 Subject: [PATCH 08/10] Update kfp docker image versions in yamls Signed-off-by: David Wood --- kfp/kfp_ray_components/Makefile | 9 ++++++--- kfp/kfp_ray_components/cleanupRayComponent.yaml | 2 +- kfp/kfp_ray_components/createRayComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- kfp/kfp_ray_components/executeSubWorkflowComponent.yaml | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile index 55baaa7c6..7d139502c 100644 --- a/kfp/kfp_ray_components/Makefile +++ b/kfp/kfp_ray_components/Makefile @@ -15,17 +15,18 @@ IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION} # Create the docker image making sure the preloaded models are available to copy into the image .kfp_comp.image:: Dockerfile requirements.txt + @# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt $(call check_defined, DOCKER_HOSTNAME) sed -i.back "s/data-prep-toolkit-kfp==[0-9].*/data-prep-toolkit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt - @# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt $(DOCKER) build -t ${IMG} \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . --no-cache image:: .kfp_comp.image - make reconcile-requirements + $(MAKE) reconcile-requirements reconcile-requirements:: + @# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION) sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" createRayComponent.yaml sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" cleanupRayComponent.yaml @@ -33,16 +34,18 @@ reconcile-requirements:: sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml load-image: - @# Help: Load the image to the kind cluster created with make setup. + @# Help: Load the image to the kind cluster created with make setup. 
kind load docker-image $(IMG) --name=$(KIND_CLUSTER_NAME) build:: image publish:: + @# Help: Push $(IMG) to the registry $(DOCKER) push ${IMG} test:: clean:: + @# Help: Remove $(IMG) $(DOCKER) image rm ${IMG} || true -rm makeenv diff --git a/kfp/kfp_ray_components/cleanupRayComponent.yaml b/kfp/kfp_ray_components/cleanupRayComponent.yaml index 9c0e55725..396de6ab9 100644 --- a/kfp/kfp_ray_components/cleanupRayComponent.yaml +++ b/kfp/kfp_ray_components/cleanupRayComponent.yaml @@ -8,7 +8,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/createRayComponent.yaml b/kfp/kfp_ray_components/createRayComponent.yaml index b88ab1aa9..243dc9019 100644 --- a/kfp/kfp_ray_components/createRayComponent.yaml +++ b/kfp/kfp_ray_components/createRayComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 79186e62d..2d1e77200 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index bc16d17df..f716595ce 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index b52a98eaf..98e2981c3 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -32,7 +32,7 @@ outputs: implementation: container: - image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8 + image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0 # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. 
     # Here we use the "flow syntax" - comma-separated strings inside square brackets.

From 00c08bf88ba7f7b37261be7805eedde07c646d81 Mon Sep 17 00:00:00 2001
From: David Wood
Date: Thu, 9 May 2024 08:04:55 -0400
Subject: [PATCH 09/10] Update image versions in all _wf.py files

Signed-off-by: David Wood
---
 kfp/transform_workflows/.make.transforms_workflows     | 2 +-
 .../code/code_quality/code_quality_wf.py               | 4 ++--
 kfp/transform_workflows/code/malware/malware_wf.py     | 4 ++--
 .../code/proglang_select/proglang_select_wf.py         | 4 ++--
 .../superworkflows/superworkflow_dedups_sample_wf.py   | 6 +++---
 kfp/transform_workflows/universal/doc_id/doc_id_wf.py  | 4 ++--
 kfp/transform_workflows/universal/ededup/ededup_wf.py  | 4 ++--
 kfp/transform_workflows/universal/fdedup/fdedup_wf.py  | 4 ++--
 kfp/transform_workflows/universal/filter/filter_wf.py  | 4 ++--
 kfp/transform_workflows/universal/noop/noop_wf.py      | 4 ++--
 .../universal/tokenization/tokenization_wf.py          | 2 +-
 11 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/kfp/transform_workflows/.make.transforms_workflows b/kfp/transform_workflows/.make.transforms_workflows
index 2bc3593ad..62dbc7cff 100644
--- a/kfp/transform_workflows/.make.transforms_workflows
+++ b/kfp/transform_workflows/.make.transforms_workflows
@@ -18,7 +18,7 @@ endef
 		[[ $$line == *#* ]] && continue; \
 		export DOCKER_IMAGE_NAME=$$(echo $$line |cut -d "=" -f 1 |sed "s/_VERSION//" |tr '[:upper:]' '[:lower:]'); \
 		export DOCKER_IMAGE_VERSION=$$(echo $$line |cut -d "=" -f 2); \
-		sed -i.back "s/data\-prep\-lab\/$$DOCKER_IMAGE_NAME:.*/data\-prep\-lab\/$$DOCKER_IMAGE_NAME:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\
+		sed -i.back "s/data-prep-kit\/$$DOCKER_IMAGE_NAME:.*/data-prep-kit\/$$DOCKER_IMAGE_NAME:$$DOCKER_IMAGE_VERSION\"/" $$PIPELINE_FILE ;\
 	done < ${REPOROOT}/.make.versions
 	sed -i.back "s/kfp-data-processing:.*/kfp-data-processing:${KFP_DOCKER_VERSION}\"/" ${PIPELINE_FILE}
diff --git a/kfp/transform_workflows/code/code_quality/code_quality_wf.py b/kfp/transform_workflows/code/code_quality/code_quality_wf.py
index 8017834d6..5500f8398 100644
--- a/kfp/transform_workflows/code/code_quality/code_quality_wf.py
+++ b/kfp/transform_workflows/code/code_quality/code_quality_wf.py
@@ -27,10 +27,10 @@
 EXEC_SCRIPT_NAME: str = "cq_transform.py"
 PREFIX: str = ""
 
-task_image = "quay.io/dataprep1/data-prep-kit/code_quality:0.2.0"
+task_image = "quay.io/dataprep1/data-prep-kit/code_quality:0.3.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, insted of creating a component we are creating it in place here.
 compute_exec_params_op = comp.func_to_container_op(
diff --git a/kfp/transform_workflows/code/malware/malware_wf.py b/kfp/transform_workflows/code/malware/malware_wf.py
index 792a09c37..4f6cae3c4 100644
--- a/kfp/transform_workflows/code/malware/malware_wf.py
+++ b/kfp/transform_workflows/code/malware/malware_wf.py
@@ -23,10 +23,10 @@
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "malware_transform.py"
 
-task_image = "quay.io/dataprep1/data-prep-kit/malware:0.3"
+task_image = "quay.io/dataprep1/data-prep-kit/malware:0.4.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, insted of creating a component we are creating it in place here.
diff --git a/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py b/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py
index 95cc97c7f..daec419bf 100644
--- a/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py
+++ b/kfp/transform_workflows/code/proglang_select/proglang_select_wf.py
@@ -23,10 +23,10 @@
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "proglang_select_transform.py"
 
-task_image = "quay.io/dataprep1/data-prep-kit/proglang_select:0.2.0"
+task_image = "quay.io/dataprep1/data-prep-kit/proglang_select:0.3.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, insted of creating a component we are creating it in place here.
diff --git a/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py b/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py
index afda11de9..53612b348 100644
--- a/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py
+++ b/kfp/transform_workflows/superworkflows/superworkflow_dedups_sample_wf.py
@@ -10,9 +10,9 @@
 run_exact_dedup_op = comp.load_component_from_file("../../kfp_ray_components/executeSubWorkflowComponent.yaml")
 run_fuzzy_dedup_op = comp.load_component_from_file("../../kfp_ray_components/executeSubWorkflowComponent.yaml")
 
-doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.2"
-ededup_image = "quay.io/dataprep1/data-prep-kit/ededup:0.2.1"
-fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.2.1"
+doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.3.0"
+ededup_image = "quay.io/dataprep1/data-prep-kit/ededup:0.3.0"
+fdedup_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.3.0"
 
 # Pipeline to invoke execution on remote resource
 @dsl.pipeline(
diff --git a/kfp/transform_workflows/universal/doc_id/doc_id_wf.py b/kfp/transform_workflows/universal/doc_id/doc_id_wf.py
index 3a1ff4b0d..bf11c34d4 100644
--- a/kfp/transform_workflows/universal/doc_id/doc_id_wf.py
+++ b/kfp/transform_workflows/universal/doc_id/doc_id_wf.py
@@ -20,13 +20,13 @@
 )
 
 
-task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.2"
+task_image = "quay.io/dataprep1/data-prep-kit/doc_id:0.3.0"
 
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "doc_id_transform.py"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
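
The comment repeated in the hunks above and below ("instead of creating a component we are creating it in place") refers to a KFP v1 idiom: a plain Python function is wrapped into a containerized op at pipeline-build time, so no separate component YAML is needed. Below is a minimal sketch of that idiom, assuming the KFP v1 SDK; the function name, parameters, and body are illustrative, not taken from the repository:

    import kfp.components as comp

    # Base image in which the wrapped function will run (version pinned in this patch).
    base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"

    def compute_exec_params(worker_options: str, actor_options: str) -> str:
        # Hypothetical computation; each transform may substitute its own logic,
        # which is why the workflows build this op in place instead of sharing one YAML.
        import json  # imports must live inside the function for func_to_container_op
        return json.dumps({"workers": worker_options, "actors": actor_options})

    # Wrap the function as a pipeline op that executes in the pinned base image.
    compute_exec_params_op = comp.func_to_container_op(func=compute_exec_params, base_image=base_kfp_image)
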
diff --git a/kfp/transform_workflows/universal/ededup/ededup_wf.py b/kfp/transform_workflows/universal/ededup/ededup_wf.py
index 781870d81..44b753d41 100644
--- a/kfp/transform_workflows/universal/ededup/ededup_wf.py
+++ b/kfp/transform_workflows/universal/ededup/ededup_wf.py
@@ -24,10 +24,10 @@
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "ededup_transform.py"
 
-task_image = "quay.io/dataprep1/data-prep-kit/ededup:0.2.1"
+task_image = "quay.io/dataprep1/data-prep-kit/ededup:0.3.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 
 # compute execution parameters
 compute_exec_params_op = comp.func_to_container_op(func=ededup_compute_execution_params, base_image=base_kfp_image)
diff --git a/kfp/transform_workflows/universal/fdedup/fdedup_wf.py b/kfp/transform_workflows/universal/fdedup/fdedup_wf.py
index d2a8d836c..5abb4f56d 100644
--- a/kfp/transform_workflows/universal/fdedup/fdedup_wf.py
+++ b/kfp/transform_workflows/universal/fdedup/fdedup_wf.py
@@ -24,10 +24,10 @@
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "fdedup_transform.py"
 
-task_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.2.1"
+task_image = "quay.io/dataprep1/data-prep-kit/fdedup:0.3.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 
 # compute execution parameters
 compute_exec_params_op = comp.func_to_container_op(func=fdedup_compute_execution_params, base_image=base_kfp_image)
diff --git a/kfp/transform_workflows/universal/filter/filter_wf.py b/kfp/transform_workflows/universal/filter/filter_wf.py
index 6bc41e54a..bb4de690a 100644
--- a/kfp/transform_workflows/universal/filter/filter_wf.py
+++ b/kfp/transform_workflows/universal/filter/filter_wf.py
@@ -26,10 +26,10 @@
 EXEC_SCRIPT_NAME: str = "filter_transform.py"
 PREFIX: str = ""
 
-task_image = "quay.io/dataprep1/data-prep-kit/filter:0.2.0"
+task_image = "quay.io/dataprep1/data-prep-kit/filter:0.3.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, insted of creating a component we are creating it in place here.
 compute_exec_params_op = comp.func_to_container_op(
diff --git a/kfp/transform_workflows/universal/noop/noop_wf.py b/kfp/transform_workflows/universal/noop/noop_wf.py
index 555bec547..c3a3d2a84 100644
--- a/kfp/transform_workflows/universal/noop/noop_wf.py
+++ b/kfp/transform_workflows/universal/noop/noop_wf.py
@@ -20,13 +20,13 @@
 )
 
 
-task_image = "quay.io/dataprep1/data-prep-kit/noop:0.7"
+task_image = "quay.io/dataprep1/data-prep-kit/noop:0.8.0"
 
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "noop_transform.py"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Here different tranforms might need different implementations. As
 # a result, instead of creating a component we are creating it in place here.
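
For the YAML-defined components, by contrast, the pinned image travels with the component file itself, which is why the version bumps in patch 08 touch the component YAMLs rather than the workflow code. A sketch of how such a component is consumed, following the load_component_from_file calls visible in superworkflow_dedups_sample_wf.py (KFP v1 SDK assumed):

    import kfp.components as comp

    # The op's signature comes from the inputs: section of the component YAML,
    # and the container image it runs in (kfp-data-processing:0.1.0 after this
    # series) is read from the implementation: section at load time.
    run_exact_dedup_op = comp.load_component_from_file(
        "../../kfp_ray_components/executeSubWorkflowComponent.yaml"
    )
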
diff --git a/kfp/transform_workflows/universal/tokenization/tokenization_wf.py b/kfp/transform_workflows/universal/tokenization/tokenization_wf.py
index 678a26349..5d8922120 100644
--- a/kfp/transform_workflows/universal/tokenization/tokenization_wf.py
+++ b/kfp/transform_workflows/universal/tokenization/tokenization_wf.py
@@ -26,7 +26,7 @@
 task_image = "quay.io/dataprep1/data-prep-kit/tokenization:0.2.0"
 
 # components
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.8"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
 # compute execution parameters. Use default one for now.
 compute_exec_params_op = comp.func_to_container_op(
     func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image

From cb005d18cdc5f38d7f90876a73ff2384c860d5ce Mon Sep 17 00:00:00 2001
From: David Wood
Date: Thu, 9 May 2024 09:38:23 -0400
Subject: [PATCH 10/10] add data-prep-kit to top level DOCKER_NAMESPACE

Signed-off-by: David Wood
---
 .make.defaults                  | 4 ++--
 kfp/kfp_ray_components/Makefile | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.make.defaults b/.make.defaults
index 4ab77489f..6d0cafeee 100644
--- a/.make.defaults
+++ b/.make.defaults
@@ -36,7 +36,7 @@ DOCKER_FILE?=Dockerfile
 #DOCKER_NAME?=xyzzy	# Must be defined by the includeing makefile
 DOCKER?=docker
 DOCKER_HOSTNAME?=quay.io
-DOCKER_NAMESPACE ?= dataprep1
+DOCKER_NAMESPACE ?= dataprep1/data-prep-kit
 DOCKER_REGISTRY_USER?=$(DPK_DOCKER_REGISTRY_USER)
 DOCKER_REGISTRY_KEY?=$(DPK_DOCKER_REGISTRY_KEY)
 DOCKER_REGISTRY_ENDPOINT?=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)
@@ -231,7 +231,7 @@ __check_defined = \
 .PHONY: .defaults.publish
 .defaults.publish::
 	@# Help: Publish the $(DOCKER_IMAGE) to $(DOCKER_HOSTNAME) container registry
-	$(DOCKER) logout $(DOCKER_HOSTNAME)
+	-$(DOCKER) logout $(DOCKER_HOSTNAME)
 	$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
 	$(DOCKER) push $(DOCKER_IMAGE)
 
diff --git a/kfp/kfp_ray_components/Makefile b/kfp/kfp_ray_components/Makefile
index 7d139502c..956e994a4 100644
--- a/kfp/kfp_ray_components/Makefile
+++ b/kfp/kfp_ray_components/Makefile
@@ -10,7 +10,7 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed '
 include makeenv
 DOCKER_FILE=Dockerfile
-DOCKER_NAME=data-prep-kit/kfp-data-processing
+DOCKER_NAME=kfp-data-processing
 IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION}
 
 # Create the docker image making sure the preloaded models are available to copy into the image
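
The net effect of this last patch is that DOCKER_NAME no longer embeds the repository path; the data-prep-kit segment moves into DOCKER_NAMESPACE, and the composed image reference is unchanged. A quick check of that composition, assuming KFP_DOCKER_VERSION resolves to the 0.1.0 tag pinned in the component YAMLs (the actual value is set in kfp/requirements.env, which is not shown here):

    # Mirrors IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION}
    docker_hostname = "quay.io"
    docker_namespace = "dataprep1/data-prep-kit"  # now carries the data-prep-kit segment
    docker_name = "kfp-data-processing"           # no longer carries data-prep-kit/
    kfp_docker_version = "0.1.0"                  # assumed; defined in kfp/requirements.env

    img = f"{docker_hostname}/{docker_namespace}/{docker_name}:{kfp_docker_version}"
    assert img == "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0"
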