Change name to data prep kit #91

Merged: 10 commits, May 9, 2024
12 changes: 6 additions & 6 deletions .make.defaults
@@ -36,9 +36,9 @@ DOCKER_FILE?=Dockerfile
#DOCKER_NAME?=xyzzy # Must be defined by the includeing makefile
DOCKER?=docker
DOCKER_HOSTNAME?=quay.io
-DOCKER_NAMESPACE ?= dataprep1
-DOCKER_REGISTRY_USER?=$(DPL_DOCKER_REGISTRY_USER)
-DOCKER_REGISTRY_KEY?=$(DPL_DOCKER_REGISTRY_KEY)
+DOCKER_NAMESPACE ?= dataprep1/data-prep-kit
+DOCKER_REGISTRY_USER?=$(DPK_DOCKER_REGISTRY_USER)
+DOCKER_REGISTRY_KEY?=$(DPK_DOCKER_REGISTRY_KEY)
DOCKER_REGISTRY_ENDPOINT?=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)
DOCKER_IMAGE?=${DOCKER_REGISTRY_ENDPOINT}/$(DOCKER_NAME):$(DOCKER_IMAGE_VERSION)
include $(REPOROOT)/.make.versions
@@ -186,7 +186,7 @@ __check_defined = \
@echo Installing source from data processing library for venv
source venv/bin/activate; \
pip install pytest; \
-pip uninstall -y data-prep-lab; \
+pip uninstall -y data-prep-toolkit; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
@@ -231,7 +231,7 @@ __check_defined = \
.PHONY: .defaults.publish
.defaults.publish::
@# Help: Publish the $(DOCKER_IMAGE) to $(DOCKER_HOSTNAME) container registry
-$(DOCKER) logout $(DOCKER_HOSTNAME)
+-$(DOCKER) logout $(DOCKER_HOSTNAME)
$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
$(DOCKER) push $(DOCKER_IMAGE)
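Note: the leading `-` added to the `docker logout` recipe line tells make to ignore that command's exit status, so publishing no longer aborts when there is no active login session to log out of.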

@@ -265,7 +265,7 @@ __check_defined = \
$(MAKE) CHECK_RUNNABLE=minio .defaults.check.installed
$(MAKE) CHECK_RUNNABLE=mc .defaults.check.installed

-MINIO_DIR=/tmp/data-prep-lab
+MINIO_DIR=/tmp/data-prep-kit
MINIO_ALIAS=local
# These are the credentials used by samples.
MINIO_ADMIN_USER=localminioaccesskey
34 changes: 17 additions & 17 deletions .make.versions
@@ -4,24 +4,24 @@
################################################################################

# Data prep lab wheel version
-DPL_LIB_VERSION=0.1.6
-DPL_LIB_KFP_VERSION=0.1.8
+DPK_LIB_VERSION=0.0.1
+DPK_LIB_KFP_VERSION=0.0.1

# Begin transform versions/tags
-BLOCKLIST_VERSION=0.2.0
-DOC_ID_VERSION=0.2
-EDEDUP_VERSION=0.2.1
-FDEDUP_VERSION=0.2.1
-FILTER_VERSION=0.2.0
-NOOP_VERSION=0.7
-RESIZE_VERSION=0.2
-LANG_ID_VERSION=0.2
-TOKENIZER_VERSION=0.2.0
-MALWARE_VERSION=0.3
-PROGLANG_SELECT_VERSION=0.2.0
-CODE_QUALITY_VERSION=0.2.0
-DOC_QUALITY_VERSION=0.2.0
-INGEST_TO_PARQUET_VERSION=0.1.0
+BLOCKLIST_VERSION=0.3.0
+DOC_ID_VERSION=0.3.0
+EDEDUP_VERSION=0.3.0
+FDEDUP_VERSION=0.3.0
+FILTER_VERSION=0.3.0
+NOOP_VERSION=0.8.0
+RESIZE_VERSION=0.3.0
+LANG_ID_VERSION=0.3.0
+TOKENIZER_VERSION=0.3.0
+MALWARE_VERSION=0.4.0
+PROGLANG_SELECT_VERSION=0.3.0
+CODE_QUALITY_VERSION=0.3.0
+DOC_QUALITY_VERSION=0.3.0
+INGEST_TO_PARQUET_VERSION=0.3.0

Collaborator comment: Why is this 0.3.0? Should we align with the library version and make it 0.1.0?

-KFP_DOCKER_VERSION=0.0.8
+KFP_DOCKER_VERSION=0.1.0

4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -72,8 +72,8 @@ git commit -s
Please install Python 3.10 or 3.11, then

```
-git clone git@github.ibm.com:IBM/data-prep-lab.git
-cd data-prep-lab
+git clone git@github.ibm.com:IBM/data-prep-kit.git
+cd data-prep-kit
pip install pre-commit
pip install twine
pre-commit install
2 changes: 1 addition & 1 deletion Makefile
@@ -44,7 +44,7 @@ test::
@$(MAKE) RULE=$@ .recurse

lib-release:
-@# Help: Publish data-prep-lab $(DPL_LIB_VERSION) and data-prep-lab-kfp $(DPL_LIB_KFP_VERSION) libraries to pypi
+@# Help: Publish data-prep-kit $(DPK_LIB_VERSION) and data-prep-kit-kfp $(DPK_LIB_KFP_VERSION) libraries to pypi
@$(MAKE) -C data-processing-lib build publish
@$(MAKE) -C kfp/kfp_support_lib build publish
@echo ""
20 changes: 10 additions & 10 deletions README.md
@@ -1,25 +1,25 @@


-<h1 align="center">Data Prep Lab </h1>
+<h1 align="center">Data Prep Kit</h1>

<div align="center">

<? [![Status](https://img.shields.io/badge/status-active-success.svg)]() ?>
-<? [![GitHub Issues](https://img.shields.io/github/issues/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-lab/issues) ?>
-<? [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-lab/pulls) ?>
+<? [![GitHub Issues](https://img.shields.io/github/issues/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-kit/issues) ?>
+<? [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-kit/pulls) ?>
</div>

---

-Data Prep Lab is a community project to democratize and accelerate unstructured data preparation for LLM app developers.
+Data Prep Kit is a community project to democratize and accelerate unstructured data preparation for LLM app developers.
With the explosive growth of LLM-enabled use cases, developers are faced with the enormous challenge of preparing use case-specific unstructured data to fine-tune or instruct-tune the LLMs.
As the variety of use cases grows, so does the need to support:

- New modalities of data (code, language, speech, visual)
- New ways of transforming the data to optimize the performance of the resulting LLMs for each specific use case.
- Large variety in the scale of data to be processed, from laptop-scale to datacenter-scale

-Data Prep Lab offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities.
+Data Prep Kit offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities.
The goal is to offer high-level APIs for developers to quickly get started in working with their data, without needing expertise in the underlying runtimes and frameworks.

## 📝 Table of Contents
@@ -30,12 +30,12 @@ The goal is to offer high-level APIs for developers to quickly get started in working
- [Acknowledgments](#acknowledgement)

## &#x1F4D6; About <a name = "about"></a>
-Data Prep Lab is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning.
-Data Prep Lab contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case.
+Data Prep Kit is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning.
+Data Prep Kit contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case.
These modules have been tested in producing pre-training datasets for the [Granite](https://huggingface.co/instructlab/granite-7b-lab) open models.

The modules are built on common frameworks (for Spark and Ray), called the *data processing library* that allows the developers to build new custom modules that readily scale across a variety of runtimes.
-Eventually, Data Prep Lab will offer consistent APIs and configurations across the following underlying runtimes.
+Eventually, Data Prep Kit will offer consistent APIs and configurations across the following underlying runtimes.

1. Python runtime
2. Ray runtime (local and distributed)
@@ -129,8 +129,8 @@ Docker/Podman
### Installation Steps

```shell
-git clone git@github.com:IBM/data-prep-lab.git
-cd data-prep-lab
+git clone git@github.com:IBM/data-prep-kit.git
+cd data-prep-kit
pip install pre-commit
pip install twine
pre-commit install
4 changes: 2 additions & 2 deletions data-processing-lib/Makefile
@@ -3,7 +3,7 @@ REPOROOT=../
include ../.make.defaults
include ../.make.versions

-TAG := "v${DPL_LIB_VERSION}"
+TAG := "v${DPK_LIB_VERSION}"


clean::
@@ -16,7 +16,7 @@ clean::

update-toml:: .check-env
@# Help: Copy the Makefile distribution version into the pyproject.toml
-sed -e 's/^version[ ]*=.*/version = "'${DPL_LIB_VERSION}'"/' pyproject.toml > tt.toml
+sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
mv tt.toml pyproject.toml

setup::
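Note: `update-toml` rewrites only the `version` line of `pyproject.toml` (staging the result in a temporary `tt.toml`), so after this rename the published wheel version is driven entirely by `DPK_LIB_VERSION` in `.make.versions`.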
8 changes: 4 additions & 4 deletions data-processing-lib/doc/testing-transforms.md
@@ -61,14 +61,14 @@ to use different models and perhaps as a result have different results.

Once the test class is defined you may run the test from your IDE or from the command line...
```shell
-% cd .../data-prep-lab/transforms/universal/noop/src
+% cd .../data-prep-kit/transforms/universal/noop/src
% make venv
% source venv/bin/activate
-(venv)% export PYTHONPATH=.../data-prep-lab/transforms/universal/noop/src
+(venv)% export PYTHONPATH=.../data-prep-kit/transforms/universal/noop/src
(venv)% pytest test/test_noop.py
================================================================================ test session starts ================================================================================
platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0
-rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop
+rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop
plugins: cov-4.1.0
collected 2 items

@@ -85,7 +85,7 @@ source venv/bin/activate; \
cd test; pytest .
========================================================================================== test session starts ==========================================================================================
platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0
-rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop/test
+rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop/test
collected 3 items

test_noop.py .. [ 66%]
6 changes: 3 additions & 3 deletions data-processing-lib/pyproject.toml
@@ -1,8 +1,8 @@
[project]
-name = "data_prep_lab"
-version = "0.1.6"
+name = "data_prep_toolkit"
+version = "0.0.1"
requires-python = ">=3.10"
-description = "Data Preparation Laboratory Library"
+description = "Data Preparation Toolkit Library"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
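The distribution name changes, but the import package appears to remain `data_processing`. A minimal sketch of verifying an install, assuming the renamed wheel is published to PyPI and that `get_logger` accepts a logger name:

```python
# A minimal import check, assuming: pip install data-prep-toolkit==0.0.1
# The distribution is renamed, but the import package is still data_processing.
from data_processing.utils import get_logger

logger = get_logger(__name__)  # signature assumed from the package exports
logger.info("data-prep-toolkit is importable")
```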
2 changes: 1 addition & 1 deletion data-processing-lib/src/data_processing/utils/__init__.py
@@ -1,5 +1,5 @@
from data_processing.utils.cli_utils import GB, KB, MB, CLIArgumentProvider, str2bool
from data_processing.utils.params_utils import ParamsUtils
-from data_processing.utils.config import DPLConfig, add_if_missing
+from data_processing.utils.config import DPKConfig, add_if_missing
from data_processing.utils.log import get_logger
from data_processing.utils.transform_utils import TransformUtils, RANDOM_SEED, LOCAL_TO_DISK
10 changes: 5 additions & 5 deletions data-processing-lib/src/data_processing/utils/config.py
@@ -14,7 +14,7 @@
from typing import Any, Union


-class DPLConfig:
+class DPKConfig:
@staticmethod
def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]:
for var in env_var_list:
@@ -25,10 +25,10 @@ def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]:
# print(f"Did not find any of the following env vars {env_var_list}")
return None

-HUGGING_FACE_TOKEN = _get_first_env_var(["DPL_HUGGING_FACE_TOKEN"])
-""" Set from DPL_HUGGING_FACE_TOKEN env var(s) """
-DEFAULT_LOG_LEVEL = os.environ.get("DPL_LOG_LEVEL", "INFO")
-""" Set from DPL_LOG_LEVEL env var(s) """
+HUGGING_FACE_TOKEN = _get_first_env_var(["DPK_HUGGING_FACE_TOKEN"])
+""" Set from DPK_HUGGING_FACE_TOKEN env var(s) """
+DEFAULT_LOG_LEVEL = os.environ.get("DPK_LOG_LEVEL", "INFO")
+""" Set from DPK_LOG_LEVEL env var(s) """


def add_if_missing(config: dict[str, Any], key: str, dflt: Any):
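Because `HUGGING_FACE_TOKEN` and `DEFAULT_LOG_LEVEL` are class attributes evaluated at import time, the renamed `DPK_*` variables must be set before `data_processing.utils` is imported. A minimal sketch (the token value is illustrative):

```python
import os

# Set the renamed env vars before importing; the class attributes are
# evaluated once, when the module is first imported.
os.environ["DPK_LOG_LEVEL"] = "DEBUG"
os.environ["DPK_HUGGING_FACE_TOKEN"] = "hf_example"  # illustrative value

from data_processing.utils import DPKConfig

print(DPKConfig.DEFAULT_LOG_LEVEL)   # -> "DEBUG"
print(DPKConfig.HUGGING_FACE_TOKEN)  # -> "hf_example", or None if unset
```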
8 changes: 4 additions & 4 deletions data-processing-lib/src/data_processing/utils/log.py
@@ -13,16 +13,16 @@
import logging
import os

-from data_processing.utils import DPLConfig
+from data_processing.utils import DPKConfig


def get_log_level(name: str = None):
if name is None:
-level_name = DPLConfig.DEFAULT_LOG_LEVEL
+level_name = DPKConfig.DEFAULT_LOG_LEVEL
else:
name = name.upper()
-name = "DPL_" + name + "_LOG_LEVEL"
-level_name = os.environ.get(name, DPLConfig.DEFAULT_LOG_LEVEL)
+name = "DPK_" + name + "_LOG_LEVEL"
+level_name = os.environ.get(name, DPKConfig.DEFAULT_LOG_LEVEL)
return level_name


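After the rename, `get_log_level(None)` falls back to `DPK_LOG_LEVEL`, while a named lookup reads `DPK_<NAME>_LOG_LEVEL` first. A minimal sketch, assuming `get_log_level` is importable from the module shown; "noop" is an illustrative component name:

```python
import os

# Per-component override must be set before the lookup is made.
os.environ["DPK_NOOP_LOG_LEVEL"] = "WARNING"

from data_processing.utils.log import get_log_level

print(get_log_level())        # falls back to DPK_LOG_LEVEL (default "INFO")
print(get_log_level("noop"))  # -> "WARNING", from DPK_NOOP_LOG_LEVEL
```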
10 changes: 5 additions & 5 deletions doc/repo.md
@@ -48,22 +48,22 @@ Target Description
build Create the venv and build the transform image
clean Clean up the virtual environment.
conventions Check transform project conventions and make recommendations, if needed.
-image Create the docker image quay.io/dataprep1/data-prep-lab/noop:0.7
-publish Publish the quay.io/dataprep1/data-prep-lab/noop:0.7 to quay.io container registry
+image Create the docker image quay.io/dataprep1/data-prep-kit/noop:0.7
+publish Publish the quay.io/dataprep1/data-prep-kit/noop:0.7 to quay.io container registry
setup Do nothing, since nothing to setup by default.
test Run both source and image level tests.
-test-image Test an quay.io/dataprep1/data-prep-lab/noop:0.7 use test source inside the image.
+test-image Test an quay.io/dataprep1/data-prep-kit/noop:0.7 use test source inside the image.
test-locals Run the *local*.py files in the src directory
test-src Run the transform's tests and any '*local' .py files
venv Install the source from the data processing library for python

Overridable macro values include the following:
DOCKER - the name of the docker executable to use. DOCKER=docker
DOCKER_FILE - the name of the docker file to use. DOCKER_FILE=Dockerfile
-DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-lab
+DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-kit
DOCKER_HOSTNAME - the name of the docker registry to use. DOCKER_HOSTNAME=quay.io
DOCKER_NAMESPACE - the name space to use in the registry. DOCKER_NAMESPACE=dataprep1
-DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-lab
+DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-kit
DOCKER_REGISTRY_USER - the docker user to use. DOCKER_REGISTRY_USER=dataprep1
DOCKER_REGISTRY_KEY - the docker user to use. DOCKER_REGISTRY_KEY=secret
PYTHON - the python executable to use. PYTHON=python
2 changes: 1 addition & 1 deletion kfp/doc/simple_transform_pipeline.md
@@ -42,7 +42,7 @@ Ray cluster. For each step we have to define a component that will execute them:

```python
# components
-base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.2"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"
# compute execution parameters. Here different tranforms might need different implementations. As
# a result, instead of creating a component we are creating it in place here.
compute_exec_params_op = comp.func_to_container_op(
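For context, `comp.func_to_container_op` (KFP v1) wraps a plain Python function as a component that runs inside the renamed base image. A minimal sketch, with a hypothetical parameter-computation function standing in for the pipeline's own:

```python
import kfp.components as comp

base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"

def compute_exec_params(worker_cpu: float, num_workers: int) -> str:
    # Hypothetical helper; imports live inside the function body because
    # func_to_container_op serializes the function to run in the container.
    import json
    return json.dumps({"cpu": worker_cpu, "num_workers": num_workers})

# Build the component in place, pinned to the data-prep-kit base image.
compute_exec_params_op = comp.func_to_container_op(
    func=compute_exec_params, base_image=base_kfp_image
)
```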
4 changes: 2 additions & 2 deletions kfp/kfp_ray_components/Dockerfile
@@ -5,8 +5,8 @@ ARG GIT_COMMIT

LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
-LABEL data-prep-lab=${DPL_LIB_VERSION}
-LABEL data-prep-lab-kfp=${DPL_LIB_KFP_VERSION}
+LABEL data-prep-toolkit=${DPK_LIB_VERSION}
+LABEL data-prep-toolkit-kfp=${DPK_LIB_KFP_VERSION}

# install libraries
COPY requirements.txt requirements.txt
13 changes: 8 additions & 5 deletions kfp/kfp_ray_components/Makefile
@@ -10,39 +10,42 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed '

include makeenv
DOCKER_FILE=Dockerfile
-DOCKER_NAME=data-prep-lab/kfp-data-processing
+DOCKER_NAME=kfp-data-processing
IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION}

# Create the docker image making sure the preloaded models are available to copy into the image
.kfp_comp.image:: Dockerfile requirements.txt
-$(call check_defined, DOCKER_HOSTNAME)
-sed -i.back "s/data-prep-lab-kfp==[0-9].*/data-prep-lab-kfp==${DPL_LIB_KFP_VERSION}/" requirements.txt
+@# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt
+$(call check_defined, DOCKER_HOSTNAME)
+sed -i.back "s/data-prep-toolkit-kfp==[0-9].*/data-prep-toolkit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt
$(DOCKER) build -t ${IMG} \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . --no-cache

image:: .kfp_comp.image
-make reconcile-requirements
+$(MAKE) reconcile-requirements

reconcile-requirements::
@# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION)
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" createRayComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" cleanupRayComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml

load-image:
-@# Help: Load the image to the kind cluster created with make setup.
+@# Help: Load the image to the kind cluster created with make setup.
kind load docker-image $(IMG) --name=$(KIND_CLUSTER_NAME)

build:: image

publish::
+@# Help: Push $(IMG) to the registry
$(DOCKER) push ${IMG}

test::

clean::
+@# Help: Remove $(IMG)
$(DOCKER) image rm ${IMG} || true
-rm makeenv
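Note: switching from bare `make` to `$(MAKE)` lets the sub-make invocation inherit the parent's flags and jobserver, and the `-rm makeenv` cleanup line uses the same error-ignoring `-` prefix as the publish target in `.make.defaults`.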
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/cleanupRayComponent.yaml
@@ -8,7 +8,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/createRayComponent.yaml
@@ -11,7 +11,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeRayJobComponent.yaml
@@ -12,7 +12,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.