Change name to data prep kit #91

Merged: 10 commits, May 9, 2024
12 changes: 6 additions & 6 deletions .make.defaults
@@ -36,9 +36,9 @@ DOCKER_FILE?=Dockerfile
#DOCKER_NAME?=xyzzy # Must be defined by the includeing makefile
DOCKER?=docker
DOCKER_HOSTNAME?=quay.io
-DOCKER_NAMESPACE ?= dataprep1
-DOCKER_REGISTRY_USER?=$(DPL_DOCKER_REGISTRY_USER)
-DOCKER_REGISTRY_KEY?=$(DPL_DOCKER_REGISTRY_KEY)
+DOCKER_NAMESPACE ?= dataprep1/data-prep-kit
+DOCKER_REGISTRY_USER?=$(DPK_DOCKER_REGISTRY_USER)
+DOCKER_REGISTRY_KEY?=$(DPK_DOCKER_REGISTRY_KEY)
DOCKER_REGISTRY_ENDPOINT?=$(DOCKER_HOSTNAME)/$(DOCKER_NAMESPACE)
DOCKER_IMAGE?=${DOCKER_REGISTRY_ENDPOINT}/$(DOCKER_NAME):$(DOCKER_IMAGE_VERSION)
include $(REPOROOT)/.make.versions
@@ -186,7 +186,7 @@ __check_defined = \
@echo Installing source from data processing library for venv
source venv/bin/activate; \
pip install pytest; \
-pip uninstall -y data-prep-lab; \
+pip uninstall -y data-prep-toolkit; \
if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \
extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \
fi; \
@@ -231,7 +231,7 @@ __check_defined = \
.PHONY: .defaults.publish
.defaults.publish::
@# Help: Publish the $(DOCKER_IMAGE) to $(DOCKER_HOSTNAME) container registry
-$(DOCKER) logout $(DOCKER_HOSTNAME)
+-$(DOCKER) logout $(DOCKER_HOSTNAME)
$(DOCKER) login $(DOCKER_HOSTNAME) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
$(DOCKER) push $(DOCKER_IMAGE)
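Note: the leading `-` added to the `docker logout` recipe line tells make to ignore that command's exit status, so publishing no longer aborts when there is no active login session to log out of.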

@@ -265,7 +265,7 @@ __check_defined = \
$(MAKE) CHECK_RUNNABLE=minio .defaults.check.installed
$(MAKE) CHECK_RUNNABLE=mc .defaults.check.installed

-MINIO_DIR=/tmp/data-prep-lab
+MINIO_DIR=/tmp/data-prep-kit
MINIO_ALIAS=local
# These are the credentials used by samples.
MINIO_ADMIN_USER=localminioaccesskey
34 changes: 17 additions & 17 deletions .make.versions
@@ -4,24 +4,24 @@
################################################################################

# Data prep lab wheel version
-DPL_LIB_VERSION=0.1.6
-DPL_LIB_KFP_VERSION=0.1.8
+DPK_LIB_VERSION=0.0.1
+DPK_LIB_KFP_VERSION=0.0.1

# Begin transform versions/tags
-BLOCKLIST_VERSION=0.2.0
-DOC_ID_VERSION=0.2
-EDEDUP_VERSION=0.2.1
-FDEDUP_VERSION=0.2.1
-FILTER_VERSION=0.2.0
-NOOP_VERSION=0.7
-RESIZE_VERSION=0.2
-LANG_ID_VERSION=0.2
-TOKENIZER_VERSION=0.2.0
-MALWARE_VERSION=0.3
-PROGLANG_SELECT_VERSION=0.2.0
-CODE_QUALITY_VERSION=0.2.0
-DOC_QUALITY_VERSION=0.2.0
-INGEST_TO_PARQUET_VERSION=0.1.0
+BLOCKLIST_VERSION=0.3.0
+DOC_ID_VERSION=0.3.0
+EDEDUP_VERSION=0.3.0
+FDEDUP_VERSION=0.3.0
+FILTER_VERSION=0.3.0
+NOOP_VERSION=0.8.0
+RESIZE_VERSION=0.3.0
+LANG_ID_VERSION=0.3.0
+TOKENIZER_VERSION=0.3.0
+MALWARE_VERSION=0.4.0
+PROGLANG_SELECT_VERSION=0.3.0
+CODE_QUALITY_VERSION=0.3.0
+DOC_QUALITY_VERSION=0.3.0
+INGEST_TO_PARQUET_VERSION=0.3.0

Collaborator comment: Why is this 0.3.0? Should we align with the library version and make it 0.1.0?

-KFP_DOCKER_VERSION=0.0.8
+KFP_DOCKER_VERSION=0.1.0

4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -72,8 +72,8 @@ git commit -s
Please install Python 3.10 or 3.11, then

```
-git clone git@github.ibm.com:IBM/data-prep-lab.git
-cd data-prep-lab
+git clone git@github.ibm.com:IBM/data-prep-kit.git
+cd data-prep-kit
pip install pre-commit
pip install twine
pre-commit install
2 changes: 1 addition & 1 deletion Makefile
@@ -44,7 +44,7 @@ test::
@$(MAKE) RULE=$@ .recurse

lib-release:
-@# Help: Publish data-prep-lab $(DPL_LIB_VERSION) and data-prep-lab-kfp $(DPL_LIB_KFP_VERSION) libraries to pypi
+@# Help: Publish data-prep-kit $(DPK_LIB_VERSION) and data-prep-kit-kfp $(DPK_LIB_KFP_VERSION) libraries to pypi
@$(MAKE) -C data-processing-lib build publish
@$(MAKE) -C kfp/kfp_support_lib build publish
@echo ""
20 changes: 10 additions & 10 deletions README.md
@@ -1,25 +1,25 @@


-<h1 align="center">Data Prep Lab </h1>
+<h1 align="center">Data Prep Kit</h1>

<div align="center">

<? [![Status](https://img.shields.io/badge/status-active-success.svg)]() ?>
-<? [![GitHub Issues](https://img.shields.io/github/issues/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-lab/issues) ?>
-<? [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-lab/pulls) ?>
+<? [![GitHub Issues](https://img.shields.io/github/issues/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-kit/issues) ?>
+<? [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/kylelobo/The-Documentation-Compendium.svg)](https://github.com/IBM/data-prep-kit/pulls) ?>
</div>

---

-Data Prep Lab is a community project to democratize and accelerate unstructured data preparation for LLM app developers.
+Data Prep Kit is a community project to democratize and accelerate unstructured data preparation for LLM app developers.
With the explosive growth of LLM-enabled use cases, developers are faced with the enormous challenge of preparing use case-specific unstructured data to fine-tune or instruct-tune the LLMs.
As the variety of use cases grows, so does the need to support:

- New modalities of data (code, language, speech, visual)
- New ways of transforming the data to optimize the performance of the resulting LLMs for each specific use case.
- Large variety in the scale of data to be processed, from laptop-scale to datacenter-scale

-Data Prep Lab offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities.
+Data Prep Kit offers implementations of commonly needed data transformations, called *modules*, for both Code and Language modalities.
The goal is to offer high-level APIs for developers to quickly get started in working with their data, without needing expertise in the underlying runtimes and frameworks.

## 📝 Table of Contents
@@ -30,12 +30,12 @@ The goal is to offer high-level APIs for developers to quickly get started in working
- [Acknowledgments](#acknowledgement)

## &#x1F4D6; About <a name = "about"></a>
-Data Prep Lab is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning.
-Data Prep Lab contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case.
+Data Prep Kit is a toolkit for streamlining data preparation for developers looking to build LLM-enabled applications via fine-tuning or instruction-tuning.
+Data Prep Kit contributes a set of modules that the developer can get started with to easily build data pipelines suitable for their use case.
These modules have been tested in producing pre-training datasets for the [Granite](https://huggingface.co/instructlab/granite-7b-lab) open models.

The modules are built on common frameworks (for Spark and Ray), called the *data processing library* that allows the developers to build new custom modules that readily scale across a variety of runtimes.
-Eventually, Data Prep Lab will offer consistent APIs and configurations across the following underlying runtimes.
+Eventually, Data Prep Kit will offer consistent APIs and configurations across the following underlying runtimes.

1. Python runtime
2. Ray runtime (local and distributed)
@@ -129,8 +129,8 @@ Docker/Podman
### Installation Steps

```shell
-git clone git@github.com:IBM/data-prep-lab.git
-cd data-prep-lab
+git clone git@github.com:IBM/data-prep-kit.git
+cd data-prep-kit
pip install pre-commit
pip install twine
pre-commit install
4 changes: 2 additions & 2 deletions data-processing-lib/Makefile
@@ -3,7 +3,7 @@ REPOROOT=../
include ../.make.defaults
include ../.make.versions

-TAG := "v${DPL_LIB_VERSION}"
+TAG := "v${DPK_LIB_VERSION}"


clean::
@@ -16,7 +16,7 @@ clean::

update-toml:: .check-env
@# Help: Copy the Makefile distribution version into the pyproject.toml
-sed -e 's/^version[ ]*=.*/version = "'${DPL_LIB_VERSION}'"/' pyproject.toml > tt.toml
+sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
mv tt.toml pyproject.toml

setup::
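Note: `update-toml` rewrites only the `version` line of `pyproject.toml` (staging the result in a temporary `tt.toml`), so after this rename the published wheel version is driven entirely by `DPK_LIB_VERSION` in `.make.versions`.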
8 changes: 4 additions & 4 deletions data-processing-lib/doc/testing-transforms.md
@@ -61,14 +61,14 @@ to use different models and perhaps as a result have different results.

Once the test class is defined you may run the test from your IDE or from the command line...
```shell
-% cd .../data-prep-lab/transforms/universal/noop/src
+% cd .../data-prep-kit/transforms/universal/noop/src
% make venv
% source venv/bin/activate
-(venv)% export PYTHONPATH=.../data-prep-lab/transforms/universal/noop/src
+(venv)% export PYTHONPATH=.../data-prep-kit/transforms/universal/noop/src
(venv)% pytest test/test_noop.py
================================================================================ test session starts ================================================================================
platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0
-rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop
+rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop
plugins: cov-4.1.0
collected 2 items

@@ -85,7 +85,7 @@ source venv/bin/activate; \
cd test; pytest .
========================================================================================== test session starts ==========================================================================================
platform darwin -- Python 3.10.11, pytest-8.0.2, pluggy-1.4.0
-rootdir: /Users/dawood/git/data-prep-lab/transforms/universal/noop/test
+rootdir: /Users/dawood/git/data-prep-kit/transforms/universal/noop/test
collected 3 items

test_noop.py .. [ 66%]
6 changes: 3 additions & 3 deletions data-processing-lib/pyproject.toml
@@ -1,8 +1,8 @@
[project]
-name = "data_prep_lab"
-version = "0.1.6"
+name = "data_prep_toolkit"
+version = "0.0.1"
requires-python = ">=3.10"
-description = "Data Preparation Laboratory Library"
+description = "Data Preparation Toolkit Library"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
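The distribution name changes, but the import package appears to remain `data_processing`. A minimal sketch of verifying an install, assuming the renamed wheel is published to PyPI and that `get_logger` accepts a logger name:

```python
# A minimal import check, assuming: pip install data-prep-toolkit==0.0.1
# The distribution is renamed, but the import package is still data_processing.
from data_processing.utils import get_logger

logger = get_logger(__name__)  # signature assumed from the package exports
logger.info("data-prep-toolkit is importable")
```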
2 changes: 1 addition & 1 deletion data-processing-lib/src/data_processing/utils/__init__.py
@@ -1,5 +1,5 @@
from data_processing.utils.cli_utils import GB, KB, MB, CLIArgumentProvider, str2bool
from data_processing.utils.params_utils import ParamsUtils
-from data_processing.utils.config import DPLConfig, add_if_missing
+from data_processing.utils.config import DPKConfig, add_if_missing
from data_processing.utils.log import get_logger
from data_processing.utils.transform_utils import TransformUtils, RANDOM_SEED, LOCAL_TO_DISK
10 changes: 5 additions & 5 deletions data-processing-lib/src/data_processing/utils/config.py
@@ -14,7 +14,7 @@
from typing import Any, Union


-class DPLConfig:
+class DPKConfig:
@staticmethod
def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]:
for var in env_var_list:
@@ -25,10 +25,10 @@ def _get_first_env_var(env_var_list: list[str]) -> Union[str, None]:
# print(f"Did not find any of the following env vars {env_var_list}")
return None

-HUGGING_FACE_TOKEN = _get_first_env_var(["DPL_HUGGING_FACE_TOKEN"])
-""" Set from DPL_HUGGING_FACE_TOKEN env var(s) """
-DEFAULT_LOG_LEVEL = os.environ.get("DPL_LOG_LEVEL", "INFO")
-""" Set from DPL_LOG_LEVEL env var(s) """
+HUGGING_FACE_TOKEN = _get_first_env_var(["DPK_HUGGING_FACE_TOKEN"])
+""" Set from DPK_HUGGING_FACE_TOKEN env var(s) """
+DEFAULT_LOG_LEVEL = os.environ.get("DPK_LOG_LEVEL", "INFO")
+""" Set from DPK_LOG_LEVEL env var(s) """


def add_if_missing(config: dict[str, Any], key: str, dflt: Any):
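Because `HUGGING_FACE_TOKEN` and `DEFAULT_LOG_LEVEL` are class attributes evaluated at import time, the renamed `DPK_*` variables must be set before `data_processing.utils` is imported. A minimal sketch (the token value is illustrative):

```python
import os

# Set the renamed env vars before importing; the class attributes are
# evaluated once, when the module is first imported.
os.environ["DPK_LOG_LEVEL"] = "DEBUG"
os.environ["DPK_HUGGING_FACE_TOKEN"] = "hf_example"  # illustrative value

from data_processing.utils import DPKConfig

print(DPKConfig.DEFAULT_LOG_LEVEL)   # -> "DEBUG"
print(DPKConfig.HUGGING_FACE_TOKEN)  # -> "hf_example", or None if unset
```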
8 changes: 4 additions & 4 deletions data-processing-lib/src/data_processing/utils/log.py
@@ -13,16 +13,16 @@
import logging
import os

-from data_processing.utils import DPLConfig
+from data_processing.utils import DPKConfig


def get_log_level(name: str = None):
if name is None:
-level_name = DPLConfig.DEFAULT_LOG_LEVEL
+level_name = DPKConfig.DEFAULT_LOG_LEVEL
else:
name = name.upper()
-name = "DPL_" + name + "_LOG_LEVEL"
-level_name = os.environ.get(name, DPLConfig.DEFAULT_LOG_LEVEL)
+name = "DPK_" + name + "_LOG_LEVEL"
+level_name = os.environ.get(name, DPKConfig.DEFAULT_LOG_LEVEL)
return level_name


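After the rename, `get_log_level(None)` falls back to `DPK_LOG_LEVEL`, while a named lookup reads `DPK_<NAME>_LOG_LEVEL` first. A minimal sketch, assuming `get_log_level` is importable from the module shown; "noop" is an illustrative component name:

```python
import os

# Per-component override must be set before the lookup is made.
os.environ["DPK_NOOP_LOG_LEVEL"] = "WARNING"

from data_processing.utils.log import get_log_level

print(get_log_level())        # falls back to DPK_LOG_LEVEL (default "INFO")
print(get_log_level("noop"))  # -> "WARNING", from DPK_NOOP_LOG_LEVEL
```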
10 changes: 5 additions & 5 deletions doc/repo.md
@@ -48,22 +48,22 @@ Target Description
build Create the venv and build the transform image
clean Clean up the virtual environment.
conventions Check transform project conventions and make recommendations, if needed.
-image Create the docker image quay.io/dataprep1/data-prep-lab/noop:0.7
-publish Publish the quay.io/dataprep1/data-prep-lab/noop:0.7 to quay.io container registry
+image Create the docker image quay.io/dataprep1/data-prep-kit/noop:0.7
+publish Publish the quay.io/dataprep1/data-prep-kit/noop:0.7 to quay.io container registry
setup Do nothing, since nothing to setup by default.
test Run both source and image level tests.
-test-image Test an quay.io/dataprep1/data-prep-lab/noop:0.7 use test source inside the image.
+test-image Test an quay.io/dataprep1/data-prep-kit/noop:0.7 use test source inside the image.
test-locals Run the *local*.py files in the src directory
test-src Run the transform's tests and any '*local' .py files
venv Install the source from the data processing library for python

Overridable macro values include the following:
DOCKER - the name of the docker executable to use. DOCKER=docker
DOCKER_FILE - the name of the docker file to use. DOCKER_FILE=Dockerfile
-DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-lab
+DOCKER_REGISTRY_ENDPOINT - the docker registry location to publish images. DOCKER_REGISTRY_ENDPOINT=quay.io/dataprep1/data-prep-kit
DOCKER_HOSTNAME - the name of the docker registry to use. DOCKER_HOSTNAME=quay.io
DOCKER_NAMESPACE - the name space to use in the registry. DOCKER_NAMESPACE=dataprep1
-DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-lab
+DOCKER_NAME - the name under the name space where images are publishes. DOCKER_NAME=data-prep-kit
DOCKER_REGISTRY_USER - the docker user to use. DOCKER_REGISTRY_USER=dataprep1
DOCKER_REGISTRY_KEY - the docker user to use. DOCKER_REGISTRY_KEY=secret
PYTHON - the python executable to use. PYTHON=python
2 changes: 1 addition & 1 deletion kfp/doc/simple_transform_pipeline.md
@@ -42,7 +42,7 @@ Ray cluster. For each step we have to define a component that will execute them:

```python
# components
-base_kfp_image = "quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.2"
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"
# compute execution parameters. Here different tranforms might need different implementations. As
# a result, instead of creating a component we are creating it in place here.
compute_exec_params_op = comp.func_to_container_op(
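For context, `comp.func_to_container_op` (KFP v1) wraps a plain Python function as a component that runs inside the renamed base image. A minimal sketch, with a hypothetical parameter-computation function standing in for the pipeline's own:

```python
import kfp.components as comp

base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"

def compute_exec_params(worker_cpu: float, num_workers: int) -> str:
    # Hypothetical helper; imports live inside the function body because
    # func_to_container_op serializes the function to run in the container.
    import json
    return json.dumps({"cpu": worker_cpu, "num_workers": num_workers})

# Build the component in place, pinned to the data-prep-kit base image.
compute_exec_params_op = comp.func_to_container_op(
    func=compute_exec_params, base_image=base_kfp_image
)
```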
4 changes: 2 additions & 2 deletions kfp/kfp_ray_components/Dockerfile
@@ -5,8 +5,8 @@ ARG GIT_COMMIT

LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
-LABEL data-prep-lab=${DPL_LIB_VERSION}
-LABEL data-prep-lab-kfp=${DPL_LIB_KFP_VERSION}
+LABEL data-prep-toolkit=${DPK_LIB_VERSION}
+LABEL data-prep-toolkit-kfp=${DPK_LIB_KFP_VERSION}

# install libraries
COPY requirements.txt requirements.txt
13 changes: 8 additions & 5 deletions kfp/kfp_ray_components/Makefile
@@ -10,39 +10,42 @@ IGNORE := $(shell bash -c "sed -n /=/p ${REPOROOT}/kfp/requirements.env | sed '

include makeenv
DOCKER_FILE=Dockerfile
-DOCKER_NAME=data-prep-lab/kfp-data-processing
+DOCKER_NAME=kfp-data-processing
IMG=${DOCKER_HOSTNAME}/${DOCKER_NAMESPACE}/${DOCKER_NAME}:${KFP_DOCKER_VERSION}

# Create the docker image making sure the preloaded models are available to copy into the image
.kfp_comp.image:: Dockerfile requirements.txt
-$(call check_defined, DOCKER_HOSTNAME)
-sed -i.back "s/data-prep-lab-kfp==[0-9].*/data-prep-lab-kfp==${DPL_LIB_KFP_VERSION}/" requirements.txt
+@# Help: Build the docker image using the $(DOCKER_FILE) and requirements.txt
+$(call check_defined, DOCKER_HOSTNAME)
+sed -i.back "s/data-prep-toolkit-kfp==[0-9].*/data-prep-toolkit-kfp==${DPK_LIB_KFP_VERSION}/" requirements.txt
$(DOCKER) build -t ${IMG} \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . --no-cache

image:: .kfp_comp.image
-make reconcile-requirements
+$(MAKE) reconcile-requirements

reconcile-requirements::
@# Help: Update yaml files to build images tagged as version $(KFP_DOCKER_VERSION)
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" createRayComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" cleanupRayComponent.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeRayJobComponent_multi_s3.yaml
sed -i.back "s/kfp-data-processing:[0-9].*/kfp-data-processing:${KFP_DOCKER_VERSION}/" executeSubWorkflowComponent.yaml

load-image:
-@# Help: Load the image to the kind cluster created with make setup.
+@# Help: Load the image to the kind cluster created with make setup.
kind load docker-image $(IMG) --name=$(KIND_CLUSTER_NAME)

build:: image

publish::
+@# Help: Push $(IMG) to the registry
$(DOCKER) push ${IMG}

test::

clean::
+@# Help: Remove $(IMG)
$(DOCKER) image rm ${IMG} || true
-rm makeenv
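Note: switching from bare `make` to `$(MAKE)` lets the sub-make invocation inherit the parent's flags and jobserver, and the `-rm makeenv` cleanup line uses the same error-ignoring `-` prefix as the publish target in `.make.defaults`.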
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/cleanupRayComponent.yaml
@@ -8,7 +8,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/createRayComponent.yaml
@@ -11,7 +11,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.
2 changes: 1 addition & 1 deletion kfp/kfp_ray_components/executeRayJobComponent.yaml
@@ -12,7 +12,7 @@ inputs:

implementation:
container:
-image: quay.io/dataprep1/data-prep-lab/kfp-data-processing:0.0.8
+image: quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.1.0
# command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets.