Merge pull request #288 from IBM/runtime-reorg
Split code quality, malware and proglang select transforms into python and ray.
daw3rd committed Jun 21, 2024
2 parents 399d2c1 + 672e4a8 commit dda6e7c
Showing 77 changed files with 2,282 additions and 881 deletions.
6 changes: 3 additions & 3 deletions .make.defaults
@@ -310,10 +310,10 @@ __check_defined = \
$(MAKE) PIP_TARGET=data-prep-toolkit-ray .defaults.pip-uninstall; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_RAY_LIB_DIR) .defaults.install-src-venv; \
if [ -d ../python ]; then \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
echo Installed source from Ray data processing library for `which $(PYTHON)`; \
if [ -d ../python ]; then \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
fi
echo Installed source from Ray data processing library for `which $(PYTHON)`

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.spark-lib-src-venv
2 changes: 2 additions & 0 deletions .make.versions
@@ -43,10 +43,12 @@ LANG_ID_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
TOKENIZATION_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
TOKENIZATION_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

MALWARE_PYTHON_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX)
MALWARE_RAY_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX)

PROGLANG_SELECT_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

CODE_QUALITY_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
CODE_QUALITY_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

INGEST_TO_PARQUET_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
13 changes: 13 additions & 0 deletions transforms/code/code_quality/README.md
@@ -0,0 +1,13 @@
# Code Quality Transform
The Code Quality transform captures code-specific metrics of input data.
Per the set of
[transform project conventions](../../README.md#transform-project-conventions)
the following runtimes are available:

* [python](python/README.md) - provides the base python-based transformation
implementation.
* [ray](ray/README.md) - enables running the base python transformation
in a Ray runtime.
* [kfp](kfp_ray/README.md) - enables running the Ray docker image
in a Kubernetes cluster using a generated `yaml` file.
1 change: 1 addition & 0 deletions transforms/code/code_quality/python/.dockerignore
@@ -0,0 +1 @@
venv/
37 changes: 37 additions & 0 deletions transforms/code/code_quality/python/.gitignore
@@ -0,0 +1,37 @@
test-data/output
output/*
/output/
data-processing-lib/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class


# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
htmlcov
.coverage
.cache
nosetests.xml
coverage.xml
42 changes: 42 additions & 0 deletions transforms/code/code_quality/python/Dockerfile
@@ -0,0 +1,42 @@
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

# copy source data
COPY ./src/code_quality_transform_python.py .
COPY ./src/code_quality_local.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
51 changes: 51 additions & 0 deletions transforms/code/code_quality/python/Makefile
@@ -0,0 +1,51 @@

# Define the root of the local git clone for the common rules to be able to
# know where they are running from.
REPOROOT=../../../..
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# $(REPOROOT)/.make.versions file contains the versions
DOCKER_IMAGE_VERSION=${CODE_QUALITY_PYTHON_VERSION}

# Use default rule inherited from makefile.common
clean:: .transforms.clean

# Use default rule inherited from makefile.common
test:: .transforms.python-test

# Use default rule inherited from makefile.common
image:: .transforms.python-image

# Use default rule inherited from makefile.common
venv:: .transforms.python-venv

test-src:: .transforms.test-src

test-image:: .transforms.python-test-image

build:: build-dist image

publish:: publish-dist publish-image

publish-image:: .transforms.publish-image-python

setup:: .transforms.setup

# The distribution version is the same as the image version.
set-versions:
$(MAKE) TOML_VERSION=$(DOCKER_IMAGE_VERSION) .defaults.update-toml

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

setup:: .transforms.setup

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

load-image:: .transforms.load-image
67 changes: 67 additions & 0 deletions transforms/code/code_quality/python/README.md
@@ -0,0 +1,67 @@
# Code Quality

Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This module captures code-specific metrics of input data. The implementation is borrowed from the work done in the [CodeParrot](https://huggingface.co/blog/codeparrot) and [StarCoder](https://arxiv.org/abs/2305.06161) projects. In the current implementation, the module computes the following metrics and reports each metric in an individual column (a brief sketch of these heuristics follows the list):

* line-specific metrics, such as mean and max line length
* character-to-token ratio - uses the input tokenizer to tokenize the input data and measures the ratio between characters and tokens
* identifies a high occurrence of the keywords "test " or "config" and tags such samples as config or test samples
* tags a sample as autogenerated if it contains keywords like `auto-generated`, `autogenerated` or `automatically generated`
* programming language specific identification, where:
  * if the input sample is in the `python` programming language and has no reference to constructs like `def` or `class`, it is flagged as `has_no_keywords`
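
For illustration, a minimal sketch of a few of these heuristics applied to a single code sample is shown below; the helper names and the config/test keyword threshold are assumptions made for this example, not the transform's exact logic.

```python
# Illustrative sketch only -- not the transform's actual implementation.
# The helper names and the keyword-count threshold below are assumptions.

def line_metrics(contents: str) -> dict:
    """Compute simple line-based metrics for one code sample."""
    lines = contents.splitlines() or [""]
    lengths = [len(line) for line in lines]
    return {
        "line_mean": sum(lengths) / len(lengths),
        "line_max": max(lengths),
        "total_num_lines": len(lines),
    }


def is_autogenerated(contents: str) -> bool:
    """Tag a sample that contains auto-generation markers."""
    lowered = contents.lower()
    markers = ("auto-generated", "autogenerated", "automatically generated")
    return any(marker in lowered for marker in markers)


def is_config_or_test(contents: str, threshold: int = 5) -> bool:
    """Tag a sample with a high occurrence of 'test' or 'config' keywords."""
    lowered = contents.lower()
    return lowered.count("test") + lowered.count("config") >= threshold


sample = "def test_add():\n    assert 1 + 1 == 2\n"
print(line_metrics(sample), is_autogenerated(sample), is_config_or_test(sample))
```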

This module adds the following fields into the output file:
<ul>
<li>line_mean</li>
<li>line_max</li>
<li>total_num_lines</li>
<li>avg_longest_lines</li>
<li>alphanum_frac</li>
<li>char_token_ratio</li>
<li>autogenerated</li>
<li>config_or_test</li>
<li>has_no_keywords</li>
<li>has_few_assignments</li>
<li>is_xml</li>
<li>is_html</li>
</ul>

It uses a tokenizer to collect the token-ratio metrics. If the requested tokenizer is not found in the local cache, it is downloaded from [Hugging Face](https://huggingface.co/). By default, the [codeparrot/codeparrot](https://huggingface.co/codeparrot/codeparrot) tokenizer is used.
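
As a rough illustration, the character/token ratio can be computed with a Hugging Face tokenizer as sketched below; this is a sketch assuming the `transformers` dependency declared in `pyproject.toml`, not the transform's exact code.

```python
# Minimal sketch of the character/token ratio, assuming the default
# codeparrot/codeparrot tokenizer; the transform's real code may differ.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codeparrot/codeparrot")


def char_token_ratio(contents: str) -> float:
    """Return the ratio of characters to tokens for one code sample."""
    tokens = tokenizer.tokenize(contents)
    return len(contents) / max(len(tokens), 1)


print(char_token_ratio("def add(a, b):\n    return a + b\n"))
```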

## Running

### Launcher Command Line Options

The following command line arguments are available in addition to
the options provided by the [ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md)
and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
A brief usage sketch follows the list of options.

* "--contents_column_name" - input a column name which contains data to process. The default column name: `contents`
* "--language_column_name" - input a column name which contains programming language details. The default column name: `language`
* "--tokenizer" - input a tokenizer to convert the data into tokens. The default tokenizer is `codeparrot/codeparrot`
* "--hf_token" - input the Hugging Face auth token to download the tokenizer. This option is only required for the tokenizer's whose access is restricted in Hugging Face.

### Running the samples
To run the samples, use the following `make` targets:

* `run-cli-sample` - runs src/code_quality_transform_python.py using command line args
* `run-local-sample` - runs src/code_quality_local_python.py

These targets will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the details of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
to see the results of the transform.
46 changes: 46 additions & 0 deletions transforms/code/code_quality/python/pyproject.toml
@@ -0,0 +1,46 @@
[project]
name = "dpk_code_quality_transform_python"
version = "0.4.0.dev6"
requires-python = ">=3.10"
description = "Code Quality Python Transform"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.0.dev6",
"bs4==0.0.2",
"transformers==4.38.2",
]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]

[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
@@ -12,7 +12,7 @@

import os

from code_quality_transform_ray import CodeQualityTransform
from code_quality_transform import CodeQualityTransform
from data_processing.data_access import DataAccessLocal


@@ -14,7 +14,7 @@
import sys
from pathlib import Path

from code_quality_transform_ray import CodeQualityTransformConfiguration
from code_quality_transform import CodeQualityTransformConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
