From b8f955dbce85f9bd967647d4df567bbff83139ac Mon Sep 17 00:00:00 2001 From: Shivdeep Singh Date: Fri, 24 May 2024 09:55:59 +0530 Subject: [PATCH] Add gh-action for pre-commit Add gh-action for pre-commit after fixing pre-commit errors Signed-off-by: Shivdeep Singh --- .github/workflows/pre-commit.yml | 16 ++++++++++++++++ .../pure_python/transform_file_processor.py | 6 ++---- .../data_processing/transform/table_transform.py | 4 +++- .../transform/transform_configuration.py | 4 +++- .../src/execute_ray_job_multi_s3.py | 2 +- .../workflow_support/utils/components_utils.py | 6 +++++- .../universal/fdedup/ray/src/fdedup_transform.py | 2 +- .../tokenization/ray/src/tokenization_utils.py | 6 +++--- .../ray/test-data/ds02/expected/metadata.json | 4 ++-- 9 files changed, 36 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..5c8ed6dcb --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,16 @@ +name: Run Pre-commit + +on: + workflow_dispatch: + push: + pull_request: + +jobs: + run-pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run pre-commit + run: | + pip install pre-commit + pre-commit run --all-files diff --git a/data-processing-lib/ray/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/ray/src/data_processing/runtime/pure_python/transform_file_processor.py index 25fee3b2e..ac7ce56ee 100644 --- a/data-processing-lib/ray/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -13,11 +13,9 @@ from typing import Any from data_processing.data_access import DataAccessFactoryBase -from data_processing.runtime import ( - AbstractTransformFileProcessor, -) -from data_processing.transform import TransformStatistics +from data_processing.runtime import AbstractTransformFileProcessor from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration +from data_processing.transform import TransformStatistics class PythonTransformFileProcessor(AbstractTransformFileProcessor): diff --git a/data-processing-lib/ray/src/data_processing/transform/table_transform.py b/data-processing-lib/ray/src/data_processing/transform/table_transform.py index f1566b6eb..63243c1e1 100644 --- a/data-processing-lib/ray/src/data_processing/transform/table_transform.py +++ b/data-processing-lib/ray/src/data_processing/transform/table_transform.py @@ -59,7 +59,9 @@ def transform_binary(self, byte_array: bytes, ext: str) -> tuple[list[tuple[byte # Add number of rows to stats stats = stats | {"source_doc_count": table.num_rows} # convert tables to files - return self._check_and_convert_tables(out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows}) + return self._check_and_convert_tables( + out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows} + ) def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]: """ diff --git a/data-processing-lib/ray/src/data_processing/transform/transform_configuration.py b/data-processing-lib/ray/src/data_processing/transform/transform_configuration.py index fe70616d9..7f99eb54b 100644 --- a/data-processing-lib/ray/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/ray/src/data_processing/transform/transform_configuration.py @@ -22,7 +22,9 @@ class TransformConfiguration(CLIArgumentProvider): This is a base transform configuration class defining transform's input/output parameter """ - def __init__(self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = []): + def __init__( + self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + ): """ Initialization :param name: transformer name diff --git a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py index 3a8c07983..b20286602 100644 --- a/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py +++ b/kfp/kfp_ray_components/src/execute_ray_job_multi_s3.py @@ -43,7 +43,7 @@ access_key, secret_key, url = KFPUtils.credentials() # add s3 credentials to exec params exec_params["data_s3_cred"] = ( - "{'access_key': '" + access_key + "', 'secret_key': '" + secret_key + "', 'url': '" + url + "'}" + "{'access_key': '" + access_key + "', 'secret_key': '" + secret_key + "', 'url': '" + url + "'}" ) # extra credentials prefix = args.prefix diff --git a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py index de84c176d..a4021e48f 100644 --- a/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py +++ b/kfp/kfp_support_lib/src/kfp_support/workflow_support/utils/components_utils.py @@ -52,7 +52,11 @@ def add_settings_to_component( def set_s3_env_vars_to_component( component: dsl.ContainerOp, secret: str, - env2key: dict[str, str] = {"S3_KEY": "s3-key", "S3_SECRET": "s3-secret", "ENDPOINT": "s3-endpoint"}, + env2key: dict[str, str] = { + "S3_KEY": "s3-key", + "S3_SECRET": "s3-secret", # pragma: allowlist secret + "ENDPOINT": "s3-endpoint", + }, prefix: str = None, ) -> None: """ diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform.py b/transforms/universal/fdedup/ray/src/fdedup_transform.py index 1d9a6e537..09fc5d84f 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform.py @@ -22,8 +22,8 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.ray import ( DefaultRayTransformRuntime, - RayTransformLauncher, RayTransformFileProcessor, + RayTransformLauncher, RayUtils, ) from data_processing.runtime.ray.runtime_configuration import ( diff --git a/transforms/universal/tokenization/ray/src/tokenization_utils.py b/transforms/universal/tokenization/ray/src/tokenization_utils.py index 18f08ac98..64ecd2302 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_utils.py +++ b/transforms/universal/tokenization/ray/src/tokenization_utils.py @@ -59,14 +59,14 @@ def _split_text_with_word_space(text: str, chunk_size: int) -> str: if last_space_index != -1: # s[last_space_index] = ' ' # If found, return the chunk up to and include such space: - yield text[index: last_space_index + 1] + yield text[index : last_space_index + 1] index = last_space_index + 1 else: # If not, force cutting up to chunk_size: - yield text[index: index + chunk_size] + yield text[index : index + chunk_size] index += chunk_size else: - yield text[index: index + chunk_size] + yield text[index : index + chunk_size] index += chunk_size diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json index dc9813beb..96dd2fe6f 100644 --- a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json +++ b/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json @@ -48,11 +48,11 @@ "num_chars": 16836009 }, "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input", + "name": "fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input", "type": "path" }, "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds02", + "name": "fm-data-engineering/transforms/universal/tokenization/output/ds02", "type": "path" } }