Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

data factory fixes #9

Merged
merged 2 commits into from
Apr 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
################################################################################

# Data prep lab wheel version
DPL_LIB_VERSION=0.1.3
DPL_LIB_KFP_VERSION=0.1.4
DPL_LIB_VERSION=0.1.4
DPL_LIB_KFP_VERSION=0.1.5

# Begin transform versions/tags
BLOCKLIST_VERSION=0.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from datetime import datetime

import ray
from data_processing.data_access import DataAccessFactory
from data_processing.data_access import DataAccessFactoryBase
from data_processing.ray import (
DefaultTableTransformConfiguration,
RayUtils,
Expand All @@ -34,7 +34,7 @@
@ray.remote(num_cpus=1, scheduling_strategy="SPREAD")
def orchestrate(
preprocessing_params: TransformOrchestratorConfiguration,
data_access_factory: DataAccessFactory,
data_access_factory: DataAccessFactoryBase,
transform_runtime_config: DefaultTableTransformConfiguration,
) -> int:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import argparse
from typing import Any

from data_processing.data_access import DataAccessFactory
from data_processing.data_access import DataAccessFactoryBase
from data_processing.transform import AbstractTableTransform
from data_processing.utils import CLIArgumentProvider
from ray.actor import ActorHandle
Expand All @@ -32,7 +32,7 @@ def __init__(self, params: dict[str, Any]):
self.params = params

def get_transform_config(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
) -> dict[str, Any]:
"""
Get the dictionary of configuration that will be provided to the transform's initializer.
Expand Down
2 changes: 0 additions & 2 deletions transforms/code/code_quality/src/code_quality_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

import argparse
import os
import sys

import numpy as np
import pyarrow
import pyarrow as pa
from bs4 import BeautifulSoup
from data_processing.ray import DefaultTableTransformConfiguration, TransformLauncher
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import pyarrow as pa
import ray
from data_processing.data_access import DataAccess, DataAccessFactory, DataAccessLocal
from data_processing.data_access import DataAccess, DataAccessFactoryBase, DataAccessFactory
from data_processing.ray import (
DefaultTableTransformConfiguration,
DefaultTableTransformRuntime,
Expand Down Expand Up @@ -120,7 +120,7 @@ def __init__(self, params: dict[str, Any]):

def get_transform_config(
self,
data_access_factory: DataAccessFactory,
data_access_factory: DataAccessFactoryBase,
statistics: ActorHandle,
files: list[str],
) -> dict[str, Any]:
Expand Down
4 changes: 2 additions & 2 deletions transforms/universal/doc_id/src/doc_id_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import pyarrow as pa
import ray
from data_processing.data_access import DataAccessFactory
from data_processing.data_access import DataAccessFactoryBase
from data_processing.ray import (
DefaultTableTransformConfiguration,
DefaultTableTransformRuntime,
Expand Down Expand Up @@ -128,7 +128,7 @@ def __init__(self, params: dict[str, Any]):
super().__init__(params)

def get_transform_config(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
) -> dict[str, Any]:
"""
Set environment for filter execution
Expand Down
4 changes: 2 additions & 2 deletions transforms/universal/ededup/src/ededup_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import pyarrow as pa
import ray
from data_processing.data_access import DataAccessFactory
from data_processing.data_access import DataAccessFactoryBase
from data_processing.ray import (
DefaultTableTransformConfiguration,
DefaultTableTransformRuntime,
Expand Down Expand Up @@ -178,7 +178,7 @@ def __init__(self, params: dict[str, Any]):
self.filters = []

def get_transform_config(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
) -> dict[str, Any]:
"""
Set environment for transform execution
Expand Down
12 changes: 6 additions & 6 deletions transforms/universal/fdedup/src/fdedup_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import numpy as np
import pyarrow as pa
import ray
from data_processing.data_access import DataAccessFactory
from data_processing.data_access import DataAccessFactoryBase
from data_processing.ray import (
DefaultTableTransformConfiguration,
DefaultTableTransformRuntime,
Expand Down Expand Up @@ -334,7 +334,7 @@ def __init__(self, params: dict[str, Any]):
self.random_delay_limit = self.params.get("random_delay_limit", 10)

def get_transform_config(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
) -> dict[str, Any]:
"""
Set environment for filter execution
Expand Down Expand Up @@ -369,7 +369,7 @@ def get_transform_config(
}

def _create_doc_actors(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
) -> None:
"""
Create document actors
Expand Down Expand Up @@ -420,7 +420,7 @@ def _create_doc_actors(
)

def _create_doc_actors_internal(
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, mn_min_hash: MurmurMH, files: list[str]
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, mn_min_hash: MurmurMH, files: list[str]
) -> None:
"""
Create document actors
Expand Down Expand Up @@ -495,7 +495,7 @@ def _create_doc_actors_internal(

def _process_buckets(
self,
data_access_factory: DataAccessFactory,
data_access_factory: DataAccessFactoryBase,
statistics: ActorHandle,
bucket_collectors: list[ActorHandle],
minhash_collectors: list[ActorHandle],
Expand Down Expand Up @@ -591,7 +591,7 @@ def _process_buckets(

def _preprocess_tables(
self,
data_access_factory: DataAccessFactory,
data_access_factory: DataAccessFactoryBase,
statistics: ActorHandle,
files: list[str],
mn_min_hash: MurmurMH,
Expand Down
2 changes: 1 addition & 1 deletion transforms/universal/filter/src/filter_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pyarrow as pa
from data_processing.ray import DefaultTableTransformConfiguration, TransformLauncher
from data_processing.transform import AbstractTableTransform
from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger
from data_processing.utils import CLIArgumentProvider, get_logger


logger = get_logger(__name__)
Expand Down