Skip to content

Commit

Permalink
Merge pull request #181 from revit13/minor12
Browse files: browse the repository at this point in the history
Fail the workflow if the input is empty (the estimated number of documents is zero).
  • Loading branch information
roytman committed May 23, 2024
2 parents 9a0133d + 6a266cc commit 3f102d0
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,6 +63,9 @@ def ededup_compute_execution_params(
sampling = data_access.sample_input_data(n_samples=n_samples)
avg_doc_size = sampling.get("average doc size KB")
number_of_docs = sampling.get("estimated number of docs")
if number_of_docs == 0:
print(f"Estimated number of documents and documents size is zero. Please verify the input path.")
sys.exit(1)
avg_table_size = sampling.get("average table size MB") / KB
# compute number of hashes
n_hashes = math.ceil(number_of_docs * 32 / GB)
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -135,6 +135,9 @@ def _false_negative_probability(ths: float, b: int, r: int) -> float:
avg_doc_size = sampling.get("average doc size KB")
number_of_docs = sampling.get("estimated number of docs")
avg_table_size = sampling.get("average table size MB") / KB
if number_of_docs == 0:
print(f"Estimated number of documents and documents size is zero. Please verify the input path.")
sys.exit(1)
# we are creating more buckets actors, so that we get better parallelization for bucket processing
b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB)
d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB)
Expand Down

0 comments on commit 3f102d0

Please sign in to comment.