From 802cff6d564bd42defa8a7b795e904a993032fc3 Mon Sep 17 00:00:00 2001 From: Shahin Saadati Date: Fri, 7 Oct 2022 10:20:25 -0700 Subject: [PATCH] Fix: Added "is_public" to cloud_datasets.tabular_datasets table (#501) --- .../infra/pdp_extract_tabular_metadata_pipeline.tf | 7 ++++++- .../_images/pdp_extract_tabular_metadata/script.py | 8 +++++++- .../pipelines/pdp_extract_tabular_metadata/pipeline.yaml | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf b/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf index a02e78c45..a50f8c295 100644 --- a/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf +++ b/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf @@ -56,7 +56,12 @@ resource "google_bigquery_table" "_cloud_datasets_tabular_datasets" { "name": "num_tables", "description": "Number of tables contained in this dataset", "type": "INTEGER" - } + }, + { + "name": "is_public", + "description": "Whether or not the dataset is public to all users", + "type": "BOOLEAN" + } ] EOF depends_on = [ diff --git a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py index 1b6e2de45..e587210ca 100644 --- a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py +++ b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py @@ -35,6 +35,7 @@ "dataset_id", "description", "num_tables", + "is_public", ] TABLES_COLUMNS = [ @@ -148,6 +149,7 @@ class DatasetInfo: dataset_id: str = None description: str = None num_tables: int = None + is_public: bool = None def __init__( self, @@ -161,6 +163,10 @@ def __init__( self.description = np.nan self.created_at = dataset_reference.created self.modified_at = dataset_reference.modified + entries = list(dataset_reference.access_entries) + self.is_public = any( + map(lambda e: e.entity_id in {"allAuthenticatedUsers", "allUsers"}, entries) + ) def __repr__(self) -> str: return f"{self.project_id}.{self.dataset_id}" @@ -344,7 +350,7 @@ def main( extractor.write_datasets_to_bq(tabular_dataset_table_name, extracted) extractor.write_tables_to_bq(tables_table_name, extracted) extractor.write_tables_fields_to_bq(tables_fields_table_name, extracted) - logging.info("Total time to run this function: ", time.time() - st) + logging.info("Total time to run this function: %s", time.time() - st) if __name__ == "__main__": diff --git a/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml b/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml index 385326c27..65740f32e 100644 --- a/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml +++ b/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml @@ -54,6 +54,11 @@ resources: "name": "num_tables", "description": "Number of tables contained in this dataset", "type": "INTEGER" + }, + { + "name": "is_public", + "description": "Whether or not the dataset is public to all users", + "type": "BOOLEAN" } ] - type: bigquery_table