From 802cff6d564bd42defa8a7b795e904a993032fc3 Mon Sep 17 00:00:00 2001
From: Shahin Saadati <happyhuman@users.noreply.github.com>
Date: Fri, 7 Oct 2022 10:20:25 -0700
Subject: [PATCH] Fix: Added "is_public" to cloud_datasets.tabular_datasets
 table (#501)

---
 .../infra/pdp_extract_tabular_metadata_pipeline.tf        | 7 ++++++-
 .../_images/pdp_extract_tabular_metadata/script.py        | 8 +++++++-
 .../pipelines/pdp_extract_tabular_metadata/pipeline.yaml  | 5 +++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf b/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf
index a02e78c45..a50f8c295 100644
--- a/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf
+++ b/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf
@@ -56,7 +56,12 @@ resource "google_bigquery_table" "_cloud_datasets_tabular_datasets" {
       "name": "num_tables",
       "description": "Number of tables contained in this dataset",
       "type": "INTEGER"
-  }
+  },
+  {
+      "name": "is_public",
+      "description": "Whether or not the dataset is public to all users",
+      "type": "BOOLEAN"
+  }      
 ]
     EOF
   depends_on = [
diff --git a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py
index 1b6e2de45..e587210ca 100644
--- a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py
+++ b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/script.py
@@ -35,6 +35,7 @@
     "dataset_id",
     "description",
     "num_tables",
+    "is_public",
 ]
 
 TABLES_COLUMNS = [
@@ -148,6 +149,7 @@ class DatasetInfo:
     dataset_id: str = None
     description: str = None
     num_tables: int = None
+    is_public: bool = None
 
     def __init__(
         self,
@@ -161,6 +163,10 @@ def __init__(
             self.description = np.nan
         self.created_at = dataset_reference.created
         self.modified_at = dataset_reference.modified
+        entries = list(dataset_reference.access_entries)
+        self.is_public = any(
+            map(lambda e: e.entity_id in {"allAuthenticatedUsers", "allUsers"}, entries)
+        )
 
     def __repr__(self) -> str:
         return f"{self.project_id}.{self.dataset_id}"
@@ -344,7 +350,7 @@ def main(
         extractor.write_datasets_to_bq(tabular_dataset_table_name, extracted)
         extractor.write_tables_to_bq(tables_table_name, extracted)
         extractor.write_tables_fields_to_bq(tables_fields_table_name, extracted)
-    logging.info("Total time to run this function: ", time.time() - st)
+    logging.info("Total time to run this function: %s", time.time() - st)
 
 
 if __name__ == "__main__":
diff --git a/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml b/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml
index 385326c27..65740f32e 100644
--- a/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml
+++ b/datasets/cloud_datasets/pipelines/pdp_extract_tabular_metadata/pipeline.yaml
@@ -54,6 +54,11 @@ resources:
             "name": "num_tables",
             "description": "Number of tables contained in this dataset",
             "type": "INTEGER"
+        },
+        {
+            "name": "is_public",
+            "description": "Whether or not the dataset is public to all users",
+            "type": "BOOLEAN"
         }
       ]
   - type: bigquery_table