diff --git a/datasets/open_buildings/infra/open_buildings_dataset.tf b/datasets/open_buildings/infra/open_buildings_dataset.tf new file mode 100644 index 000000000..8730c40f2 --- /dev/null +++ b/datasets/open_buildings/infra/open_buildings_dataset.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_dataset" "open_buildings" { + dataset_id = "open_buildings" + project = var.project_id + description = "A dataset of building footprints to support social good applications. This large-scale open dataset contains the outlines of buildings derived from high-resolution satellite imagery in order to support these types of uses. The project being based in Ghana, the current focus is on the continent of Africa." +} + +output "bigquery_dataset-open_buildings-dataset_id" { + value = google_bigquery_dataset.open_buildings.dataset_id +} diff --git a/datasets/open_buildings/infra/provider.tf b/datasets/open_buildings/infra/provider.tf new file mode 100644 index 000000000..dfb989c88 --- /dev/null +++ b/datasets/open_buildings/infra/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/open_buildings/infra/variables.tf b/datasets/open_buildings/infra/variables.tf new file mode 100644 index 000000000..46c6dacda --- /dev/null +++ b/datasets/open_buildings/infra/variables.tf @@ -0,0 +1,26 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} +variable "iam_policies" { + default = {} +} + diff --git a/datasets/open_buildings/pipelines/_images/run_script_kub/Dockerfile b/datasets/open_buildings/pipelines/_images/run_script_kub/Dockerfile new file mode 100644 index 000000000..797ecc506 --- /dev/null +++ b/datasets/open_buildings/pipelines/_images/run_script_kub/Dockerfile @@ -0,0 +1,38 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The base image for this build +FROM python:3.8 + +# Allow statements and log messages to appear in Cloud logs +ENV PYTHONUNBUFFERED True + +# Copy the requirements file into the image +COPY requirements.txt ./ + +# Install the packages specified in the requirements file +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +# The WORKDIR instruction sets the working directory for any RUN, CMD, +# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile. +# If the WORKDIR doesn’t exist, it will be created even if it’s not used in +# any subsequent Dockerfile instruction +WORKDIR /custom + +# Copy the specific data processing script/s in the image under /custom/* +COPY ./script.py . +COPY ./schema.json . 
+ +# Command to run the data processing script when the container is run +CMD ["python3", "script.py"] diff --git a/datasets/open_buildings/pipelines/_images/run_script_kub/requirements.txt b/datasets/open_buildings/pipelines/_images/run_script_kub/requirements.txt new file mode 100644 index 000000000..b8d99f1d5 --- /dev/null +++ b/datasets/open_buildings/pipelines/_images/run_script_kub/requirements.txt @@ -0,0 +1,6 @@ +requests +pandas +google-cloud-storage +google-cloud-bigquery +numpy + diff --git a/datasets/open_buildings/pipelines/_images/run_script_kub/schema.json b/datasets/open_buildings/pipelines/_images/run_script_kub/schema.json new file mode 100644 index 000000000..8aecf1b09 --- /dev/null +++ b/datasets/open_buildings/pipelines/_images/run_script_kub/schema.json @@ -0,0 +1,38 @@ +[ + { + "name": "latitude", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the latitude of the building polygon centroid." + }, + { + "name": "longitude", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the longitude of the building polygon centroid." + }, + { + "name": "area_in_meters", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the area in square meters of the polygon." + }, + { + "name": "confidence", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the confidence score [0.5;1.0] assigned by the model." + }, + { + "name": "geometry", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the building polygon in the WKT format (POLYGON)." + }, + { + "name": "full_plus_code", + "type": "STRING", + "mode": "NULLABLE", + "description": "This represents the full Plus Code(latitude longitude based) at the building polygon centroid." 
import json
import logging
import os

from google.api_core.exceptions import NotFound
from google.cloud import bigquery, storage


def main(source_gcs_path, project_id, dataset_id, gcs_bucket, schema_filepath) -> None:
    """Load every ``.csv`` object under ``source_gcs_path`` into BigQuery.

    One table is created per file; the table id is the file name without its
    ``.csv`` suffix. After all files have been processed, the ``.csv`` objects
    under ``source_gcs_path`` are deleted again (see ``cleanup``).

    Args:
        source_gcs_path: Object-name prefix inside ``gcs_bucket``. It is
            concatenated directly with file names, so it is expected to end
            with ``/``.
        project_id: Project that owns the destination dataset.
        dataset_id: Destination BigQuery dataset.
        gcs_bucket: Name of the bucket holding the source files.
        schema_filepath: Local path of the JSON schema used for every table.
    """
    source_file_names = fetch_gcs_file_names(source_gcs_path, gcs_bucket)
    for pipeline_name in source_file_names:
        # fetch_gcs_file_names only returns names ending in ".csv".
        table_id = pipeline_name[: -len(".csv")]
        logging.info(f"Started Extraction and Load process for {pipeline_name} --->")
        execute_pipeline(
            source_gcs_path,
            project_id,
            dataset_id,
            gcs_bucket,
            pipeline_name,
            table_id,
            schema_filepath,
        )
        logging.info(f"Finished process for {pipeline_name}")
    logging.info("Cleaning up extracted csv files in GCS. Source csv.gz files present.")
    cleanup(gcs_bucket, source_gcs_path)


def fetch_gcs_file_names(source_gcs_path, gcs_bucket):
    """Return the base names of all ``.csv`` objects under the given prefix.

    Args:
        source_gcs_path: Object-name prefix to list.
        gcs_bucket: Name of the bucket to list.

    Returns:
        A list of file names (the part after the last ``/``), one per object
        whose name ends in ``.csv``.
    """
    client = storage.Client()
    source_file_names = [
        blob.name.split("/")[-1]
        for blob in client.list_blobs(gcs_bucket, prefix=source_gcs_path)
        if blob.name.endswith(".csv")
    ]
    logging.info(f"{len(source_file_names)} tables to be loaded in bq")
    return source_file_names


def execute_pipeline(
    source_gcs_path,
    project_id,
    dataset_id,
    gcs_bucket,
    pipeline_name,
    table_id,
    schema_filepath,
):
    """Recreate the destination table for one file and load the file into it.

    Args:
        source_gcs_path: Object-name prefix of the source file.
        project_id: Project that owns the destination dataset.
        dataset_id: Destination BigQuery dataset.
        gcs_bucket: Name of the bucket holding the source file.
        pipeline_name: File name of the source object (``<table_id>.csv``).
        table_id: Destination table id.
        schema_filepath: Local path of the JSON schema file.

    Raises:
        ValueError: If the destination table could not be created.
    """
    logging.info(f"ETL started for {pipeline_name}")
    blob_name = source_gcs_path + pipeline_name
    # Check the exact object. The iterator returned by list_blobs() is always
    # truthy, and a prefix match would also hit the sibling "<name>.csv.gz".
    source_exists = storage.Client().bucket(gcs_bucket).blob(blob_name).exists()
    if not source_exists:
        logging.info(f"Informational: The data file {blob_name} is unavailable")
        return
    table_exists = create_dest_table(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        gcs_bucket=gcs_bucket,
        schema_filepath=schema_filepath,
        drop_table=True,
    )
    if not table_exists:
        error_msg = f"Error: Data was not loaded because the destination table {project_id}.{dataset_id}.{table_id} does not exist and/or could not be created."
        raise ValueError(error_msg)
    load_data_to_bq(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        gcs_bucket=gcs_bucket,
        source_gcs_path=source_gcs_path,
        truncate_table=True,
        field_delimiter="|",
    )


def create_dest_table(
    project_id: str,
    dataset_id: str,
    table_id: str,
    gcs_bucket: str,
    schema_filepath: str,
    drop_table: bool,
) -> bool:
    """Ensure the destination table exists, optionally recreating it.

    Args:
        project_id: Project that owns the destination dataset.
        dataset_id: Destination BigQuery dataset.
        table_id: Destination table id.
        gcs_bucket: Unused; kept so existing keyword callers keep working.
        schema_filepath: Local path of the JSON schema file. When empty, a
            missing table is not created.
        drop_table: When true, an existing table is deleted and recreated.

    Returns:
        True if the table exists when the function returns, False if it was
        missing (or dropped) and no schema path was given to create it.
    """
    table_ref = f"{project_id}.{dataset_id}.{table_id}"
    logging.info(f"Attempting to create table {table_ref} if it doesn't already exist")
    client = bigquery.Client()
    try:
        table = client.get_table(table_ref)
        logging.info(f"Table {table.table_id} currently exists.")
        if drop_table:
            logging.info("Dropping existing table")
            client.delete_table(table)
            table = None
    except NotFound:
        table = None
    if table is not None:
        return True
    logging.info(
        f"Table {table_ref} currently does not exist. Attempting to create table."
    )
    if not schema_filepath:
        logging.info(f"Schema {schema_filepath} file not found")
        return False
    schema = create_table_schema(schema_filepath)
    client.create_table(bigquery.Table(table_ref, schema=schema))
    logging.info(f"Table {table_id} was created")
    return True


def create_table_schema(schema_filepath) -> list:
    """Build a list of ``bigquery.SchemaField`` from a JSON schema file.

    The file must contain a JSON array of objects with ``name``, ``type`` and
    ``mode`` keys; ``description`` is optional and defaults to ``""``.

    Args:
        schema_filepath: Local path of the JSON schema file.

    Returns:
        One ``bigquery.SchemaField`` per entry, in file order.
    """
    logging.info("Defining table schema")
    with open(schema_filepath) as f:
        schema_struct = json.load(f)
    return [
        bigquery.SchemaField(
            name=schema_field["name"],
            field_type=schema_field["type"],
            mode=schema_field["mode"],
            description=schema_field.get("description", ""),
        )
        for schema_field in schema_struct
    ]


def load_data_to_bq(
    project_id: str,
    dataset_id: str,
    table_id: str,
    gcs_bucket: str,
    source_gcs_path: str,
    truncate_table: bool,
    field_delimiter: str = "|",
) -> None:
    """Load ``gs://<gcs_bucket>/<source_gcs_path><table_id>.csv`` into a table.

    The first row of the file is skipped as a header and the call blocks until
    the load job has finished.

    Args:
        project_id: Project that owns the destination dataset.
        dataset_id: Destination BigQuery dataset.
        table_id: Destination table id; also the source file's base name.
        gcs_bucket: Name of the bucket holding the source file.
        source_gcs_path: Object-name prefix of the source file.
        truncate_table: Currently not applied to the load job.
        field_delimiter: Currently not applied to the load job.
    """
    # NOTE(review): truncate_table and field_delimiter are accepted but never
    # passed to LoadJobConfig, so the job runs with BigQuery's defaults. The
    # only caller in this file passes field_delimiter="|"; wiring it in would
    # change how the files are parsed, so confirm the real delimiter first.
    logging.info(
        f"Loading output data from {source_gcs_path} into {project_id}.{dataset_id}.{table_id} ...."
    )
    client = bigquery.Client(project=project_id)
    table_ref = f"{project_id}.{dataset_id}.{table_id}"
    job_config = bigquery.LoadJobConfig(
        skip_leading_rows=1, source_format=bigquery.SourceFormat.CSV
    )
    job = client.load_table_from_uri(
        f"gs://{gcs_bucket}/{source_gcs_path}{table_id}.csv",
        table_ref,
        job_config=job_config,
    )
    logging.info(job.result())
    logging.info("Loading table completed")


def cleanup(gcs_bucket, source_gcs_path):
    """Delete every ``.csv`` object under ``source_gcs_path`` in the bucket.

    Objects with other suffixes (for example ``.csv.gz``) are left in place.

    Args:
        gcs_bucket: Name of the bucket to clean.
        source_gcs_path: Object-name prefix to clean.
    """
    client = storage.Client()
    for blob in client.list_blobs(gcs_bucket, prefix=source_gcs_path):
        if blob.name.endswith(".csv"):
            blob.delete()


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    main(
        source_gcs_path=os.environ.get("SOURCE_GCS_PATH"),
        project_id=os.environ.get("PROJECT_ID"),
        dataset_id=os.environ.get("DATASET_ID"),
        gcs_bucket=os.environ.get("GCS_BUCKET"),
        schema_filepath=os.environ.get("SCHEMA_FILEPATH"),
    )
+ +dataset: + name: open_buildings + friendly_name: Open Buildings + description: A dataset of building footprints to support social good applications. This large-scale open dataset contains the outlines of buildings derived from high-resolution satellite imagery in order to support these types of uses. The project being based in Ghana, the current focus is on the continent of Africa. + dataset_sources: https://sites.research.google/open-buildings/#download + terms_of_use: ~ + + +resources: + - type: bigquery_dataset + dataset_id: open_buildings + description: A dataset of building footprints to support social good applications. This large-scale open dataset contains the outlines of buildings derived from high-resolution satellite imagery in order to support these types of uses. The project being based in Ghana, the current focus is on the continent of Africa. diff --git a/datasets/open_buildings/pipelines/open_buildings/open_buildings_dag.py b/datasets/open_buildings/pipelines/open_buildings/open_buildings_dag.py new file mode 100644 index 000000000..30bc82318 --- /dev/null +++ b/datasets/open_buildings/pipelines/open_buildings/open_buildings_dag.py @@ -0,0 +1,103 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from airflow import DAG
from airflow.operators import bash
from airflow.providers.cncf.kubernetes.operators import kubernetes_pod

default_args = {
    "owner": "Google",
    "depends_on_past": False,
    "start_date": "2022-08-19",
}

# Local directory the gunzip tasks read the copied *.csv.gz files from.
# NOTE(review): presumably the worker-side mount of the bucket's data/ folder
# that bash_gcs_to_gcs copies into -- confirm against the environment.
SOURCE_DIR = (
    "/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip"
)

# File-name prefixes (<prefix>_buildings.csv.gz) handled by each gunzip task.
BATCH1_TILES = (
    "025", "04f", "05b", "093", "095", "0c3", "0c5", "0c7", "0d1",
    "0d7", "0d9", "0db", "0dd", "0df", "0e1", "0e3", "0e5", "0e7",
    "0e9", "0eb", "0ed", "0ef", "0f1", "0f9", "0fb", "0fd", "0ff",
)
BATCH2_TILES = (
    "103", "105", "107", "109", "10b", "10d", "10f", "111", "113",
    "117", "119", "11b", "11d", "11f", "121", "123", "125", "127",
    "129", "12f", "131", "137", "139", "13b", "13d", "141", "143",
)
BATCH3_TILES = (
    "145", "147", "149", "14f", "15b", "15d", "161", "163", "165",
    "167", "169", "16b", "16d", "16f", "171", "173", "175", "177",
    "179", "17b", "17d", "17f", "181", "183", "185", "189", "18b",
)
BATCH4_TILES = (
    "18d", "18f", "191", "193", "195", "197", "199", "19b", "19d",
    "19f", "1a1", "1a3", "1a5", "1a7", "1a9", "1b9", "1bb", "1bd",
    "1bf", "1c1", "1c3", "1c5", "1c7", "1dd", "1e5", "1e7", "1e9",
)
BATCH5_TILES = (
    "1eb", "1ed", "1ef", "1f1", "1f3", "1f5", "1f7", "217", "219",
    "21d", "21f", "221", "223", "225", "227", "22f", "231", "233",
    "23b", "23d", "23f", "3d1", "3d5", "3d7", "3d9", "3db", "b5b",
)


def _gunzip_command(tile_ids):
    """Return one shell command line that gunzips every listed file.

    Each file is decompressed with ``gunzip -v -k`` (the ``.gz`` is kept) and
    each invocation is followed by `` ; ``, including the last one.
    """
    # NOTE(review): because the commands are chained with ';', the shell's
    # exit status is that of the last gunzip only, so an earlier failure does
    # not fail the task. Left unchanged here.
    return "".join(
        f"gunzip -v -k {SOURCE_DIR}/{tile_id}_buildings.csv.gz ; "
        for tile_id in tile_ids
    )


with DAG(
    dag_id="open_buildings.open_buildings",
    default_args=default_args,
    max_active_runs=1,
    schedule_interval="@yearly",
    catchup=False,
    default_view="graph",
) as dag:

    # Fetch data gcs - gcs
    # The destination bucket comes from the same Airflow variable the load
    # task receives as GCS_BUCKET instead of a hard-coded dev bucket name.
    # NOTE(review): assumes that variable names the bucket the gunzip tasks
    # and the load task operate on -- confirm before deploying.
    bash_gcs_to_gcs = bash.BashOperator(
        task_id="bash_gcs_to_gcs",
        bash_command="gsutil cp -R gs://open-buildings-data/v1/polygons_s2_level_4_gzip gs://{{ var.json.open_buildings.gcs_bucket }}/data/open_buildings/source_files/",
    )

    # Unzip data
    batch1_bash_gunzip = bash.BashOperator(
        task_id="batch1_bash_gunzip",
        bash_command=_gunzip_command(BATCH1_TILES),
    )

    # Unzip data
    batch2_bash_gunzip = bash.BashOperator(
        task_id="batch2_bash_gunzip",
        bash_command=_gunzip_command(BATCH2_TILES),
    )

    # Unzip data
    batch3_bash_gunzip = bash.BashOperator(
        task_id="batch3_bash_gunzip",
        bash_command=_gunzip_command(BATCH3_TILES),
    )

    # Unzip data
    batch4_bash_gunzip = bash.BashOperator(
        task_id="batch4_bash_gunzip",
        bash_command=_gunzip_command(BATCH4_TILES),
    )

    # Unzip data
    batch5_bash_gunzip = bash.BashOperator(
        task_id="batch5_bash_gunzip",
        bash_command=_gunzip_command(BATCH5_TILES),
    )

    # ETL within the kubernetes pod
    py_gcs_to_bq = kubernetes_pod.KubernetesPodOperator(
        task_id="py_gcs_to_bq",
        startup_timeout_seconds=1000,
        name="load_data",
        namespace="composer",
        service_account_name="datasets",
        image_pull_policy="Always",
        image="{{ var.json.open_buildings.container_registry.run_script_kub }}",
        env_vars={
            "SOURCE_GCS_PATH": "{{ var.json.open_buildings.source_gcs_path }}",
            "PROJECT_ID": "{{ var.json.open_buildings.project_id }}",
            "DATASET_ID": "{{ var.json.open_buildings.dataset_id }}",
            "GCS_BUCKET": "{{ var.json.open_buildings.gcs_bucket }}",
            "SCHEMA_FILEPATH": "schema.json",
        },
        resources={
            "request_memory": "2G",
            "request_cpu": "1",
            "request_ephemeral_storage": "10G",
        },
    )

    # Copy first, decompress batch by batch, then load everything.
    (
        bash_gcs_to_gcs
        >> batch1_bash_gunzip
        >> batch2_bash_gunzip
        >> batch3_bash_gunzip
        >> batch4_bash_gunzip
        >> batch5_bash_gunzip
        >> py_gcs_to_bq
    )
b/datasets/open_buildings/pipelines/open_buildings/pipeline.yaml new file mode 100644 index 000000000..2f002a861 --- /dev/null +++ b/datasets/open_buildings/pipelines/open_buildings/pipeline.yaml @@ -0,0 +1,220 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + +dag: + airflow_version: 2 + initialize: + dag_id: open_buildings + default_args: + owner: "Google" + depends_on_past: False + start_date: "2022-08-19" + max_active_runs: 1 + schedule_interval: "@yearly" + catchup: False + default_view: graph + + tasks: + - operator: "BashOperator" + description: "Fetch data gcs - gcs" + args: + task_id: "bash_gcs_to_gcs" + bash_command: "gsutil cp -R gs://open-buildings-data/v1/polygons_s2_level_4_gzip gs://us-central1-dev-v2-cd7f5f38-bucket/data/open_buildings/source_files/" + - operator: "BashOperator" + description: "Unzip data" + args: + task_id: "batch1_bash_gunzip" + bash_command: + "gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/025_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/04f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/05b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/093_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/095_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0c3_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0c5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0c7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0d1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0d7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0d9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0db_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0dd_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0df_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0e1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0e3_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0e5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0e7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0e9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0eb_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0ed_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0ef_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0f1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0f9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0fb_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0fd_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/0ff_buildings.csv.gz ; " + - operator: "BashOperator" + description: "Unzip data" + args: + task_id: "batch2_bash_gunzip" + bash_command: + "gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/103_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/105_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/107_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/109_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/10b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/10d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/10f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/111_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/113_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/117_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/119_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/11b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/11d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/11f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/121_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/123_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/125_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/127_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/129_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/12f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/131_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/137_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/139_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/13b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/13d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/141_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/143_buildings.csv.gz ; " + - operator: "BashOperator" + description: "Unzip data" + args: + 
task_id: "batch3_bash_gunzip" + bash_command: + "gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/145_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/147_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/149_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/14f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/15b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/15d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/161_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/163_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/165_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/167_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/169_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/16b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/16d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/16f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/171_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/173_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/175_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/177_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/179_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/17b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/17d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/17f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/181_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/183_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/185_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/189_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/18b_buildings.csv.gz ; " + - operator: "BashOperator" + description: "Unzip data" + args: + task_id: "batch4_bash_gunzip" + bash_command: + "gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/18d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/18f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/191_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/193_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/195_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/197_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/199_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/19b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/19d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/19f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1a1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1a3_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1a5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1a7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1a9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1b9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1bb_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1bd_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1bf_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1c1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1c3_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1c5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1c7_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1dd_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1e5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1e7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1e9_buildings.csv.gz ; " + - operator: "BashOperator" + description: "Unzip data" + args: + task_id: "batch5_bash_gunzip" + bash_command: + "gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1eb_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1ed_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1ef_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1f1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1f3_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1f5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/1f7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/217_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/219_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/21d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/21f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/221_buildings.csv.gz ; + gunzip -v -k 
/home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/223_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/225_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/227_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/22f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/231_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/233_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/23b_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/23d_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/23f_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/3d1_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/3d5_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/3d7_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/3d9_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/3db_buildings.csv.gz ; + gunzip -v -k /home/airflow/gcs/data/open_buildings/source_files/polygons_s2_level_4_gzip/b5b_buildings.csv.gz ; " + - operator: "KubernetesPodOperator" + description: "ETL within the kubernetes pod" + args: + task_id: "py_gcs_to_bq" + startup_timeout_seconds: 1000 + name: "load_data" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ 
var.json.open_buildings.container_registry.run_script_kub }}" + env_vars: + SOURCE_GCS_PATH: "{{ var.json.open_buildings.source_gcs_path }}" + PROJECT_ID: "{{ var.json.open_buildings.project_id }}" + DATASET_ID: "{{ var.json.open_buildings.dataset_id }}" + GCS_BUCKET: "{{ var.json.open_buildings.gcs_bucket }}" + SCHEMA_FILEPATH: "schema.json" + resources: + request_memory: "2G" + request_cpu: "1" + request_ephemeral_storage: "10G" + + graph_paths: + - "bash_gcs_to_gcs >> batch1_bash_gunzip >> batch2_bash_gunzip >> batch3_bash_gunzip >> batch4_bash_gunzip >> batch5_bash_gunzip >> py_gcs_to_bq"