diff --git a/datasets/idc/_images/copy_bq_datasets/script.py b/datasets/idc/_images/copy_bq_datasets/script.py index 89509167f..928ebb142 100644 --- a/datasets/idc/_images/copy_bq_datasets/script.py +++ b/datasets/idc/_images/copy_bq_datasets/script.py @@ -153,7 +153,7 @@ def trigger_config( ) -> None: now = time.time() seconds = int(now) - nanos = int((now - seconds) * 10 ** 9) + nanos = int((now - seconds) * 10**9) try: client.start_manual_transfer_runs( diff --git a/datasets/travel_sustainability/_terraform/flight_emissions_pipeline.tf b/datasets/travel_sustainability/_terraform/flight_emissions_pipeline.tf new file mode 100644 index 000000000..9e2cdf0e6 --- /dev/null +++ b/datasets/travel_sustainability/_terraform/flight_emissions_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "travel_sustainability_flight_emissions" { + project = var.project_id + dataset_id = "travel_sustainability" + table_id = "flight_emissions" + + description = "Flight emissions data" + + + + + depends_on = [ + google_bigquery_dataset.travel_sustainability + ] +} + +output "bigquery_table-travel_sustainability_flight_emissions-table_id" { + value = google_bigquery_table.travel_sustainability_flight_emissions.table_id +} + +output "bigquery_table-travel_sustainability_flight_emissions-id" { + value = google_bigquery_table.travel_sustainability_flight_emissions.id +} diff --git a/datasets/travel_sustainability/_terraform/metadata_pipeline.tf b/datasets/travel_sustainability/_terraform/metadata_pipeline.tf new file mode 100644 index 000000000..94ef68094 --- /dev/null +++ b/datasets/travel_sustainability/_terraform/metadata_pipeline.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "travel_sustainability_metadata" { + project = var.project_id + dataset_id = "travel_sustainability" + table_id = "metadata" + + description = "Metadata about the dataset" + + + + + depends_on = [ + google_bigquery_dataset.travel_sustainability + ] +} + +output "bigquery_table-travel_sustainability_metadata-table_id" { + value = google_bigquery_table.travel_sustainability_metadata.table_id +} + +output "bigquery_table-travel_sustainability_metadata-id" { + value = google_bigquery_table.travel_sustainability_metadata.id +} diff --git a/datasets/travel_sustainability/_terraform/provider.tf b/datasets/travel_sustainability/_terraform/provider.tf new file mode 100644 index 000000000..23ab87dcd --- /dev/null +++ b/datasets/travel_sustainability/_terraform/provider.tf @@ -0,0 +1,28 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +provider "google" { + project = var.project_id + impersonate_service_account = var.impersonating_acct + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/travel_sustainability/_terraform/travel_sustainability_dataset.tf b/datasets/travel_sustainability/_terraform/travel_sustainability_dataset.tf new file mode 100644 index 000000000..4d78dfa94 --- /dev/null +++ b/datasets/travel_sustainability/_terraform/travel_sustainability_dataset.tf @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_dataset" "travel_sustainability" { + dataset_id = "travel_sustainability" + project = var.project_id + description = "Travel Sustainability" +} + +output "bigquery_dataset-travel_sustainability-dataset_id" { + value = google_bigquery_dataset.travel_sustainability.dataset_id +} + +resource "google_storage_bucket" "travel-sustainability" { + name = "${var.bucket_name_prefix}-travel-sustainability" + force_destroy = true + location = "US" + uniform_bucket_level_access = true + lifecycle { + ignore_changes = [ + logging, + ] + } +} + +output "storage_bucket-travel-sustainability-name" { + value = google_storage_bucket.travel-sustainability.name +} diff --git a/datasets/travel_sustainability/_terraform/variables.tf b/datasets/travel_sustainability/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/travel_sustainability/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/travel_sustainability/dataset.yaml b/datasets/travel_sustainability/dataset.yaml new file mode 100644 index 000000000..f5fb8a42c --- /dev/null +++ b/datasets/travel_sustainability/dataset.yaml @@ -0,0 +1,83 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dataset: + # The `dataset` block includes properties for your dataset that will be shown + # to users of your data on the Google Cloud website. + + # Must be exactly the same name as the folder name your dataset.yaml is in. + name: travel_sustainability + + # A friendly, human-readable name of the dataset + friendly_name: ~ + + # A short, descriptive summary of the dataset. + description: ~ + + # A list of sources the dataset is derived from, using the YAML list syntax. + dataset_sources: ~ + + # A list of terms and conditions that users of the dataset should agree on, + # using the YAML list syntax. + terms_of_use: ~ +resources: + # A list of Google Cloud resources needed by your dataset. In principle, all + # pipelines under a dataset should be able to share these resources. + # + # The currently supported resources are shown below. Use only the resources + # you need, and delete the rest as needed by your pipeline. + # + # We will keep adding to the list below to support more Google Cloud resources + # over time. 
If a resource you need isn't supported, please file an issue on + # the repository. + + - type: bigquery_dataset + # Google BigQuery dataset to namespace all tables managed by this folder + # + # Required Properties: + # dataset_id + # + # Optional Properties: + # friendly_name (A user-friendly name of the dataset) + # description (A user-friendly description of the dataset) + # location (The geographic location where the dataset should reside) + dataset_id: travel_sustainability + description: "Travel Sustainability" + + - type: storage_bucket + # Google Cloud Storage Bucket that your pipelines need. Say, you need an + # intermediate bucket to store data in-flight. Or you need a bucket to + # archive or backup data generated by the pipelines. + # + # Because Cloud Storage bucket names must be globally unique, the `name` + # specified below will be mapped to the unique bucket name: + # + # `{bucket_name_prefix}-{name}` + # + # Where the bucket name prefix is supplied using the `--bucket-name-prefix` + # parameter when running `scripts/generate_terraform.py`. + # + # Use hyphenated syntax, e.g. `some-prefix-123`, for the names. Note that + # bucket names must not contain "google" or close misspellings, such as + # "g00gle". + # + # Required Properties: + # name + # + # Optional Properties: + # location + # uniform_bucket_level_access (we suggest False for fine-grained access) + name: travel-sustainability + location: US + uniform_bucket_level_access: true diff --git a/datasets/travel_sustainability/flight_emissions/flight_emissions_dag.py b/datasets/travel_sustainability/flight_emissions/flight_emissions_dag.py new file mode 100644 index 000000000..5decb468f --- /dev/null +++ b/datasets/travel_sustainability/flight_emissions/flight_emissions_dag.py @@ -0,0 +1,78 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-01-09", +} + + +with DAG( + dag_id="travel_sustainability.flight_emissions", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 15 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Task to load CSV data to a BigQuery table + flight_emissions_gcs_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="flight_emissions_gcs_to_bq", + bucket="{{ var.json.travel_sustainability.source_bucket }}", + source_objects=["flight_emissions.csv"], + source_format="CSV", + destination_project_dataset_table="travel_sustainability.flight_emissions", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "origin", + "type": "STRING", + "mode": "REQUIRED", + "description": "IATA code for origin airport", + }, + { + "name": "destination", + "type": "STRING", + "mode": "REQUIRED", + "description": "IATA code for destination airport", + }, + { + "name": "aircraft_model", + "type": "STRING", + "mode": "REQUIRED", + "description": "IATA code for aircraft model", + }, + { + "name": "co2e_total_grams", + "type": "INTEGER", + "mode": "REQUIRED", + "description": "Total grams of CO2e estimated for the flight including non-CO2 effects", + }, + { + "name": "co2_total_grams", + "type": "INTEGER", + "mode": "REQUIRED", + "description": "Total grams of CO2 estimated for the flight", + }, + ], + ) + + flight_emissions_gcs_to_bq 
diff --git a/datasets/travel_sustainability/flight_emissions/pipeline.yaml b/datasets/travel_sustainability/flight_emissions/pipeline.yaml new file mode 100644 index 000000000..7ed1a4a9c --- /dev/null +++ b/datasets/travel_sustainability/flight_emissions/pipeline.yaml @@ -0,0 +1,97 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +resources: + # A list of GCP resources that are unique and specific to your pipeline. + # + # The currently supported resources are shown below. Use only the resources + # needed by your pipeline, and delete the rest of the examples. + # + # We will keep adding to the list below to support more Google Cloud resources + # over time. If a resource you need isn't supported, please file an issue on + # the repository. + - type: bigquery_table + # A Google BigQuery table to store your data. Requires a `bigquery_dataset` + # to be specified in the config (i.e. `dataset.yaml`) for the dataset that + # this pipeline belongs in. + table_id: flight_emissions + description: "Flight emissions data" + +dag: + # [Required] Specify the Airflow version of the operators used by the DAG. 
+ airflow_version: 2 + + initialize: + dag_id: flight_emissions + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-01-09' + max_active_runs: 1 + schedule_interval: "0 15 * * *" # Daily at 3pm UTC + catchup: False + default_view: graph + + tasks: + - operator: "GoogleCloudStorageToBigQueryOperator" + # Initializes GCS to BQ task for the DAG. This operator is used to load + # JSON, CSV, Avro, ORC, or Parquet data from GCS into a BigQuery table. + + # Task description + description: "Task to load CSV data to a BigQuery table" + + # Arguments supported by this operator: + # http://airflow.apache.org/docs/apache-airflow/stable/howto/operator/gcp/gcs.html#googlecloudstoragetobigqueryoperator + args: + task_id: "flight_emissions_gcs_to_bq" + + # The GCS bucket where the CSV file is located. + bucket: "{{ var.json.travel_sustainability.source_bucket }}" + + # The CSV object to load; the object name is fixed, not date-stamped + source_objects: ["flight_emissions.csv"] + source_format: "CSV" + destination_project_dataset_table: "travel_sustainability.flight_emissions" + + # Use this if your CSV file contains a header row + skip_leading_rows: 1 + + # How to write data to the table: overwrite, append, or write if empty + # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition + write_disposition: "WRITE_TRUNCATE" + + schema_fields: + - name: "origin" + type: "STRING" + mode: "REQUIRED" + description: "IATA code for origin airport" + - name: "destination" + type: "STRING" + mode: "REQUIRED" + description: "IATA code for destination airport" + - name: "aircraft_model" + type: "STRING" + mode: "REQUIRED" + description: "IATA code for aircraft model" + - name: "co2e_total_grams" + type: "INTEGER" + mode: "REQUIRED" + description: "Total grams of CO2e estimated for the flight including non-CO2 effects" + - name: "co2_total_grams" + type: "INTEGER" + mode: "REQUIRED" + description: "Total grams of CO2 estimated for the flight" + 
+ graph_paths: + - "flight_emissions_gcs_to_bq" diff --git a/datasets/travel_sustainability/metadata/metadata_dag.py b/datasets/travel_sustainability/metadata/metadata_dag.py new file mode 100644 index 000000000..d440a4868 --- /dev/null +++ b/datasets/travel_sustainability/metadata/metadata_dag.py @@ -0,0 +1,60 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.google.cloud.transfers import gcs_to_bigquery + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-01-09", +} + + +with DAG( + dag_id="travel_sustainability.metadata", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 15 * * *", + catchup=False, + default_view="graph", +) as dag: + + # Task to load CSV data to a BigQuery table + metadata_gcs_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="metadata_gcs_to_bq", + bucket="{{ var.json.travel_sustainability.source_bucket }}", + source_objects=["metadata.csv"], + source_format="CSV", + destination_project_dataset_table="travel_sustainability.metadata", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "key", + "type": "STRING", + "mode": "REQUIRED", + "description": "Key of the entry", + }, + { + "name": "value", + "type": "STRING", + "mode": "REQUIRED", + "description": "Value of the entry", + }, + ], + ) + + metadata_gcs_to_bq diff --git 
a/datasets/travel_sustainability/metadata/pipeline.yaml b/datasets/travel_sustainability/metadata/pipeline.yaml new file mode 100644 index 000000000..88f89879e --- /dev/null +++ b/datasets/travel_sustainability/metadata/pipeline.yaml @@ -0,0 +1,85 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +resources: + # A list of GCP resources that are unique and specific to your pipeline. + # + # The currently supported resources are shown below. Use only the resources + # needed by your pipeline, and delete the rest of the examples. + # + # We will keep adding to the list below to support more Google Cloud resources + # over time. If a resource you need isn't supported, please file an issue on + # the repository. + - type: bigquery_table + # A Google BigQuery table to store your data. Requires a `bigquery_dataset` + # to be specified in the config (i.e. `dataset.yaml`) for the dataset that + # this pipeline belongs in. + table_id: metadata + description: "Metadata about the dataset" + +dag: + # [Required] Specify the Airflow version of the operators used by the DAG. + airflow_version: 2 + + initialize: + dag_id: metadata + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-01-09' + max_active_runs: 1 + schedule_interval: "0 15 * * *" # Daily at 3pm UTC + catchup: False + default_view: graph + + tasks: + - operator: "GoogleCloudStorageToBigQueryOperator" + # Initializes GCS to BQ task for the DAG. 
This operator is used to load + # JSON, CSV, Avro, ORC, or Parquet data from GCS into a BigQuery table. + + # Task description + description: "Task to load CSV data to a BigQuery table" + + # Arguments supported by this operator: + # http://airflow.apache.org/docs/apache-airflow/stable/howto/operator/gcp/gcs.html#googlecloudstoragetobigqueryoperator + args: + task_id: "metadata_gcs_to_bq" + + # The GCS bucket where the CSV file is located. + bucket: "{{ var.json.travel_sustainability.source_bucket }}" + + # The CSV object to load; the object name is fixed, not date-stamped + source_objects: ["metadata.csv"] + source_format: "CSV" + destination_project_dataset_table: "travel_sustainability.metadata" + + # Use this if your CSV file contains a header row + skip_leading_rows: 1 + + # How to write data to the table: overwrite, append, or write if empty + # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition + write_disposition: "WRITE_TRUNCATE" + + schema_fields: + - name: "key" + type: "STRING" + mode: "REQUIRED" + description: "Key of the entry" + - name: "value" + type: "STRING" + mode: "REQUIRED" + description: "Value of the entry" + + graph_paths: + - "metadata_gcs_to_bq"