Skip to content

Commit

Permalink
feat: Onboard Travel Sustainability dataset (#280)
Browse files Browse the repository at this point in the history
  • Loading branch information
noman-aalian committed Feb 4, 2022
1 parent 7227d42 commit 8e9731a
Show file tree
Hide file tree
Showing 11 changed files with 575 additions and 1 deletion.
2 changes: 1 addition & 1 deletion datasets/idc/_images/copy_bq_datasets/script.py
Expand Up @@ -153,7 +153,7 @@ def trigger_config(
) -> None:
now = time.time()
seconds = int(now)
nanos = int((now - seconds) * 10 ** 9)
nanos = int((now - seconds) * 10**9)

try:
client.start_manual_transfer_runs(
Expand Down
@@ -0,0 +1,39 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery table "flight_emissions" inside the travel_sustainability dataset.
# No schema is declared on the resource itself.
resource "google_bigquery_table" "travel_sustainability_flight_emissions" {
  project     = var.project_id
  dataset_id  = "travel_sustainability"
  table_id    = "flight_emissions"
  description = "Flight emissions data"

  # dataset_id above is a string literal, not a reference, so the ordering
  # against the dataset resource has to be stated explicitly.
  depends_on = [
    google_bigquery_dataset.travel_sustainability,
  ]
}

# Exposes the table_id attribute of the flight_emissions table resource.
output "bigquery_table-travel_sustainability_flight_emissions-table_id" {
  value = google_bigquery_table.travel_sustainability_flight_emissions.table_id
}

# Exposes the id attribute of the flight_emissions table resource.
output "bigquery_table-travel_sustainability_flight_emissions-id" {
  value = google_bigquery_table.travel_sustainability_flight_emissions.id
}
39 changes: 39 additions & 0 deletions datasets/travel_sustainability/_terraform/metadata_pipeline.tf
@@ -0,0 +1,39 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery table "metadata" inside the travel_sustainability dataset.
# No schema is declared on the resource itself.
resource "google_bigquery_table" "travel_sustainability_metadata" {
  project     = var.project_id
  dataset_id  = "travel_sustainability"
  table_id    = "metadata"
  description = "Metadata about the dataset"

  # dataset_id above is a string literal, not a reference, so the ordering
  # against the dataset resource has to be stated explicitly.
  depends_on = [
    google_bigquery_dataset.travel_sustainability,
  ]
}

# Exposes the table_id attribute of the metadata table resource.
output "bigquery_table-travel_sustainability_metadata-table_id" {
  value = google_bigquery_table.travel_sustainability_metadata.table_id
}

# Exposes the id attribute of the metadata table resource.
output "bigquery_table-travel_sustainability_metadata-id" {
  value = google_bigquery_table.travel_sustainability_metadata.id
}
28 changes: 28 additions & 0 deletions datasets/travel_sustainability/_terraform/provider.tf
@@ -0,0 +1,28 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Google provider configuration. Every setting is supplied through an input
# variable; the provider acts through the service account named in
# var.impersonating_acct.
provider "google" {
  project                     = var.project_id
  region                      = var.region
  impersonate_service_account = var.impersonating_acct
}

# OpenID userinfo data source with no arguments; its email attribute is
# published by the "impersonating-account" output.
# NOTE(review): presumably this reflects the credentials the provider runs
# with (i.e. the impersonated account) — confirm against the provider docs.
data "google_client_openid_userinfo" "me" {
}

# Exposes the email attribute returned by the OpenID userinfo data source.
output "impersonating-account" {
  value = data.google_client_openid_userinfo.me.email
}
@@ -0,0 +1,42 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# BigQuery dataset that namespaces the travel_sustainability tables.
resource "google_bigquery_dataset" "travel_sustainability" {
  project     = var.project_id
  dataset_id  = "travel_sustainability"
  description = "Travel Sustainability"
}

# Exposes the dataset_id attribute of the travel_sustainability dataset.
output "bigquery_dataset-travel_sustainability-dataset_id" {
  value = google_bigquery_dataset.travel_sustainability.dataset_id
}

# Cloud Storage bucket for the dataset. The final bucket name is the
# caller-supplied prefix joined to "travel-sustainability" with a hyphen.
resource "google_storage_bucket" "travel-sustainability" {
  name                        = "${var.bucket_name_prefix}-travel-sustainability"
  location                    = "US"
  uniform_bucket_level_access = true
  force_destroy               = true

  # Changes to the bucket's logging configuration are not reconciled by
  # Terraform.
  lifecycle {
    ignore_changes = [
      logging,
    ]
  }
}

# Exposes the generated (prefixed) name of the dataset's storage bucket.
output "storage_bucket-travel-sustainability-name" {
  value = google_storage_bucket.travel-sustainability.name
}
23 changes: 23 additions & 0 deletions datasets/travel_sustainability/_terraform/variables.tf
@@ -0,0 +1,23 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


# Input variables for the travel_sustainability Terraform configuration.
# None has a default, so each must be supplied by the caller. All are typed
# as strings so that a wrongly shaped value is rejected when the
# configuration is evaluated instead of surfacing later inside a resource.

variable "project_id" {
  type        = string
  description = "Google Cloud project used by the provider and by every resource's project argument."
}

variable "bucket_name_prefix" {
  type        = string
  description = "Prefix joined with a hyphen to the bucket's base name to form the storage bucket name."
}

variable "impersonating_acct" {
  type        = string
  description = "Service account passed to the Google provider's impersonate_service_account setting."
}

variable "region" {
  type        = string
  description = "Region passed to the Google provider."
}

# NOTE(review): `env` is not referenced by the provider or resource
# definitions visible alongside this file — confirm what consumes it before
# documenting its meaning.
variable "env" {
  type = string
}

83 changes: 83 additions & 0 deletions datasets/travel_sustainability/dataset.yaml
@@ -0,0 +1,83 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

dataset:
  # Properties in this block describe the dataset to people who find it on
  # the Google Cloud website.

  # Has to match, character for character, the name of the folder that holds
  # this dataset.yaml.
  name: travel_sustainability

  # Human-readable display name for the dataset (not set).
  friendly_name: null

  # Short summary of what the dataset contains (not set).
  description: null

  # Sources the dataset is derived from, written as a YAML list (not set).
  dataset_sources: null

  # Terms and conditions that users of the dataset should agree to, written
  # as a YAML list (not set).
  terms_of_use: null
resources:
  # Google Cloud resources this dataset needs. In principle every pipeline
  # under the dataset can share them.
  #
  # Only the resource types shown here are currently supported; keep the ones
  # you need and delete the rest. More types will be added over time. If you
  # need one that is missing, please file an issue on the repository.

  - type: bigquery_dataset
    # BigQuery dataset that namespaces every table managed by this folder.
    #
    # Required properties:
    #   dataset_id
    #
    # Optional properties:
    #   friendly_name  (user-friendly name of the dataset)
    #   description    (user-friendly description of the dataset)
    #   location       (geographic location where the dataset should reside)
    dataset_id: travel_sustainability
    description: "Travel Sustainability"

  - type: storage_bucket
    # Cloud Storage bucket for the pipelines, for example to hold data
    # in-flight or to archive or back up data the pipelines generate.
    #
    # Cloud Storage bucket names must be globally unique, so the `name` given
    # below is mapped to the unique bucket name
    #
    #   `{bucket_name_prefix}-{name}`
    #
    # where the prefix comes from the `--bucket-name-prefix` parameter of
    # `scripts/generate_terraform.py`.
    #
    # Use hyphenated names such as `some-prefix-123`. Bucket names must not
    # contain "google" or close misspellings of it, such as "g00gle".
    #
    # Required properties:
    #   name
    #
    # Optional properties:
    #   location
    #   uniform_bucket_level_access  (we suggest False for fine-grained access)
    name: travel-sustainability
    location: US
    uniform_bucket_level_access: true
@@ -0,0 +1,78 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from airflow import DAG
from airflow.providers.google.cloud.transfers import gcs_to_bigquery

# Shared arguments handed to the DAG below through its `default_args`
# parameter.
default_args = dict(
    owner="Google",
    depends_on_past=False,
    start_date="2021-01-09",
)


with DAG(
    dag_id="travel_sustainability.flight_emissions",
    default_args=default_args,
    schedule_interval="0 15 * * *",  # cron: minute 0 of hour 15, every day
    max_active_runs=1,
    catchup=False,
    default_view="graph",
) as dag:

    # (name, type, description) for each column of the destination table,
    # in CSV column order. Every column is loaded with mode REQUIRED.
    flight_emissions_columns = (
        ("origin", "STRING", "IATA code for origin airport"),
        ("destination", "STRING", "IATA code for destination airport"),
        ("aircraft_model", "STRING", "IATA code for aircraft model"),
        (
            "co2e_total_grams",
            "INTEGER",
            "Total grams of CO2e estimated for the flight including non-CO2 effects",
        ),
        (
            "co2_total_grams",
            "INTEGER",
            "Total grams of CO2 estimated for the flight",
        ),
    )
    flight_emissions_schema = [
        {
            "name": column_name,
            "type": column_type,
            "mode": "REQUIRED",
            "description": column_description,
        }
        for column_name, column_type, column_description in flight_emissions_columns
    ]

    # Load flight_emissions.csv from the configured source bucket into
    # BigQuery: the header row is skipped and the table is overwritten
    # (WRITE_TRUNCATE) on every run.
    flight_emissions_gcs_to_bq = gcs_to_bigquery.GCSToBigQueryOperator(
        task_id="flight_emissions_gcs_to_bq",
        bucket="{{ var.json.travel_sustainability.source_bucket }}",
        source_objects=["flight_emissions.csv"],
        source_format="CSV",
        skip_leading_rows=1,
        destination_project_dataset_table="travel_sustainability.flight_emissions",
        write_disposition="WRITE_TRUNCATE",
        schema_fields=flight_emissions_schema,
    )

    # Task graph: a single task, so there is nothing to chain.
    flight_emissions_gcs_to_bq

0 comments on commit 8e9731a

Please sign in to comment.