diff --git a/datasets/gnomad/_terraform/provider.tf b/datasets/gnomad/_terraform/provider.tf new file mode 100644 index 000000000..82ef60b77 --- /dev/null +++ b/datasets/gnomad/_terraform/provider.tf @@ -0,0 +1,27 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +provider "google" { + project = var.project_id + region = var.region +} + +data "google_client_openid_userinfo" "me" {} + +output "impersonating-account" { + value = data.google_client_openid_userinfo.me.email +} diff --git a/datasets/gnomad/_terraform/variables.tf b/datasets/gnomad/_terraform/variables.tf new file mode 100644 index 000000000..c3ec7c506 --- /dev/null +++ b/datasets/gnomad/_terraform/variables.tf @@ -0,0 +1,23 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +variable "project_id" {} +variable "bucket_name_prefix" {} +variable "impersonating_acct" {} +variable "region" {} +variable "env" {} + diff --git a/datasets/gnomad/copy_gcs_bucket/copy_gcs_bucket_dag.py b/datasets/gnomad/copy_gcs_bucket/copy_gcs_bucket_dag.py new file mode 100644 index 000000000..f25d860f1 --- /dev/null +++ b/datasets/gnomad/copy_gcs_bucket/copy_gcs_bucket_dag.py @@ -0,0 +1,50 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.google.cloud.operators import cloud_storage_transfer_service + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="gnomad.copy_gcs_bucket", + default_args=default_args, + max_active_runs=1, + schedule_interval="@once", + catchup=False, + default_view="graph", +) as dag: + + # Task to run a GCS to GCS operation using Google resources: wait=True blocks until the transfer ends (timeout 43200 s = 12 h), and objects found only in the destination are not deleted + gnomad_gcs_bucket_transfer = ( + cloud_storage_transfer_service.CloudDataTransferServiceGCSToGCSOperator( + task_id="gnomad_gcs_bucket_transfer", + timeout=43200, + retries=0, + wait=True, + project_id="bigquery-public-data", + source_bucket="gnomad-public-requester-pays", + destination_bucket="gcp-public-data--gnomad", + google_impersonation_chain="{{ var.json.gnomad.service_account }}", + transfer_options={"deleteObjectsUniqueInSink": False}, + ) + ) + + gnomad_gcs_bucket_transfer diff --git 
a/datasets/gnomad/copy_gcs_bucket/pipeline.yaml b/datasets/gnomad/copy_gcs_bucket/pipeline.yaml new file mode 100644 index 000000000..b4dac3777 --- /dev/null +++ b/datasets/gnomad/copy_gcs_bucket/pipeline.yaml @@ -0,0 +1,62 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +--- +resources: ~ + +dag: + # [Required] Specify the Airflow version of the operators used by the DAG. + airflow_version: 2 + + # The DAG acronym stands for directed acyclic graph. This block represents + # your data pipeline along with every property and configuration it needs to + # onboard your data. 
+ initialize: + dag_id: copy_gcs_bucket + default_args: + owner: "Google" + + # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "@once" + catchup: False + default_view: graph + + + tasks: + - operator: "CloudDataTransferServiceGCSToGCSOperator" + description: "Task to run a GCS to GCS operation using Google resources" + args: + task_id: "gnomad_gcs_bucket_transfer" + timeout: 43200 # 12 hours + retries: 0 + wait: True + project_id: bigquery-public-data + source_bucket: "gnomad-public-requester-pays" + destination_bucket: "gcp-public-data--gnomad" + google_impersonation_chain: "{{ var.json.gnomad.service_account }}" + transfer_options: + deleteObjectsUniqueInSink: False + + graph_paths: + # This is where you specify the relationships (i.e. directed paths/edges) + # among the tasks specified above. Use the bitshift operator to define the + # relationships and the `task_id` value above to represent tasks. + # + # For more info, see + # https://airflow.apache.org/docs/apache-airflow/stable/tutorial.html#setting-up-dependencies + - "gnomad_gcs_bucket_transfer" diff --git a/datasets/gnomad/dataset.yaml b/datasets/gnomad/dataset.yaml new file mode 100644 index 000000000..2b3c21413 --- /dev/null +++ b/datasets/gnomad/dataset.yaml @@ -0,0 +1,36 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +dataset: + # The `dataset` block includes properties for your dataset that will be shown + # to users of your data on the Google Cloud website. + + # Must be exactly the same name as the folder name your dataset.yaml is in. + name: gnomad + + # A friendly, human-readable name of the dataset. + friendly_name: ~ + + # A short, descriptive summary of the dataset. + description: ~ + + # A list of sources the dataset is derived from, using the YAML list syntax. + dataset_sources: ~ + + # A list of terms and conditions that users of the dataset should agree to, + # using the YAML list syntax. + terms_of_use: ~ + + +resources: ~