diff --git a/tools/cloud_functions/bq_table_snapshots/README.md b/tools/cloud_functions/bq_table_snapshots/README.md index e4bd548bf..c44462499 100644 --- a/tools/cloud_functions/bq_table_snapshots/README.md +++ b/tools/cloud_functions/bq_table_snapshots/README.md @@ -45,4 +45,39 @@ The following environemnt variables must be set: If DATASET_1 has 500 tables, 500 Pub/Sub messages are sent, and 500 Cloud Function invocations are performed. If the Cloud Function used the current time when it creates the snapshots then these 500 snapshots will represent different points in time. To avoid this the Cloud Function will create the snapshots for the table as they were when the Cloud Scheduler job (bq-snap-start-process) was triggered. To achieve this the Cloud Function will calculate the previous interval based on **crontab_format**. +# Deployment +## Declare Variables + +``` +PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` +STORAGE_PROJECT_ID=`gcloud config list --format "value(core.project)" 2>/dev/null` +SOURCE_DATASET_NAME="DATASET_1" +TARGET_DATASET_NAME="SNAPSHOT_DATASET_1" +CRONTAB_FORMAT="10 * * * *" +SECONDS_BEFORE_EXPIRATION=604800 +``` + +* `PROJECT_ID` is the project where processing will happen, where the resources will be hosted (e.g. Pub/Sub topics, Cloud Functions). +* `STORAGE_PROJECT_ID` is the project where BigQuery tables are stored. + +**Note**: in this case `PROJECT_ID` and `STORAGE_PROJECT_ID` are the same but that is not necessarily the case. 
+
+
+## Terraform Provisioning
+```
+git clone https://github.com/GoogleCloudPlatform/bigquery-utils.git
+cd ./bigquery-utils/tools/cloud_functions/bq_table_snapshots/terraform
+terraform init
+```
+
+```
+terraform apply \
+    -var="project_id=${PROJECT_ID}" \
+    -var="storage_project_id=${STORAGE_PROJECT_ID}" \
+    -var="source_dataset_name=${SOURCE_DATASET_NAME}" \
+    -var="target_dataset_name=${TARGET_DATASET_NAME}" \
+    -var="crontab_format=${CRONTAB_FORMAT}" \
+    -var="seconds_before_expiration=${SECONDS_BEFORE_EXPIRATION}" \
+    --auto-approve
+```
diff --git a/tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots.py b/tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots/main.py
similarity index 94%
rename from tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots.py
rename to tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots/main.py
index d174fa08d..212b968ca 100644
--- a/tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots.py
+++ b/tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots/main.py
@@ -62,6 +62,7 @@ def create_snapshot(message):
     logging.info(f"Creating snapshot for table: {snapshot_name}")
     return job
 
+
 def get_bq_client():
     client_info = http_client_info.ClientInfo(user_agent=f"google-pso-tool/bq-snapshots/0.0.1")
     client = bigquery.Client(project=BQ_JOBS_PROJECT_ID, client_info=client_info)
@@ -77,7 +78,7 @@ def main(event, context):
     {
         "source_dataset_name":"DATASET_1",
         "target_dataset_name":"SNAPSHOT_DATASET_1",
-        "crontab_format":"0 * * * *",
+        "crontab_format":"10 * * * *",
         "seconds_before_expiration":604800,
         "table_name": "project.dataset.table"
     }
@@ -96,3 +97,14 @@ def main(event, context):
         else:
             return 'ok'
         time.sleep(2)
+
+
+if __name__ == "__main__":
+    # Manual smoke check only: guarded so that importing this module (as the
+    # Cloud Functions runtime does) never sleeps or prints.
+    # NOTE(review): assumes get_snapshot_timestamp takes the message dict - confirm.
+    message = {"crontab_format":"0 1 * * *"}
+    timestamps = []
+    for _ in range(3):
+        timestamps.append(get_snapshot_timestamp(message))
+        time.sleep(1)
+    print(timestamps)
\ No newline at end of file
diff --git
a/tools/cloud_functions/bq_table_snapshots/requirements.txt b/tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots/requirements.txt similarity index 100% rename from tools/cloud_functions/bq_table_snapshots/requirements.txt rename to tools/cloud_functions/bq_table_snapshots/bq_backup_create_snapshots/requirements.txt diff --git a/tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names.py b/tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names/main.py similarity index 100% rename from tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names.py rename to tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names/main.py diff --git a/tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names/requirements.txt b/tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names/requirements.txt new file mode 100644 index 000000000..1e885d1a4 --- /dev/null +++ b/tools/cloud_functions/bq_table_snapshots/bq_backup_fetch_tables_names/requirements.txt @@ -0,0 +1,4 @@ +google-cloud-pubsub==2.12.1 +google-cloud-bigquery==2.34.0 +google-api-core==2.10.1 +cronsim==2.1 \ No newline at end of file diff --git a/tools/cloud_functions/bq_table_snapshots/terraform/backend.tf b/tools/cloud_functions/bq_table_snapshots/terraform/backend.tf new file mode 100644 index 000000000..49d39d1ba --- /dev/null +++ b/tools/cloud_functions/bq_table_snapshots/terraform/backend.tf @@ -0,0 +1,17 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + backend "local" {} +} \ No newline at end of file diff --git a/tools/cloud_functions/bq_table_snapshots/terraform/function.tf b/tools/cloud_functions/bq_table_snapshots/terraform/function.tf new file mode 100644 index 000000000..18b6f9dee --- /dev/null +++ b/tools/cloud_functions/bq_table_snapshots/terraform/function.tf @@ -0,0 +1,141 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+terraform {
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 4.34.0"
+    }
+  }
+}
+
+resource "random_id" "bucket_prefix" {
+  byte_length = 8
+}
+
+##########################################
+#           BQ Target Dataset            #
+##########################################
+resource "google_bigquery_dataset" "dataset" {
+  project    = var.storage_project_id
+  dataset_id = var.target_dataset_name
+}
+
+##########################################
+#         GCS Bucket for CF code         #
+##########################################
+resource "google_storage_bucket" "bucket" {
+  name                        = "${random_id.bucket_prefix.hex}-gcf-source"
+  location                    = "US"
+  uniform_bucket_level_access = true
+}
+
+##########################################
+#             Pub/Sub Topics             #
+##########################################
+resource "google_pubsub_topic" "snapshot_dataset_topic" {
+  name = "snapshot_dataset_topic"
+}
+
+resource "google_pubsub_topic" "bq_snapshot_create_snapshot_topic" {
+  name = "bq_snapshot_create_snapshot_topic"
+}
+
+##########################################
+#            Cloud Scheduler             #
+##########################################
+resource "google_cloud_scheduler_job" "job" {
+  name     = "bq-snap-start-process"
+  schedule = var.crontab_format
+
+  pubsub_target {
+    # topic.id is the topic's full resource name. The payload is built with jsonencode so values are escaped; the list variables hold JSON text and are decoded first.
+    topic_name = google_pubsub_topic.snapshot_dataset_topic.id
+    data       = base64encode(jsonencode({ source_dataset_name = var.source_dataset_name, target_dataset_name = var.target_dataset_name, crontab_format = var.crontab_format, seconds_before_expiration = var.seconds_before_expiration, tables_to_include_list = jsondecode(var.tables_to_include_list), tables_to_exclude_list = jsondecode(var.tables_to_exclude_list) }))
+  }
+}
+
+##########################################
+#    bq_backup_fetch_tables_names CF     #
+##########################################
+data "archive_file" "bq_backup_fetch_tables_names" {
+  type        = "zip"
+  source_dir  = "../bq_backup_fetch_tables_names"
+  output_path = "/tmp/bq_backup_fetch_tables_names.zip"
+}
+
+resource "google_storage_bucket_object" "bq_backup_fetch_tables_names" {
+  name   = "bq_backup_fetch_tables_names.${data.archive_file.bq_backup_fetch_tables_names.output_md5}.zip" # content hash in the name so a source change redeploys the function
+  bucket = google_storage_bucket.bucket.name
+  source = data.archive_file.bq_backup_fetch_tables_names.output_path
+}
+
+resource "google_cloudfunctions_function" "bq_backup_fetch_tables_names" {
+  name = "bq_backup_fetch_tables_names"
+
+  runtime               = "python39"
+  available_memory_mb   = 128
+  entry_point           = "main"
+  source_archive_bucket = google_storage_bucket.bucket.name
+  source_archive_object = google_storage_bucket_object.bq_backup_fetch_tables_names.name
+
+  environment_variables = {
+    DATA_PROJECT_ID            = var.storage_project_id
+    PUBSUB_PROJECT_ID          = var.project_id
+    TABLE_NAME_PUBSUB_TOPIC_ID = google_pubsub_topic.bq_snapshot_create_snapshot_topic.name
+  }
+
+  event_trigger {
+    event_type = "providers/cloud.pubsub/eventTypes/topic.publish"
+    resource   = google_pubsub_topic.snapshot_dataset_topic.id
+  }
+}
+
+##########################################
+#     bq_backup_create_snapshots CF      #
+##########################################
+data "archive_file" "bq_backup_create_snapshots" {
+  type        = "zip"
+  source_dir  = "../bq_backup_create_snapshots"
+  output_path = "/tmp/bq_backup_create_snapshots.zip"
+}
+
+resource "google_storage_bucket_object" "bq_backup_create_snapshots" {
+  name   = "bq_backup_create_snapshots.${data.archive_file.bq_backup_create_snapshots.output_md5}.zip" # content hash in the name so a source change redeploys the function
+  bucket = google_storage_bucket.bucket.name
+  source = data.archive_file.bq_backup_create_snapshots.output_path
+}
+
+resource "google_cloudfunctions_function" "bq_backup_create_snapshots" {
+  name = "bq_backup_create_snapshots"
+
+  runtime               = "python39"
+  max_instances         = 100 # BQ allows a max of 100 concurrent snapshot jobs per project
+  available_memory_mb   = 128
+  entry_point           = "main"
+  source_archive_bucket = google_storage_bucket.bucket.name
+  source_archive_object = google_storage_bucket_object.bq_backup_create_snapshots.name
+
+  environment_variables = {
+    BQ_DATA_PROJECT_ID = var.storage_project_id
+    BQ_JOBS_PROJECT_ID = var.project_id
+  }
+
+  event_trigger {
+    event_type = "providers/cloud.pubsub/eventTypes/topic.publish"
+    resource   = google_pubsub_topic.bq_snapshot_create_snapshot_topic.id
+  }
+}
+
diff --git a/tools/cloud_functions/bq_table_snapshots/terraform/main.tf b/tools/cloud_functions/bq_table_snapshots/terraform/main.tf
new file mode 100644
index 000000000..2335fd300
--- /dev/null
+++ b/tools/cloud_functions/bq_table_snapshots/terraform/main.tf
@@ -0,0 +1,18 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
\ No newline at end of file
diff --git a/tools/cloud_functions/bq_table_snapshots/terraform/variables.tf b/tools/cloud_functions/bq_table_snapshots/terraform/variables.tf
new file mode 100644
index 000000000..39804a813
--- /dev/null
+++ b/tools/cloud_functions/bq_table_snapshots/terraform/variables.tf
@@ -0,0 +1,61 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+  description = "GCP Project ID containing Cloud Functions and Pub/Sub Topics"
+  type        = string
+}
+
+variable "storage_project_id" {
+  description = "GCP Project ID containing BigQuery tables"
+  type        = string
+}
+
+variable "region" {
+  description = "GCP region in which to deploy cloud function"
+  type        = string
+  default     = "us-central1"
+}
+
+variable "source_dataset_name" {
+  description = "Dataset for which snapshots will be created"
+  type        = string
+}
+
+variable "target_dataset_name" {
+  description = "Dataset where the snapshots will be written to"
+  type        = string
+}
+
+variable "crontab_format" {
+  description = "Crontab schedule under which the solution will be executed"
+  type        = string
+}
+
+variable "seconds_before_expiration" {
+  description = "Seconds before the snapshot will expire"
+  type        = number
+}
+
+variable "tables_to_include_list" {
+  description = "JSON array, passed as a string, of table names to include in the scheduler message (e.g. '[\"table1\"]')"
+  type        = string
+  default     = "[]"
+}
+
+variable "tables_to_exclude_list" {
+  description = "JSON array, passed as a string, of table names to exclude in the scheduler message (e.g. '[\"table1\"]')"
+  type        = string
+  default     = "[]"
+}
\ No newline at end of file
diff
--git a/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_create_snapshots/test_bq_backup_create_snapshots.py b/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_create_snapshots/test_bq_backup_create_snapshots.py new file mode 100644 index 000000000..50a5532b0 --- /dev/null +++ b/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_create_snapshots/test_bq_backup_create_snapshots.py @@ -0,0 +1,41 @@ +# dataset/table/_SUCCESS +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""unit tests for bq_backup_create_snapshots"""
+
+from requests import request
+from bq_backup_create_snapshots.main import get_snapshot_timestamp
+import pytest
+import time
+from google.cloud import bigquery
+
+
+@pytest.mark.parametrize(
+    "crontab_format",
+    [
+        ("10 * * * *"),
+        ("30 * * * *"),
+        ("0 1 * * *")
+    ])
+def test_filter_tables(crontab_format):
+    """ensures that snapshot messages received within seconds of
+    each other result in snapshots representing the same point in
+    time
+    """
+    message = {"crontab_format":crontab_format}
+    timestamps = []
+    for _ in range(3):
+        timestamps.append(get_snapshot_timestamp(message))  # call it: appending the function object made the assert vacuous
+        time.sleep(1)  # spread the calls over a few seconds, as separate invocations would be
+    assert len(set(timestamps)) == 1
\ No newline at end of file
diff --git a/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_fetch_tables_names/test_bq_backup_fetch_tables_names.py b/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_fetch_tables_names/test_bq_backup_fetch_tables_names.py
new file mode 100644
index 000000000..a011efeab
--- /dev/null
+++ b/tools/cloud_functions/bq_table_snapshots/tests/bq_backup_fetch_tables_names/test_bq_backup_fetch_tables_names.py
@@ -0,0 +1,48 @@
+# dataset/table/_SUCCESS
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""unit tests for bq_backup_fetch_tables_names"""
+
+from requests import request
+from bq_backup_fetch_tables_names.main import filter_tables, TABLE_TYPE_PHYSICAL_TABLE
+import pytest
+from google.cloud import bigquery
+
+
+@pytest.mark.parametrize(
+    "tables_to_include,tables_to_exclude,expected",
+    [
+        ([],[],['table1', 'table2', 'table3']),  # no filters: every table is expected back
+        (['table1'],[],['table1']),  # include list only
+        ([],['table1'],['table2', 'table3']),  # exclude list only
+        (['table1'],['table2'],['table1'])  # include and exclude together
+    ])
+def test_filter_tables(tables_to_include, tables_to_exclude, expected):
+    """ensure filter_tables applies the include / exclude lists from the request
+    """
+    tables = [build_bigquery_table_ref(x) for x in ['table1', 'table2', 'table3']]
+    request_json = {
+        "tables_to_include_list": tables_to_include,
+        "tables_to_exclude_list": tables_to_exclude
+    }
+    tables = filter_tables(tables, request_json)
+    expected = [f"project1.dataset1.{x}" for x in expected]  # result is compared as fully qualified name strings
+
+    assert tables == expected
+
+
+def build_bigquery_table_ref(table_name):  # helper: bigquery.Table in project1.dataset1 marked as a physical table
+    table_ref = bigquery.Table(f'project1.dataset1.{table_name}')
+    table_ref._properties['type'] = TABLE_TYPE_PHYSICAL_TABLE  # set directly on the private properties dict
+    return table_ref