From 214a56ff6a60c09eb2cdbd344acbfc06cc6db822 Mon Sep 17 00:00:00 2001 From: Nicholas Large <84149918+nlarge-google@users.noreply.github.com> Date: Fri, 8 Apr 2022 15:12:54 -0500 Subject: [PATCH] Feat: Onboard EPA Historical Air Quality dataset (#301) --- .../infra/annual_summaries_pipeline.tf | 39 - .../infra/co_daily_summary_pipeline.tf | 39 - .../infra/co_hourly_summary_pipeline.tf | 39 - .../epa_historical_air_quality_pipeline.tf | 574 ++++++ .../infra/hap_daily_summary_pipeline.tf | 39 - .../infra/hap_hourly_summary_pipeline.tf | 39 - .../infra/lead_daily_summary_pipeline.tf | 39 - .../infra/no2_daily_summary_pipeline.tf | 39 - .../infra/no2_hourly_summary_pipeline.tf | 39 - .../infra/nonoxnoy_daily_summary_pipeline.tf | 39 - .../infra/nonoxnoy_hourly_summary_pipeline.tf | 39 - .../infra/ozone_daily_summary_pipeline.tf | 39 - .../infra/ozone_hourly_summary_pipeline.tf | 39 - .../infra/pm10_daily_summary_pipeline.tf | 39 - .../infra/pm10_hourly_summary_pipeline.tf | 39 - .../infra/pm25_frm_hourly_summary_pipeline.tf | 39 - .../pm25_nonfrm_daily_summary_pipeline.tf | 39 - .../pm25_nonfrm_hourly_summary_pipeline.tf | 39 - .../pm25_speciation_daily_summary_pipeline.tf | 39 - ...pm25_speciation_hourly_summary_pipeline.tf | 39 - .../infra/pressure_daily_summary_pipeline.tf | 39 - .../infra/pressure_hourly_summary_pipeline.tf | 39 - .../infra/rh_and_dp_daily_summary_pipeline.tf | 39 - .../rh_and_dp_hourly_summary_pipeline.tf | 39 - .../infra/so2_daily_summary_pipeline.tf | 39 - .../infra/so2_hourly_summary_pipeline.tf | 39 - .../temperature_daily_summary_pipeline.tf | 39 - .../temperature_hourly_summary_pipeline.tf | 39 - .../infra/voc_daily_summary_pipeline.tf | 39 - .../infra/voc_hourly_summary_pipeline.tf | 39 - .../infra/wind_daily_summary_pipeline.tf | 39 - .../infra/wind_hourly_summary_pipeline.tf | 39 - .../run_csv_transform_kub/csv_transform.py | 586 ++++-- .../epa_annual_summaries_schema.json | 332 ++++ .../epa_co_daily_summary_schema.json | 176 ++ 
.../epa_co_hourly_summary_schema.json | 146 ++ .../epa_hap_daily_summary_schema.json | 176 ++ .../epa_hap_hourly_summary_schema.json | 146 ++ .../epa_lead_daily_summary_schema.json | 176 ++ .../epa_no2_daily_summary_schema.json | 176 ++ .../epa_no2_hourly_summary_schema.json | 146 ++ .../epa_nonoxnoy_daily_summary_schema.json | 176 ++ .../epa_nonoxnoy_hourly_summary_schema.json | 146 ++ .../epa_ozone_daily_summary_schema.json | 176 ++ .../epa_ozone_hourly_summary_schema.json | 146 ++ .../epa_pm10_daily_summary_schema.json | 176 ++ .../epa_pm10_hourly_summary_schema.json | 146 ++ .../epa_pm25_frm_hourly_summary_schema.json | 146 ++ .../epa_pm25_nonfrm_daily_summary_schema.json | 176 ++ ...epa_pm25_nonfrm_hourly_summary_schema.json | 146 ++ ..._pm25_speciation_daily_summary_schema.json | 176 ++ ...pm25_speciation_hourly_summary_schema.json | 146 ++ .../epa_pressure_daily_summary_schema.json | 176 ++ .../epa_pressure_hourly_summary_schema.json | 146 ++ .../epa_rh_and_dp_daily_summary_schema.json | 176 ++ .../epa_rh_and_dp_hourly_summary_schema.json | 146 ++ .../epa_so2_daily_summary_schema.json | 176 ++ .../epa_so2_hourly_summary_schema.json | 146 ++ .../epa_temperature_daily_summary_schema.json | 176 ++ ...epa_temperature_hourly_summary_schema.json | 146 ++ .../epa_voc_daily_summary_schema.json | 176 ++ .../epa_voc_hourly_summary_schema.json | 146 ++ .../epa_wind_daily_summary_schema.json | 176 ++ .../epa_wind_hourly_summary_schema.json | 146 ++ .../run_csv_transform_kub/requirements.txt | 5 +- .../annual_summaries/annual_summaries_dag.py | 408 ---- .../pipelines/annual_summaries/pipeline.yaml | 321 ---- .../co_daily_summary/co_daily_summary_dag.py | 252 --- .../pipelines/co_daily_summary/pipeline.yaml | 207 -- .../co_hourly_summary_dag.py | 222 --- .../pipelines/co_hourly_summary/pipeline.yaml | 185 -- .../epa_historical_air_quality_dag.py | 1086 +++++++++++ .../epa_historical_air_quality/pipeline.yaml | 1699 +++++++++++++++++ .../hap_daily_summary_dag.py | 252 --- 
.../pipelines/hap_daily_summary/pipeline.yaml | 207 -- .../hap_hourly_summary_dag.py | 222 --- .../hap_hourly_summary/pipeline.yaml | 185 -- .../lead_daily_summary_dag.py | 248 --- .../lead_daily_summary/pipeline.yaml | 206 -- .../no2_daily_summary_dag.py | 252 --- .../pipelines/no2_daily_summary/pipeline.yaml | 207 -- .../no2_hourly_summary_dag.py | 222 --- .../no2_hourly_summary/pipeline.yaml | 185 -- .../nonoxnoy_daily_summary_dag.py | 252 --- .../nonoxnoy_daily_summary/pipeline.yaml | 207 -- .../nonoxnoy_hourly_summary_dag.py | 222 --- .../nonoxnoy_hourly_summary/pipeline.yaml | 185 -- .../ozone_daily_summary_dag.py | 252 --- .../ozone_daily_summary/pipeline.yaml | 207 -- .../ozone_hourly_summary_dag.py | 222 --- .../ozone_hourly_summary/pipeline.yaml | 185 -- .../pm10_daily_summary/pipeline.yaml | 207 -- .../pm10_daily_summary_dag.py | 252 --- .../pm10_hourly_summary/pipeline.yaml | 185 -- .../pm10_hourly_summary_dag.py | 222 --- .../pm25_frm_hourly_summary/pipeline.yaml | 185 -- .../pm25_frm_hourly_summary_dag.py | 222 --- .../pm25_nonfrm_daily_summary/pipeline.yaml | 207 -- .../pm25_nonfrm_daily_summary_dag.py | 252 --- .../pm25_nonfrm_hourly_summary/pipeline.yaml | 185 -- .../pm25_nonfrm_hourly_summary_dag.py | 222 --- .../pipeline.yaml | 207 -- .../pm25_speciation_daily_summary_dag.py | 252 --- .../pipeline.yaml | 185 -- .../pm25_speciation_hourly_summary_dag.py | 222 --- .../pressure_daily_summary/pipeline.yaml | 207 -- .../pressure_daily_summary_dag.py | 252 --- .../pressure_hourly_summary/pipeline.yaml | 185 -- .../pressure_hourly_summary_dag.py | 222 --- .../rh_and_dp_daily_summary/pipeline.yaml | 207 -- .../rh_and_dp_daily_summary_dag.py | 252 --- .../rh_and_dp_hourly_summary/pipeline.yaml | 185 -- .../rh_and_dp_hourly_summary_dag.py | 222 --- .../pipelines/so2_daily_summary/pipeline.yaml | 207 -- .../so2_daily_summary_dag.py | 252 --- .../so2_hourly_summary/pipeline.yaml | 185 -- .../so2_hourly_summary_dag.py | 222 --- 
.../temperature_daily_summary/pipeline.yaml | 207 -- .../temperature_daily_summary_dag.py | 252 --- .../temperature_hourly_summary/pipeline.yaml | 185 -- .../temperature_hourly_summary_dag.py | 222 --- .../pipelines/voc_daily_summary/pipeline.yaml | 207 -- .../voc_daily_summary_dag.py | 252 --- .../voc_hourly_summary/pipeline.yaml | 185 -- .../voc_hourly_summary_dag.py | 222 --- .../wind_daily_summary/pipeline.yaml | 207 -- .../wind_daily_summary_dag.py | 252 --- .../wind_hourly_summary/pipeline.yaml | 185 -- .../wind_hourly_summary_dag.py | 222 --- 129 files changed, 8979 insertions(+), 15056 deletions(-) delete mode 100644 datasets/epa_historical_air_quality/infra/annual_summaries_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/co_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/co_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/infra/epa_historical_air_quality_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/hap_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/hap_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/lead_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/no2_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/no2_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/nonoxnoy_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/nonoxnoy_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/ozone_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/ozone_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm10_daily_summary_pipeline.tf delete mode 100644 
datasets/epa_historical_air_quality/infra/pm10_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm25_frm_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm25_nonfrm_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm25_nonfrm_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm25_speciation_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pm25_speciation_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pressure_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/pressure_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/rh_and_dp_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/rh_and_dp_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/so2_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/so2_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/temperature_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/temperature_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/voc_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/voc_hourly_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/wind_daily_summary_pipeline.tf delete mode 100644 datasets/epa_historical_air_quality/infra/wind_hourly_summary_pipeline.tf create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_annual_summaries_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_daily_summary_schema.json create mode 100644 
datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_lead_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_frm_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_hourly_summary_schema.json create mode 100644 
datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_hourly_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_daily_summary_schema.json create mode 100644 datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_hourly_summary_schema.json delete mode 100644 datasets/epa_historical_air_quality/pipelines/annual_summaries/annual_summaries_dag.py delete mode 100644 
datasets/epa_historical_air_quality/pipelines/annual_summaries/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/co_daily_summary/co_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/co_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/co_hourly_summary/co_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/co_hourly_summary/pipeline.yaml create mode 100644 datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/epa_historical_air_quality_dag.py create mode 100644 datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/hap_daily_summary/hap_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/hap_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/hap_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/lead_daily_summary/lead_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/lead_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/no2_daily_summary/no2_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/no2_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/no2_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/pipeline.yaml delete mode 100644 
datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/ozone_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/ozone_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pm10_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pm10_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py delete mode 100644 
datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pressure_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pressure_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/so2_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/so2_daily_summary/so2_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/so2_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/temperature_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/temperature_hourly_summary_dag.py delete 
mode 100644 datasets/epa_historical_air_quality/pipelines/voc_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/voc_daily_summary/voc_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/voc_hourly_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/wind_daily_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/wind_daily_summary/wind_daily_summary_dag.py delete mode 100644 datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/pipeline.yaml delete mode 100644 datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/wind_hourly_summary_dag.py diff --git a/datasets/epa_historical_air_quality/infra/annual_summaries_pipeline.tf b/datasets/epa_historical_air_quality/infra/annual_summaries_pipeline.tf deleted file mode 100644 index 7084028c2..000000000 --- a/datasets/epa_historical_air_quality/infra/annual_summaries_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_annual_summaries" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "annual_summaries" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_annual_summaries-table_id" { - value = google_bigquery_table.epa_historical_air_quality_annual_summaries.table_id -} - -output "bigquery_table-epa_historical_air_quality_annual_summaries-id" { - value = google_bigquery_table.epa_historical_air_quality_annual_summaries.id -} diff --git a/datasets/epa_historical_air_quality/infra/co_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/co_daily_summary_pipeline.tf deleted file mode 100644 index 4b475afed..000000000 --- a/datasets/epa_historical_air_quality/infra/co_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_co_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "co_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_co_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_co_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/co_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/co_hourly_summary_pipeline.tf deleted file mode 100644 index 96131d79d..000000000 --- a/datasets/epa_historical_air_quality/infra/co_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_co_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "co_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_co_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_co_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/epa_historical_air_quality_pipeline.tf b/datasets/epa_historical_air_quality/infra/epa_historical_air_quality_pipeline.tf new file mode 100644 index 000000000..0bf83c0ee --- /dev/null +++ b/datasets/epa_historical_air_quality/infra/epa_historical_air_quality_pipeline.tf @@ -0,0 +1,574 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +resource "google_bigquery_table" "epa_historical_air_quality_annual_summaries" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "annual_summaries" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_annual_summaries-table_id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.table_id +} + +output "bigquery_table-epa_historical_air_quality_annual_summaries-id" { + value = google_bigquery_table.epa_historical_air_quality_annual_summaries.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_co_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_co_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "co_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_co_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_co_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_hap_daily_summary" { + project = var.project_id + dataset_id = 
"epa_historical_air_quality" + table_id = "hap_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_hap_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "hap_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_lead_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "lead_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_lead_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_no2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_daily_summary" + description = "epaspc" + depends_on = [ + 
google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_no2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "no2_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "nonoxnoy_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output 
"bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_ozone_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "ozone_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm10_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-table_id" { 
+ value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm10_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm10_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_frm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_frm_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-table_id" { + value = 
google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_nonfrm_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pm25_speciation_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output 
"bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pressure_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_pressure_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "pressure_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output 
"bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "rh_and_dp_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_so2_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_so2_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_so2_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "so2_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output 
"bigquery_table-epa_historical_air_quality_so2_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_temperature_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_temperature_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "temperature_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_voc_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output 
"bigquery_table-epa_historical_air_quality_voc_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_voc_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_voc_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "voc_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_wind_daily_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_daily_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-table_id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_wind_daily_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.id +} + +resource "google_bigquery_table" "epa_historical_air_quality_wind_hourly_summary" { + project = var.project_id + dataset_id = "epa_historical_air_quality" + table_id = "wind_hourly_summary" + description = "epaspc" + depends_on = [ + google_bigquery_dataset.epa_historical_air_quality + ] +} + +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-table_id" { + value = 
google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.table_id +} + +output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-id" { + value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.id +} diff --git a/datasets/epa_historical_air_quality/infra/hap_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/hap_daily_summary_pipeline.tf deleted file mode 100644 index b8aac1e45..000000000 --- a/datasets/epa_historical_air_quality/infra/hap_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_hap_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "hap_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_hap_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_hap_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_hap_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/hap_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/hap_hourly_summary_pipeline.tf deleted file mode 100644 index dd7896ae1..000000000 --- a/datasets/epa_historical_air_quality/infra/hap_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_hap_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "hap_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_hap_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_hap_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/lead_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/lead_daily_summary_pipeline.tf deleted file mode 100644 index d7fd58f20..000000000 --- a/datasets/epa_historical_air_quality/infra/lead_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_lead_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "lead_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_lead_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_lead_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_lead_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/no2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/no2_daily_summary_pipeline.tf deleted file mode 100644 index 448a029f8..000000000 --- a/datasets/epa_historical_air_quality/infra/no2_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_no2_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "no2_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_no2_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_no2_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_no2_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/no2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/no2_hourly_summary_pipeline.tf deleted file mode 100644 index 2d057b403..000000000 --- a/datasets/epa_historical_air_quality/infra/no2_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_no2_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "no2_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_no2_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_no2_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/nonoxnoy_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/nonoxnoy_daily_summary_pipeline.tf deleted file mode 100644 index 92f5294c7..000000000 --- a/datasets/epa_historical_air_quality/infra/nonoxnoy_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "nonoxnoy_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_nonoxnoy_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/nonoxnoy_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/nonoxnoy_hourly_summary_pipeline.tf deleted file mode 100644 index 4b57e8fba..000000000 --- a/datasets/epa_historical_air_quality/infra/nonoxnoy_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_nonoxnoy_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "nonoxnoy_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_nonoxnoy_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_nonoxnoy_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/ozone_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/ozone_daily_summary_pipeline.tf deleted file mode 100644 index 19cff7cc2..000000000 --- a/datasets/epa_historical_air_quality/infra/ozone_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_ozone_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "ozone_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_ozone_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_ozone_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/ozone_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/ozone_hourly_summary_pipeline.tf deleted file mode 100644 index 517e8127c..000000000 --- a/datasets/epa_historical_air_quality/infra/ozone_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_ozone_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "ozone_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_ozone_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_ozone_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm10_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm10_daily_summary_pipeline.tf deleted file mode 100644 index af38e7681..000000000 --- a/datasets/epa_historical_air_quality/infra/pm10_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm10_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm10_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm10_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm10_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm10_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm10_hourly_summary_pipeline.tf deleted file mode 100644 index d83d38c9e..000000000 --- a/datasets/epa_historical_air_quality/infra/pm10_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm10_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm10_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm10_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm10_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm25_frm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm25_frm_hourly_summary_pipeline.tf deleted file mode 100644 index 3d64246b4..000000000 --- a/datasets/epa_historical_air_quality/infra/pm25_frm_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm25_frm_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm25_frm_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm25_frm_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_frm_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm25_nonfrm_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm25_nonfrm_daily_summary_pipeline.tf deleted file mode 100644 index 5faf05f88..000000000 --- a/datasets/epa_historical_air_quality/infra/pm25_nonfrm_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm25_nonfrm_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm25_nonfrm_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm25_nonfrm_hourly_summary_pipeline.tf deleted file mode 100644 index 8cb22a6ac..000000000 --- a/datasets/epa_historical_air_quality/infra/pm25_nonfrm_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm25_nonfrm_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm25_nonfrm_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm25_nonfrm_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_nonfrm_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm25_speciation_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm25_speciation_daily_summary_pipeline.tf deleted file mode 100644 index c4ce35a13..000000000 --- a/datasets/epa_historical_air_quality/infra/pm25_speciation_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm25_speciation_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm25_speciation_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pm25_speciation_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pm25_speciation_hourly_summary_pipeline.tf deleted file mode 100644 index aa0da3bf7..000000000 --- a/datasets/epa_historical_air_quality/infra/pm25_speciation_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pm25_speciation_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pm25_speciation_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pm25_speciation_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pm25_speciation_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pressure_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pressure_daily_summary_pipeline.tf deleted file mode 100644 index f67bfa0eb..000000000 --- a/datasets/epa_historical_air_quality/infra/pressure_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pressure_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pressure_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pressure_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pressure_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/pressure_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/pressure_hourly_summary_pipeline.tf deleted file mode 100644 index 23fa46310..000000000 --- a/datasets/epa_historical_air_quality/infra/pressure_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_pressure_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "pressure_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_pressure_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_pressure_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/rh_and_dp_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/rh_and_dp_daily_summary_pipeline.tf deleted file mode 100644 index 7bd465c09..000000000 --- a/datasets/epa_historical_air_quality/infra/rh_and_dp_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "rh_and_dp_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_rh_and_dp_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/rh_and_dp_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/rh_and_dp_hourly_summary_pipeline.tf deleted file mode 100644 index f259b3cba..000000000 --- a/datasets/epa_historical_air_quality/infra/rh_and_dp_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_rh_and_dp_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "rh_and_dp_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_rh_and_dp_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_rh_and_dp_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/so2_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/so2_daily_summary_pipeline.tf deleted file mode 100644 index c2e5bfa02..000000000 --- a/datasets/epa_historical_air_quality/infra/so2_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_so2_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "so2_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_so2_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_so2_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_so2_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/so2_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/so2_hourly_summary_pipeline.tf deleted file mode 100644 index 5a74e4d45..000000000 --- a/datasets/epa_historical_air_quality/infra/so2_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_so2_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "so2_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_so2_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_so2_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/temperature_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/temperature_daily_summary_pipeline.tf deleted file mode 100644 index 98865c34e..000000000 --- a/datasets/epa_historical_air_quality/infra/temperature_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_temperature_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "temperature_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_temperature_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_temperature_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/temperature_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/temperature_hourly_summary_pipeline.tf deleted file mode 100644 index 09bd21923..000000000 --- a/datasets/epa_historical_air_quality/infra/temperature_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_temperature_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "temperature_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_temperature_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_temperature_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/voc_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/voc_daily_summary_pipeline.tf deleted file mode 100644 index 7348fa307..000000000 --- a/datasets/epa_historical_air_quality/infra/voc_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_voc_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "voc_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_voc_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_voc_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_voc_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/voc_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/voc_hourly_summary_pipeline.tf deleted file mode 100644 index 7a337682d..000000000 --- a/datasets/epa_historical_air_quality/infra/voc_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_voc_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "voc_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_voc_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_voc_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/wind_daily_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/wind_daily_summary_pipeline.tf deleted file mode 100644 index 90d444049..000000000 --- a/datasets/epa_historical_air_quality/infra/wind_daily_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_wind_daily_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "wind_daily_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_wind_daily_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_wind_daily_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_wind_daily_summary.id -} diff --git a/datasets/epa_historical_air_quality/infra/wind_hourly_summary_pipeline.tf b/datasets/epa_historical_air_quality/infra/wind_hourly_summary_pipeline.tf deleted file mode 100644 index 257bce937..000000000 --- a/datasets/epa_historical_air_quality/infra/wind_hourly_summary_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "epa_historical_air_quality_wind_hourly_summary" { - project = var.project_id - dataset_id = "epa_historical_air_quality" - table_id = "wind_hourly_summary" - - description = "epaspc" - - - - - depends_on = [ - google_bigquery_dataset.epa_historical_air_quality - ] -} - -output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-table_id" { - value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.table_id -} - -output "bigquery_table-epa_historical_air_quality_wind_hourly_summary-id" { - value = google_bigquery_table.epa_historical_air_quality_wind_hourly_summary.id -} diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/csv_transform.py b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/csv_transform.py index bd56ca0b6..a853b81b1 100644 --- a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/csv_transform.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ # limitations under the License. 
import datetime -import fnmatch import json import logging import os @@ -23,70 +22,359 @@ import pandas as pd import requests -from google.cloud import storage +from google.cloud import bigquery, storage +from google.cloud.exceptions import NotFound def main( source_url: str, start_year: int, source_file: pathlib.Path, - target_file: pathlib.Path, + # target_file: pathlib.Path, + project_id: str, + dataset_id: str, + table_id: str, + year_field_name: str, + year_field_type: str, + schema_path: str, chunksize: str, target_gcs_bucket: str, target_gcs_path: str, - data_names: typing.List[str], + pipeline_name: str, + input_headers: typing.List[str], data_dtypes: dict, + output_headers: typing.List[str], ) -> None: - - logging.info("Pipeline process started") - + logging.info(f"{pipeline_name} process started") pathlib.Path("./files").mkdir(parents=True, exist_ok=True) dest_path = os.path.split(source_file)[0] - end_year = datetime.datetime.today().year - 2 - download_url_files_from_year_range( - source_url, start_year, end_year, dest_path, True, False - ) - st_year = datetime.datetime.today().year - 1 - end_year = datetime.datetime.today().year - download_url_files_from_year_range( - source_url, st_year, end_year, dest_path, True, True + execute_pipeline( + project_id=project_id, + dataset_id=dataset_id, + table_name=table_id, + year_field_name=year_field_name, + year_field_type=year_field_type, + start_year=start_year, + source_url=source_url, + dest_path=dest_path, + schema_path=schema_path, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + input_headers=input_headers, + output_headers=output_headers, + data_dtypes=data_dtypes, + chunksize=chunksize, + field_delimiter="|", ) - file_group_wildcard = os.path.split(source_url)[1].replace("_YEAR_ITERATOR.zip", "") - source = concatenate_files(source_file, dest_path, file_group_wildcard, False, ",") + logging.info(f"{pipeline_name} process completed") - process_source_file(source, target_file, 
data_names, data_dtypes, int(chunksize)) - upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) - - logging.info("Pipeline process completed") +def execute_pipeline( + project_id: str, + dataset_id: str, + table_name: str, + year_field_name: str, + year_field_type: str, + start_year: str, + source_url: str, + dest_path: str, + schema_path: str, + target_gcs_bucket: str, + target_gcs_path: str, + input_headers: typing.List[str], + output_headers: typing.List[str], + data_dtypes: dict, + chunksize: str, + field_delimiter: str, +) -> None: + create_dest_table( + project_id=project_id, + dataset_id=dataset_id, + table_id=table_name, + schema_filepath=schema_path, + bucket_name=target_gcs_bucket, + ) + end_year = datetime.datetime.today().year - 2 + for yr in range(start_year, end_year + 1, 1): + process_year_data( + project_id=project_id, + dataset_id=dataset_id, + table_name=table_name, + year_field_name=year_field_name, + year_field_type=year_field_type, + year=yr, + continue_on_error=False, + source_url=source_url, + dest_path=dest_path, + input_headers=input_headers, + output_headers=output_headers, + data_dtypes=data_dtypes, + chunksize=chunksize, + field_delimiter=field_delimiter, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + ) + st_year = datetime.datetime.today().year - 1 + end_year = datetime.datetime.today().year + for yr in range(st_year, end_year + 1, 1): + process_year_data( + project_id=project_id, + dataset_id=dataset_id, + table_name=table_name, + year_field_name=year_field_name, + year_field_type=year_field_type, + year=yr, + continue_on_error=True, + source_url=source_url, + dest_path=dest_path, + input_headers=input_headers, + output_headers=output_headers, + data_dtypes=data_dtypes, + chunksize=chunksize, + field_delimiter=field_delimiter, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + ) -def download_url_files_from_year_range( +def process_year_data( + project_id: str, + 
dataset_id: str, + table_name: str, + year_field_name: str, + year_field_type: str, + year: str, + continue_on_error: bool, source_url: str, - start_year: int, - end_year: int, dest_path: str, - remove_file: bool = False, - continue_on_error: bool = False, + input_headers: typing.List[str], + output_headers: typing.List[str], + data_dtypes: dict, + chunksize: str, + field_delimiter: str, + target_gcs_bucket: str, + target_gcs_path: str, + remove_file: bool = True, ): - for yr in range(start_year, end_year + 1, 1): - src_url = source_url.replace("YEAR_ITERATOR", str(yr)) - dest_file = dest_path + "/source_" + os.path.split(src_url)[1] - download_file_http(src_url, dest_file, continue_on_error) - unpack_file(dest_file, dest_path, "zip") - if remove_file: - os.remove(dest_file) + logging.info(f"Processing year {year} data.") + table_has_data = table_has_year_data( + project_id, dataset_id, table_name, year_field_name, year_field_type, year + ) + if table_has_data or table_has_data is None: + pass + else: + src_url = source_url.replace("YEAR_ITERATOR", str(year)) + url_file = os.path.split(src_url)[1] + url_file_csv = url_file.replace(".zip", ".csv") + source_file = f"{dest_path}/source_{url_file}" + source_csv_file = f"{dest_path}/{url_file_csv}" + target_file = f"{dest_path}/target_{url_file_csv}" + file_exists = download_file_http( + source_url=src_url, + source_file=source_file, + continue_on_error=continue_on_error, + ) + if file_exists: + unpack_file(infile=source_file, dest_path=dest_path, compression_type="zip") + process_source_file( + source_file=source_file, + target_file=target_file, + input_headers=input_headers, + output_headers=output_headers, + dtypes=data_dtypes, + chunksize=chunksize, + field_delimiter=field_delimiter, + ) + load_data_to_bq( + project_id=project_id, + dataset_id=dataset_id, + table_id=table_name, + file_path=target_file, + field_delimiter=field_delimiter, + truncate_table=False, + ) + if os.path.exists(target_file): + 
upload_file_to_gcs( + file_path=target_file, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + ) + if remove_file: + os.remove(source_file) + os.remove(source_csv_file) + os.remove(target_file) + else: + pass + else: + pass + logging.info(f"Processing year {year} data completed.") + + +def table_has_year_data( + project_id: str, + dataset_id: str, + table_name: str, + year_field_name: str, + year_field_type: str, + year: str, +) -> bool: + number_rows = number_rows_in_table( + project_id, dataset_id, table_name, year_field_name, year_field_type, year + ) + if number_rows > 0: + return True + elif number_rows == -1: + return None + else: + return False + + +def table_exists(project_id: str, dataset_id: str, table_name: str) -> bool: + client = bigquery.Client(project=project_id) + tables = client.list_tables(dataset_id) + found_table = False + for tbl in tables: + if tbl.table_id == table_name: + found_table = True + return found_table + + +def field_exists( + project_id: str, dataset_id: str, table_name: str, field_name: str +) -> bool: + if table_exists(project_id, dataset_id, table_name): + client = bigquery.Client(project=project_id) + table_ref = f"{dataset_id}.{table_name}" + tbl_schema = client.get_table(table_ref).schema + found_field = False + for field in tbl_schema: + if field.name == field_name: + found_field = True + return found_field + else: + return False + + +def number_rows_in_table( + project_id: str, + dataset_id: str, + table_name: str, + year_field_name: str, + year_field_type: str, + year: str, +) -> int: + check_field_exists = field_exists( + project_id, dataset_id, table_name, year_field_name + ) + if check_field_exists: + client = bigquery.Client(project=project_id) + query = f""" + SELECT count(1) AS number_of_rows + FROM {dataset_id}.{table_name} + WHERE + """ + if year_field_type == "DATETIME": + query = query + f" FORMAT_DATE('%Y', {year_field_name}) = '{year}'" + else: + query = query + f" {year_field_name} = 
{year}" + job_config = bigquery.QueryJobConfig() + query_job = client.query(query, job_config=job_config) + for row in query_job.result(): + count_rows = row.number_of_rows + return int(count_rows) + else: + return -1 + + +def process_source_file( + source_file: str, + target_file: str, + input_headers: typing.List[str], + output_headers: typing.List[str], + dtypes: dict, + chunksize: str, + field_delimiter: str, +) -> None: + logging.info(f"Opening batch file {source_file}") + with pd.read_csv( + source_file, # path to main source file to load in batches + engine="python", + encoding="utf-8", + quotechar='"', # string separator, typically double-quotes + chunksize=int(chunksize), # size of batch data, in no. of records + sep=",", # data column separator, typically "," + header=0, # use when the data file does not contain a header + names=input_headers, + dtype=dtypes, + keep_default_na=True, + na_values=[" "], + ) as reader: + for chunk_number, chunk in enumerate(reader): + target_file_batch = str(target_file).replace( + ".csv", "-" + str(chunk_number) + ".csv" + ) + df = pd.DataFrame() + df = pd.concat([df, chunk]) + process_chunk( + df=df, + target_file_batch=target_file_batch, + target_file=target_file, + include_header=(chunk_number == 0), + truncate_file=(chunk_number == 0), + field_delimiter=field_delimiter, + output_headers=output_headers, + ) + + +def load_data_to_bq( + project_id: str, + dataset_id: str, + table_id: str, + file_path: str, + field_delimiter: str, + truncate_table: bool, +) -> None: + logging.info( + f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} delim={field_delimiter} started" + ) + client = bigquery.Client(project=project_id) + table_ref = client.dataset(dataset_id).table(table_id) + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.CSV + job_config.field_delimiter = field_delimiter + if truncate_table: + job_config.write_disposition = "WRITE_TRUNCATE" + else: + 
job_config.write_disposition = "WRITE_APPEND" + job_config.skip_leading_rows = 1 + job_config.autodetect = False + with open(file_path, "rb") as source_file: + job = client.load_table_from_file( + file_obj=source_file, destination=table_ref, job_config=job_config + ) + job.result() + logging.info( + f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} completed" + ) def download_file_http( source_url: str, source_file: pathlib.Path, continue_on_error: bool = False -) -> None: +) -> bool: logging.info(f"Downloading {source_url} to {source_file}") try: src_file = requests.get(source_url, stream=True) - with open(source_file, "wb") as f: - for chunk in src_file: - f.write(chunk) + rtn_status_code = src_file.status_code + if 400 <= rtn_status_code <= 499: + logging.info( + f"Unable to download file {source_url} (error code was {rtn_status_code})" + ) + return False + else: + with open(source_file, "wb") as f: + for chunk in src_file: + f.write(chunk) + return True except requests.exceptions.RequestException as e: if e == requests.exceptions.HTTPError: err_msg = "A HTTP error occurred." @@ -98,9 +386,8 @@ def download_file_http( logging.info(f"{err_msg} Unable to obtain {source_url}") raise SystemExit(e) else: - logging.info( - f"{err_msg} Unable to obtain {source_url}. Continuing execution." 
- ) + logging.info(f"{err_msg} Unable to obtain {source_url}.") + return False def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None: @@ -118,100 +405,114 @@ def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> N logging.info(f"{infile} not unpacked because it does not exist.") -def zip_decompress(infile: str, dest_path: str) -> None: - logging.info(f"Unpacking {infile} to {dest_path}") - with zip.ZipFile(infile, mode="r") as zipf: - zipf.extractall(dest_path) - zipf.close() - - -def concatenate_files( - target_file_path: str, - dest_path: str, - file_group_wildcard: str, - incl_file_source_path: bool = False, - separator: str = ",", - delete_src_file: bool = True, -) -> str: - target_file_dir = os.path.split(str(target_file_path))[0] - target_file_path = str(target_file_path).replace( - ".csv", "_" + file_group_wildcard + ".csv" - ) - logging.info(f"Concatenating files {target_file_dir}/*{file_group_wildcard}") - if os.path.isfile(target_file_path): - os.unlink(target_file_path) - for src_file_path in sorted( - fnmatch.filter(os.listdir(dest_path), "*" + file_group_wildcard + "*") - ): - src_file_path = dest_path + "/" + src_file_path - with open(src_file_path, "r") as src_file: - with open(target_file_path, "a+") as target_file: - next(src_file) +def create_dest_table( + project_id: str, + dataset_id: str, + table_id: str, + schema_filepath: list, + bucket_name: str, +) -> bool: + table_ref = f"{project_id}.{dataset_id}.{table_id}" + logging.info(f"Attempting to create table {table_ref} if it doesn't already exist") + client = bigquery.Client() + table_exists = False + try: + table_exists_id = client.get_table(table_ref).table_id + logging.info(f"Table {table_exists_id} currently exists.") + table_exists = True + except NotFound: + logging.info( + ( + f"Table {table_ref} currently does not exist. Attempting to create table." 
+ ) + ) + try: + if check_gcs_file_exists(schema_filepath, bucket_name): + schema = create_table_schema([], bucket_name, schema_filepath) + table = bigquery.Table(table_ref, schema=schema) + client.create_table(table) + print(f"Table {table_ref} was created".format(table_id)) + table_exists = True + else: + file_name = os.path.split(schema_filepath)[1] + file_path = os.path.split(schema_filepath)[0] logging.info( - f"Reading from file {src_file_path}, writing to file {target_file_path}" + f"Error: Unable to create table {table_ref} because schema file {file_name} does not exist in location {file_path} in bucket {bucket_name}" ) - for line in src_file: - if incl_file_source_path: - line = ( - '"' - + os.path.split(src_file_path)[1].strip() - + '"' - + separator - + line - ) # include the file source - else: - line = line - target_file.write(line) - if os.path.isfile(src_file_path) and delete_src_file: - os.unlink(src_file_path) - - return target_file_path + table_exists = False + except Exception as e: + logging.info(f"Unable to create table. {e}") + table_exists = False + return table_exists -def process_source_file( - source_file: str, target_file: str, names: list, dtypes: dict, chunksize: int -) -> None: - logging.info(f"Opening batch file {source_file}") - with pd.read_csv( - source_file, # path to main source file to load in batches - engine="python", - encoding="utf-8", - quotechar='"', # string separator, typically double-quotes - chunksize=chunksize, # size of batch data, in no. 
of records - sep=",", # data column separator, typically "," - header=None, # use when the data file does not contain a header - names=names, - dtype=dtypes, - keep_default_na=True, - na_values=[" "], - ) as reader: - for chunk_number, chunk in enumerate(reader): - target_file_batch = str(target_file).replace( - ".csv", "-" + str(chunk_number) + ".csv" +def check_gcs_file_exists(file_path: str, bucket_name: str) -> bool: + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + exists = storage.Blob(bucket=bucket, name=file_path).exists(storage_client) + return exists + + +def create_table_schema( + schema_structure: list, bucket_name: str = "", schema_filepath: str = "" +) -> list: + logging.info(f"Defining table schema... {bucket_name} ... {schema_filepath}") + schema = [] + if not (schema_filepath): + schema_struct = schema_structure + else: + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + blob = bucket.blob(schema_filepath) + schema_struct = json.loads(blob.download_as_string(client=None)) + for schema_field in schema_struct: + fld_name = schema_field["name"] + fld_type = schema_field["type"] + try: + fld_descr = schema_field["description"] + except KeyError: + fld_descr = "" + fld_mode = schema_field["mode"] + schema.append( + bigquery.SchemaField( + name=fld_name, field_type=fld_type, mode=fld_mode, description=fld_descr ) - df = pd.DataFrame() - df = pd.concat([df, chunk]) - process_chunk(df, target_file_batch, target_file, (not chunk_number == 0)) + ) + return schema def process_chunk( df: pd.DataFrame, target_file_batch: str, target_file: str, - skip_header: bool, + include_header: bool, + truncate_file: bool, + field_delimiter: str, + output_headers: typing.List[str], ) -> None: df = resolve_date_format(df, "%Y-%m-%d %H:%M") - save_to_new_file(df, file_path=str(target_file_batch), sep=",") - append_batch_file(target_file_batch, target_file, skip_header, not (skip_header)) + df = 
reorder_headers(df, output_headers) + save_to_new_file(df=df, file_path=str(target_file_batch), sep=field_delimiter) + append_batch_file( + batch_file_path=target_file_batch, + target_file_path=target_file, + include_header=include_header, + truncate_target_file=truncate_file, + ) + logging.info(f"Processing Batch {target_file_batch} completed") + + +def reorder_headers(df: pd.DataFrame, output_headers: typing.List[str]) -> pd.DataFrame: + logging.info("Reordering headers..") + df = df[output_headers] + return df def resolve_date_format(df: pd.DataFrame, from_format: str) -> pd.DataFrame: - logging.info("Resolving Date Format") for col in df.columns: if df[col].dtype == "datetime64[ns]": logging.info(f"Resolving datetime on {col}") df[col] = df[col].apply(lambda x: convert_dt_format(str(x), from_format)) - return df @@ -236,42 +537,58 @@ def convert_dt_format(dt_str: str, from_format: str) -> str: from_format = "%Y-%m-%d " + from_format.strip().split(" ")[1] else: dt_str = "" - return rtnval -def save_to_new_file(df, file_path, sep="|") -> None: +def save_to_new_file(df: pd.DataFrame, file_path: str, sep: str = "|") -> None: logging.info(f"Saving to file {file_path} separator='{sep}'") df.to_csv(file_path, sep=sep, index=False) def append_batch_file( - batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool + batch_file_path: str, + target_file_path: str, + include_header: bool, + truncate_target_file: bool, ) -> None: + logging.info( + f"Appending file {batch_file_path} to file {target_file_path} with include_header={include_header} and truncate_target_file={truncate_target_file}" + ) with open(batch_file_path, "r") as data_file: - if truncate_file: + if truncate_target_file: target_file = open(target_file_path, "w+").close() with open(target_file_path, "a+") as target_file: - if skip_header: + if not include_header: logging.info( - f"Appending batch file {batch_file_path} to {target_file_path} with skip header" + f"Appending batch file 
{batch_file_path} to {target_file_path} without header" ) next(data_file) else: logging.info( - f"Appending batch file {batch_file_path} to {target_file_path}" + f"Appending batch file {batch_file_path} to {target_file_path} with header" ) target_file.write(data_file.read()) + data_file.close() + target_file.close() if os.path.exists(batch_file_path): os.remove(batch_file_path) -def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: - logging.info(f"Uploading to GCS {gcs_bucket} in {gcs_path}") - storage_client = storage.Client() - bucket = storage_client.bucket(gcs_bucket) - blob = bucket.blob(gcs_path) - blob.upload_from_filename(file_path) +def upload_file_to_gcs( + file_path: pathlib.Path, target_gcs_bucket: str, target_gcs_path: str +) -> None: + if os.path.exists(file_path): + logging.info( + f"Uploading output file {file_path} to gs://{target_gcs_bucket}/{target_gcs_path}" + ) + storage_client = storage.Client() + bucket = storage_client.bucket(target_gcs_bucket) + blob = bucket.blob(target_gcs_path) + blob.upload_from_filename(file_path) + else: + logging.info( + f"Cannot upload file to gs://{target_gcs_bucket}/{target_gcs_path} as it does not exist." 
+ ) if __name__ == "__main__": @@ -279,12 +596,19 @@ def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) main( source_url=os.environ["SOURCE_URL"], - source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(), - target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(), start_year=int(os.environ["START_YEAR"]), + source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(), + project_id=os.environ["PROJECT_ID"], + dataset_id=os.environ["DATASET_ID"], + table_id=os.environ["TABLE_ID"], + year_field_name=os.environ["YEAR_FIELD_NAME"], + year_field_type=os.environ["YEAR_FIELD_TYPE"], + schema_path=os.environ["SCHEMA_PATH"], chunksize=os.environ["CHUNKSIZE"], target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"], target_gcs_path=os.environ["TARGET_GCS_PATH"], - data_names=json.loads(os.environ["DATA_NAMES"]), + pipeline_name=os.environ["PIPELINE_NAME"], + input_headers=json.loads(os.environ["INPUT_CSV_HEADERS"]), data_dtypes=json.loads(os.environ["DATA_DTYPES"]), + output_headers=json.loads(os.environ["OUTPUT_CSV_HEADERS"]), ) diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_annual_summaries_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_annual_summaries_schema.json new file mode 100644 index 000000000..1122a645b --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_annual_summaries_schema.json @@ -0,0 +1,332 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + 
"type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. 
(See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "metric_used", + "type": "STRING", + "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "year", + "type": "INTEGER", + "description": "The year the annual summary data represents.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. 
If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the year.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "completeness_indicator", + "type": "STRING", + "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "valid_day_count", + "type": "INTEGER", + "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days.", + "mode": "NULLABLE" + }, + { + "name": "required_day_count", + "type": "INTEGER", + "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required.", + "mode": "NULLABLE" + }, + { + "name": "exceptional_data_count", + "type": "INTEGER", + "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality).", + "mode": "NULLABLE" + }, + { + "name": "null_data_count", + "type": "INTEGER", + "description": "The count of scheduled samples when no data was collected and the reason for no data was reported.", + "mode": "NULLABLE" + }, + { + "name": "primary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded 
the primary air quality standard.", + "mode": "NULLABLE" + }, + { + "name": "secondary_exceedance_count", + "type": "INTEGER", + "description": "The number of samples during the year that exceeded the secondary air quality standard.", + "mode": "NULLABLE" + }, + { + "name": "certification_indicator", + "type": "STRING", + "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated.", + "mode": "NULLABLE" + }, + { + "name": "num_obs_below_mdl", + "type": "INTEGER", + "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. 
Sometimes these values are replaced by 1/2 the MDL in summary calculations.", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the year.", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_standard_dev", + "type": "FLOAT", + "description": "The standard deviation about the mean of the values for the year.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the year.", + "mode": "NULLABLE" + }, + { + "name": "first_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "second_max_value", + "type": "FLOAT", + "description": "The second highest value for the year.", + "mode": "NULLABLE" + }, + { + "name": "second_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "third_max_value", + "type": "FLOAT", + "description": "The third highest value for the year.", + "mode": "NULLABLE" + }, + { + "name": "third_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "fourth_max_value", + "type": "FLOAT", + "description": "The fourth highest value for the year.", + "mode": "NULLABLE" + }, + { + "name": "fourth_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "first_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the highest value of the year.", + "mode": "NULLABLE" 
+ }, + { + "name": "first_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "second_max_non_overlapping_value", + "type": "FLOAT", + "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value.", + "mode": "NULLABLE" + }, + { + "name": "second_no_max_datetime", + "type": "TIMESTAMP", + "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "ninety_nine_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "ninety_eight_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 98 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "ninety_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "ninety_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "seventy_five_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "fifty_percentile", + "type": "FLOAT", + "description": "The value from this monitor for 
which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median).", + "mode": "NULLABLE" + }, + { + "name": "ten_percentile", + "type": "FLOAT", + "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring site is located. 
This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core-based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": 
"FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core-based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_co_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same 
parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core-based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_hap_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_lead_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_lead_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_lead_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the 
same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_no2_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_nonoxnoy_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { 
+ "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the 
same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_ozone_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_daily_summary_schema.json new file mode 100644 index 000000000..e12675671 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the 
same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm10_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name":
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_frm_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_frm_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_frm_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different 
instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_nonfrm_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE"
+ }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish 
different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pm25_speciation_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": 
"NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_pressure_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { 
+ "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that 
measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_rh_and_dp_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, +
{ + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same 
parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_so2_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name":
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different 
instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. 
QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day.
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_temperature_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE"
+ }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same 
parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_voc_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_daily_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_daily_summary_schema.json new file mode 100644 index 000000000..26f34a30e --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_daily_summary_schema.json @@ -0,0 +1,176 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the 
same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "sample_duration", + "type": "STRING", + "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", + "mode": "NULLABLE" + }, + { + "name": "pollutant_standard", + "type": "STRING", + "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "event_type", + "type": "STRING", + "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data from them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", + "mode": "NULLABLE" + }, + { + "name": "observation_count", + "type": "INTEGER", + "description": "The number of observations (samples) taken during the day.", + "mode": "NULLABLE" + }, + { + "name": "observation_percent", + "type": "FLOAT", + "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", + "mode": "NULLABLE" + }, + { + "name": "arithmetic_mean", + "type": "FLOAT", + "description": "The average (arithmetic mean) value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_value", + "type": "FLOAT", + "description": "The highest value for the day.", + "mode": "NULLABLE" + }, + { + "name": "first_max_hour", + "type": "INTEGER", + "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", + "mode": "NULLABLE" + }, + { + "name": "aqi", + "type": "INTEGER", + "description": "The Air Quality Index for the day for the pollutant, if applicable.", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "INTEGER", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "local_site_name", + "type": "STRING", + "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", + "mode": "NULLABLE" + }, + { + "name": "address", + "type": "STRING", + "description": "The approximate street address of the monitoring site.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "city_name", + "type": "STRING", + "description": "The name of the city where the monitoring 
site is located. This represents the legal incorporated boundaries of cities and not urban areas.", + "mode": "NULLABLE" + }, + { + "name": "cbsa_name", + "type": "STRING", + "description": "The name of the core based statistical area (metropolitan area) where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_hourly_summary_schema.json b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_hourly_summary_schema.json new file mode 100644 index 000000000..1d7519a87 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/epa_wind_hourly_summary_schema.json @@ -0,0 +1,146 @@ +[ + { + "name": "state_code", + "type": "STRING", + "description": "The FIPS code of the state in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "county_code", + "type": "STRING", + "description": "The FIPS code of the county in which the monitor resides.", + "mode": "NULLABLE" + }, + { + "name": "site_num", + "type": "STRING", + "description": "A unique number within the county identifying the site.", + "mode": "NULLABLE" + }, + { + "name": "parameter_code", + "type": "INTEGER", + "description": "The AQS code corresponding to the parameter measured by the monitor.", + "mode": "NULLABLE" + }, + { + "name": "poc", + "type": "INTEGER", + "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": 
"longitude", + "type": "FLOAT", + "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", + "mode": "NULLABLE" + }, + { + "name": "datum", + "type": "STRING", + "description": "The Datum associated with the Latitude and Longitude measures.", + "mode": "NULLABLE" + }, + { + "name": "parameter_name", + "type": "STRING", + "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", + "mode": "NULLABLE" + }, + { + "name": "date_local", + "type": "TIMESTAMP", + "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", + "mode": "NULLABLE" + }, + { + "name": "time_local", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", + "mode": "NULLABLE" + }, + { + "name": "date_gmt", + "type": "TIMESTAMP", + "description": "The calendar date of the sample in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "time_gmt", + "type": "STRING", + "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", + "mode": "NULLABLE" + }, + { + "name": "sample_measurement", + "type": "FLOAT", + "description": "The measured value in the standard units of measure for the parameter.", + "mode": "NULLABLE" + }, + { + "name": "units_of_measure", + "type": "STRING", + "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", + "mode": "NULLABLE" + }, + { + "name": "mdl", + "type": "FLOAT", + "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", + "mode": "NULLABLE" + }, + { + "name": "uncertainty", + "type": "FLOAT", + "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", + "mode": "NULLABLE" + }, + { + "name": "qualifier", + "type": "STRING", + "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", + "mode": "NULLABLE" + }, + { + "name": "method_type", + "type": "STRING", + "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", + "mode": "NULLABLE" + }, + { + "name": "method_code", + "type": "STRING", + "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", + "mode": "NULLABLE" + }, + { + "name": "method_name", + "type": "STRING", + "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", + "mode": "NULLABLE" + }, + { + "name": "state_name", + "type": "STRING", + "description": "The name of the state where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "county_name", + "type": "STRING", + "description": "The name of the county where the monitoring site is located.", + "mode": "NULLABLE" + }, + { + "name": "date_of_last_change", + "type": "TIMESTAMP", + "description": "The date the last time any numeric values in this record were updated in the AQS data system.", + "mode": "NULLABLE" + } +] diff --git a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/requirements.txt b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/requirements.txt index f36704793..f87f393f3 100644 --- a/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/requirements.txt +++ b/datasets/epa_historical_air_quality/pipelines/_images/run_csv_transform_kub/requirements.txt @@ -1,3 +1,4 @@ -requests -pandas google-cloud-storage +google-cloud-bigquery +pandas +requests diff --git a/datasets/epa_historical_air_quality/pipelines/annual_summaries/annual_summaries_dag.py b/datasets/epa_historical_air_quality/pipelines/annual_summaries/annual_summaries_dag.py deleted file mode 100644 index 29689edc9..000000000 --- a/datasets/epa_historical_air_quality/pipelines/annual_summaries/annual_summaries_dag.py +++ /dev/null @@ -1,408 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.annual_summaries", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 0 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="annual_summaries", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip", - "START_YEAR": "1980", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/annual_summaries/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", 
"exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str",\n "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": 
"float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.annual_summaries }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "metric_used", - "type": "STRING", - "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. 
For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "year", - "type": "INTEGER", - "description": "The year the annual summary data represents.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the year.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "completeness_indicator", - "type": "STRING", - "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "valid_day_count", - "type": "INTEGER", - "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days.", - "mode": "NULLABLE", - }, - { - "name": "required_day_count", - "type": "INTEGER", - "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required.", - "mode": "NULLABLE", - }, - { - "name": "exceptional_data_count", - "type": "INTEGER", - "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality).", - "mode": "NULLABLE", - }, - { - "name": "null_data_count", - "type": "INTEGER", - "description": "The count of scheduled samples when no data was collected and the reason for no data was reported.", - "mode": "NULLABLE", - }, - { - "name": "primary_exceedance_count", - "type": "INTEGER", - "description": "The number of samples during the year that exceeded the primary air quality standard.", - "mode": "NULLABLE", - }, - { - "name": "secondary_exceedance_count", - "type": "INTEGER", - "description": "The number of samples during the year that exceeded the secondary air quality standard.", - "mode": "NULLABLE", - }, - { - "name": "certification_indicator", - "type": "STRING", - "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. 
Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated.", - "mode": "NULLABLE", - }, - { - "name": "num_obs_below_mdl", - "type": "INTEGER", - "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. Sometimes these values are replaced by 1/2 the MDL in summary calculations.", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the year.", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_standard_dev", - "type": "FLOAT", - "description": "The standard deviation about the mean of the values for the year.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the year.", - "mode": "NULLABLE", - }, - { - "name": "first_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "second_max_value", - "type": "FLOAT", - "description": "The second highest value for the year.", - "mode": "NULLABLE", - }, - { - "name": "second_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken.", - "mode": 
"NULLABLE", - }, - { - "name": "third_max_value", - "type": "FLOAT", - "description": "The third highest value for the year.", - "mode": "NULLABLE", - }, - { - "name": "third_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "fourth_max_value", - "type": "FLOAT", - "description": "The fourth highest value for the year.", - "mode": "NULLABLE", - }, - { - "name": "fourth_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "first_max_non_overlapping_value", - "type": "FLOAT", - "description": "For 8-hour CO averages, the highest value of the year.", - "mode": "NULLABLE", - }, - { - "name": "first_no_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "second_max_non_overlapping_value", - "type": "FLOAT", - "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value.", - "mode": "NULLABLE", - }, - { - "name": "second_no_max_datetime", - "type": "TIMESTAMP", - "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "ninety_nine_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "ninety_eight_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 
98 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "ninety_five_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "ninety_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "seventy_five_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "fifty_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median).", - "mode": "NULLABLE", - }, - { - "name": "ten_percentile", - "type": "FLOAT", - "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - 
"description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/annual_summaries/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/annual_summaries/pipeline.yaml deleted file mode 100644 index 57c25890d..000000000 --- a/datasets/epa_historical_air_quality/pipelines/annual_summaries/pipeline.yaml +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "annual_summaries" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: annual_summaries - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 0 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "annual_summaries" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/annual_conc_by_monitor_YEAR_ITERATOR.zip" - START_YEAR: "1980" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/annual_summaries/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", - "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", - "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", - "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", - "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", - "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", - "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", 
"ninety_five_percentile", "ninety_percentile", - "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", - "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", - "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", - "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str", - "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", - "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", - "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", - "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", - "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", - "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - 
request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/annual_summaries/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.annual_summaries }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the 'Parameter Occurrence Code' used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. 
Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "metric_used" - "type": "STRING" - "description": "The base metric used in the calculation of the aggregate statistics presented in the remainder of the row. For example, if this is Daily Maximum, then the value in the Mean column is the mean of the daily maximums." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "year" - "type": "INTEGER" - "description": "The year the annual summary data represents." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. 
A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the year." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the year. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "completeness_indicator" - "type": "STRING" - "description": "An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter." - "mode": "NULLABLE" - - "name": "valid_day_count" - "type": "INTEGER" - "description": "The number of days during the year where the daily monitoring criteria were met, if the calculation of the summaries is based on valid days." - "mode": "NULLABLE" - - "name": "required_day_count" - "type": "INTEGER" - "description": "The number of days during the year which the monitor was scheduled to take samples if measurements are required." 
- "mode": "NULLABLE" - - "name": "exceptional_data_count" - "type": "INTEGER" - "description": "The number of data points in the annual data set affected by exceptional air quality events (things outside the norm that affect air quality)." - "mode": "NULLABLE" - - "name": "null_data_count" - "type": "INTEGER" - "description": "The count of scheduled samples when no data was collected and the reason for no data was reported." - "mode": "NULLABLE" - - "name": "primary_exceedance_count" - "type": "INTEGER" - "description": "The number of samples during the year that exceeded the primary air quality standard." - "mode": "NULLABLE" - - "name": "secondary_exceedance_count" - "type": "INTEGER" - "description": "The number of samples during the year that exceeded the secondary air quality standard." - "mode": "NULLABLE" - - "name": "certification_indicator" - "type": "STRING" - "description": "An indication whether the completeness and accuracy of the information on the annual summary record has been certified by the submitter. Certified means the submitter has certified the data (due May 01 the year after collection). Certification not required means that the parameter does not require certification or the deadline has not yet passed. Uncertified (past due) means that certification is required but is overdue. Requested but not yet concurred means the submitter has completed the process, but EPA has not yet acted to certify the data. Requested but denied means the submitter has completed the process, but EPA has denied the request for cause. Was certified but data changed means the data was certified but data was replaced and the process has not been repeated." - "mode": "NULLABLE" - - "name": "num_obs_below_mdl" - "type": "INTEGER" - "description": "The number of samples reported during the year that were below the method detection limit (MDL) for the monitoring instrument. Sometimes these values are replaced by 1/2 the MDL in summary calculations." 
- "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the year." - "mode": "NULLABLE" - - "name": "arithmetic_standard_dev" - "type": "FLOAT" - "description": "The standard deviation about the mean of the values for the year." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the year." - "mode": "NULLABLE" - - "name": "first_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the highest value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "second_max_value" - "type": "FLOAT" - "description": "The second highest value for the year." - "mode": "NULLABLE" - - "name": "second_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the second highest value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "third_max_value" - "type": "FLOAT" - "description": "The third highest value for the year." - "mode": "NULLABLE" - - "name": "third_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the third highest value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "fourth_max_value" - "type": "FLOAT" - "description": "The fourth highest value for the year." - "mode": "NULLABLE" - - "name": "fourth_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the fourth highest value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "first_max_non_overlapping_value" - "type": "FLOAT" - "description": "For 8-hour CO averages, the highest value of the year." 
- "mode": "NULLABLE" - - "name": "first_no_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the first maximum non overlapping value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "second_max_non_overlapping_value" - "type": "FLOAT" - "description": "For 8-hour CO averages, the second highest value of the year that does not share any hours with the 8-hour period of the first max non overlapping value." - "mode": "NULLABLE" - - "name": "second_no_max_datetime" - "type": "TIMESTAMP" - "description": "The date and time (on a 24-hour clock) when the second maximum non overlapping value for the year (the previous field) was taken." - "mode": "NULLABLE" - - "name": "ninety_nine_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 99 per cent of the rest of the measured values for the year are equal to or less than." - "mode": "NULLABLE" - - "name": "ninety_eight_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 98 per cent of the rest of the measured values for the year are equal to or less than." - "mode": "NULLABLE" - - "name": "ninety_five_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 95 per cent of the rest of the measured values for the year are equal to or less than." - "mode": "NULLABLE" - - "name": "ninety_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 90 per cent of the rest of the measured values for the year are equal to or less than." - "mode": "NULLABLE" - - "name": "seventy_five_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 75 per cent of the rest of the measured values for the year are equal to or less than." 
- "mode": "NULLABLE" - - "name": "fifty_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 50 per cent of the rest of the measured values for the year are equal to or less than (i.e., the median)." - "mode": "NULLABLE" - - "name": "ten_percentile" - "type": "FLOAT" - "description": "The value from this monitor for which 10 per cent of the rest of the measured values for the year are equal to or less than." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/co_daily_summary/co_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/co_daily_summary/co_daily_summary_dag.py deleted file mode 100644 index d08ced89b..000000000 --- a/datasets/epa_historical_air_quality/pipelines/co_daily_summary/co_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.co_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 0 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="co_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.annual_summaries }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/co_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/co_daily_summary/pipeline.yaml deleted file mode 100644 index 0ffb8baaa..000000000 --- a/datasets/epa_historical_air_quality/pipelines/co_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "co_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: co_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 0 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "co_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42101_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/co_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": "float64", 
"first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/co_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.annual_summaries }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/co_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/co_hourly_summary_dag.py deleted file mode 100644 index dbe5211fd..000000000 --- a/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/co_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.co_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 1 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="co_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str",\n "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "str", "qualifier": 
"str", "method_type": "str", "method_code": "str",\n "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.co_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - 
"name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/pipeline.yaml deleted file mode 100644 index eb6055040..000000000 --- a/datasets/epa_historical_air_quality/pipelines/co_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "co_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: co_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 1 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "co_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42101_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", - "method_name", "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str", - "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", - "method_name": "str", "state_name": "str", "date_of_last_change": 
"datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/co_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.co_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/epa_historical_air_quality_dag.py b/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/epa_historical_air_quality_dag.py new file mode 100644 index 000000000..7736f0890 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/epa_historical_air_quality_dag.py @@ -0,0 +1,1086 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from airflow import DAG +from airflow.providers.google.cloud.operators import kubernetes_engine + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="epa_historical_air_quality.epa_historical_air_quality", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 1 * * 6", + catchup=False, + default_view="graph", +) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "epa-hist-air-quality", + "initial_node_count": 8, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) + + # Run CSV transform within kubernetes pod + annual_summaries = kubernetes_engine.GKEStartPodOperator( + task_id="annual_summaries", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.annual_summary.source_url }}", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.annual_summary.table_id }}", + "YEAR_FIELD_NAME": "year", + "YEAR_FIELD_TYPE": "INT", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.annual_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.annual_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + 
"TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.annual_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - annual_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str",\n "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32",\n "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str",\n 
"certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64",\n "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]",\n "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64",\n "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64",\n "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str",\n "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure",\n "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count",\n "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count",\n "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value",\n "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime",\n "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value",\n "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile",\n 
"seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address",\n "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + co_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="co_daily_summary", + startup_timeout_seconds=600, + name="load_co_daily_summary", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.co_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.co_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.co_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.co_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.co_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - co_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", 
"city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "str", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + co_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="co_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.co_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ 
var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.co_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.co_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.co_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.co_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - co_hourly_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str",\n "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str",\n "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code",\n "method_name", "state_name", "county_name", "date_of_last_change" ]', + }, + 
resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + hap_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="hap_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.hap_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.hap_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.hap_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.hap_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.hap_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - hap_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": 
"float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + hap_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="hap_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.hap_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.hap_hourly_summary.table_id }}", + 
"YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.hap_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.hap_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.hap_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - hap_hourly_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + lead_daily_summary = 
kubernetes_engine.GKEStartPodOperator( + task_id="lead_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.lead_daily_summary.source_url }}", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.lead_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.lead_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.lead_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.lead_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - lead_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + no2_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="no2_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.no2_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.no2_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ 
var.json.epa_historical_air_quality.no2_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.no2_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.no2_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - no2_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", 
"local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + no2_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="no2_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.no2_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.no2_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.no2_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.no2_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.no2_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - no2_hourly", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n 
"latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + nonoxnoy_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="nonoxnoy_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ 
var.json.epa_historical_air_quality.nonoxnoy_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - nonoxnoy_daily", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", 
"date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + nonoxnoy_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="nonoxnoy_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - nonoxnoy_hourly", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": 
"str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + ozone_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="ozone_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.ozone_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.ozone_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.ozone_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.ozone_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ 
var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.ozone_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - ozone_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, 
+ ) + + # Run CSV transform within kubernetes pod + ozone_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="ozone_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.source_url }}", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - ozone_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", 
"time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm10_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm10_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pm10_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm10_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm10_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm10_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm10_daily_summary.target_gcs_path }}", + 
"PIPELINE_NAME": "epa_historical_air_quality - pm10_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm10_hourly_summary = kubernetes_engine.GKEStartPodOperator( + 
task_id="pm10_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pm10_hourly_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm25_frm_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm25_frm_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.source_url }}", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pm25_frm_hourly_summaries", + 
"INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm25_nonfrm_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm25_nonfrm_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ 
var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.source_url }}", + "START_YEAR": "1980", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pm25_nonfrm_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": 
"str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm25_nonfrm_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm25_nonfrm_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - 
pm25_nonfrm_hourly_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm25_speciation_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm25_speciation_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ 
var.json.epa_historical_air_quality.pm25_speciation_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pm25_speciation_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": 
"str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pm25_speciation_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pm25_speciation_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.target_gcs_path }}", + 
"PIPELINE_NAME": "epa_historical_air_quality - pm25_speciation_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pressure_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pressure_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + 
"SOURCE_URL": "{{ var.json.epa_historical_air_quality.pressure_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pressure_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pressure_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pressure_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pressure_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pressure_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": 
"str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + pressure_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="pressure_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - pressure_hourly_summary", 
+ "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + rh_and_dp_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="rh_and_dp_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ 
var.json.epa_historical_air_quality.rh_and_dp_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - rh_and_dp_daily_summaries", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", 
"city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + rh_and_dp_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="rh_and_dp_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - 
rh_and_dp_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + so2_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="so2_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ 
var.json.epa_historical_air_quality.so2_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.so2_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.so2_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.so2_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.so2_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - so2_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", 
"date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + so2_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="so2_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.so2_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.so2_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.so2_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.so2_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.so2_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - so2_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", 
"poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + temperature_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="temperature_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.temperature_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + 
"PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.temperature_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.temperature_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.temperature_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.temperature_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - temperature_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", 
"county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + temperature_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="temperature_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - temperature_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", 
"longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + voc_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="voc_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.voc_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + 
"DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.voc_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.voc_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.voc_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.voc_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - voc_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", 
"parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + voc_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="voc_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.voc_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.voc_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.voc_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.voc_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.voc_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - voc_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", 
"uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + wind_daily_summary = kubernetes_engine.GKEStartPodOperator( + task_id="wind_daily_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.wind_daily_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ 
var.json.epa_historical_air_quality.wind_daily_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.wind_daily_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.wind_daily_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.wind_daily_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - wind_daily_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local",
"units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + + # Run CSV transform within kubernetes pod + wind_hourly_summary = kubernetes_engine.GKEStartPodOperator( + task_id="wind_hourly_summary", + startup_timeout_seconds=600, + name="load_data", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="epa-hist-air-quality", + image_pull_policy="Always", + image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_URL": "{{ var.json.epa_historical_air_quality.wind_hourly_summary.source_url }}", + "START_YEAR": "1990", + "SOURCE_FILE": "files/data.csv", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "{{ var.json.epa_historical_air_quality.dataset_id }}", + "TABLE_ID": "{{ var.json.epa_historical_air_quality.wind_hourly_summary.table_id }}", + "YEAR_FIELD_NAME": "date_local", + "YEAR_FIELD_TYPE": "DATETIME", + "SCHEMA_PATH": "{{ var.json.epa_historical_air_quality.wind_hourly_summary.schema_path }}", + "CHUNKSIZE": "{{ var.json.epa_historical_air_quality.wind_hourly_summary.chunk_size }}", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "{{ var.json.epa_historical_air_quality.wind_hourly_summary.target_gcs_path }}", + "PIPELINE_NAME": "epa_historical_air_quality - wind_hourly_summary", + "INPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n 
"state_name", "county_name", "date_of_last_change" ]', + "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', + "OUTPUT_CSV_HEADERS": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', + }, + resources={"request_ephemeral_storage": "16G", "request_cpu": "1"}, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="epa-hist-air-quality", + ) + + ( + create_cluster + >> [ + annual_summaries, + co_daily_summary, + co_hourly_summary, + hap_daily_summary, + hap_hourly_summary, + lead_daily_summary, + no2_daily_summary, + no2_hourly_summary, + nonoxnoy_daily_summary, + nonoxnoy_hourly_summary, + ozone_daily_summary, + ozone_hourly_summary, + pm10_daily_summary, + pm10_hourly_summary, + pm25_frm_hourly_summary, + pm25_nonfrm_daily_summary, + pm25_nonfrm_hourly_summary, + pm25_speciation_daily_summary, + pm25_speciation_hourly_summary, + pressure_daily_summary, + pressure_hourly_summary, + rh_and_dp_daily_summary, + rh_and_dp_hourly_summary, + so2_daily_summary, + so2_hourly_summary, + temperature_daily_summary, + 
temperature_hourly_summary, + voc_daily_summary, + voc_hourly_summary, + wind_daily_summary, + wind_hourly_summary, + ] + >> delete_cluster + ) diff --git a/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/pipeline.yaml new file mode 100644 index 000000000..fc13b8052 --- /dev/null +++ b/datasets/epa_historical_air_quality/pipelines/epa_historical_air_quality/pipeline.yaml @@ -0,0 +1,1699 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + - type: bigquery_table + table_id: "annual_summaries" + description: "epaspc" + - type: bigquery_table + table_id: "co_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "co_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "hap_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "hap_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "lead_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "no2_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "no2_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "nonoxnoy_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "nonoxnoy_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "ozone_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "ozone_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm10_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm10_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm25_frm_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm25_nonfrm_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm25_nonfrm_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm25_speciation_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pm25_speciation_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pressure_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "pressure_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "rh_and_dp_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "rh_and_dp_hourly_summary" + description: "epaspc" + - type: bigquery_table 
+ table_id: "so2_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "so2_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "temperature_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "temperature_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "voc_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "voc_hourly_summary" + description: "epaspc" + - type: bigquery_table + table_id: "wind_daily_summary" + description: "epaspc" + - type: bigquery_table + table_id: "wind_hourly_summary" + description: "epaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: epa_historical_air_quality + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 1 * * 6" + catchup: False + default_view: graph + + tasks: + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: epa-hist-air-quality + initial_node_count: 8 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "annual_summaries" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.annual_summary.source_url }}" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ 
var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.annual_summary.table_id }}" + YEAR_FIELD_NAME: "year" + YEAR_FIELD_TYPE: "INT" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.annual_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.annual_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.annual_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - annual_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", + "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", + "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", + "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", "arithmetic_standard_dev", "first_max_value", + "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", + "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", + "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", + "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", + "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", 
"metric_used": "str", "method_name": "str", "year": "int32", "units_of_measure": "str", + "event_type": "str", "observation_count": "int32", "observation_percent": "float64", "completeness_indicator": "str", "valid_day_count": "int32", + "required_day_count": "int32", "exceptional_data_count": "int32", "null_data_count": "int32", "primary_exceedance_count": "str", "secondary_exceedance_count": "str", + "certification_indicator": "str", "num_obs_below_mdl": "int32", "arithmetic_mean": "float64", "arithmetic_standard_dev": "float64", "first_max_value": "float64", + "first_max_datetime": "datetime64[ns]", "second_max_value": "float64", "second_max_datetime": "datetime64[ns]", "third_max_value": "float64", "third_max_datetime": "datetime64[ns]", + "fourth_max_value": "float64", "fourth_max_datetime": "datetime64[ns]", "first_max_non_overlapping_value": "float64", "first_no_max_datetime": "datetime64[ns]", "second_max_non_overlapping_value": "float64", + "second_no_max_datetime": "datetime64[ns]", "ninety_nine_percentile": "float64", "ninety_eight_percentile": "float64", "ninety_five_percentile": "float64", "ninety_percentile": "float64", + "seventy_five_percentile": "float64", "fifty_percentile": "float64", "ten_percentile": "float64", "local_site_name": "str", "address": "str", + "state_name": "str", "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "metric_used", "method_name", "year", "units_of_measure", + "event_type", "observation_count", "observation_percent", "completeness_indicator", "valid_day_count", + "required_day_count", "exceptional_data_count", "null_data_count", "primary_exceedance_count", "secondary_exceedance_count", + "certification_indicator", "num_obs_below_mdl", "arithmetic_mean", 
"arithmetic_standard_dev", "first_max_value", + "first_max_datetime", "second_max_value", "second_max_datetime", "third_max_value", "third_max_datetime", + "fourth_max_value", "fourth_max_datetime", "first_max_non_overlapping_value", "first_no_max_datetime", "second_max_non_overlapping_value", + "second_no_max_datetime", "ninety_nine_percentile", "ninety_eight_percentile", "ninety_five_percentile", "ninety_percentile", + "seventy_five_percentile", "fifty_percentile", "ten_percentile", "local_site_name", "address", + "state_name", "county_name", "city_name", "cbsa_name", "date_of_last_change"] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "co_daily_summary" + startup_timeout_seconds: 600 + name: "load_co_daily_summary" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.co_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.co_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.co_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.co_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.co_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - co_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", 
"parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "str", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "co_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + 
cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.co_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.co_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.co_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.co_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.co_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - co_hourly_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", + "method_name", "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "str", "longitude": "str", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", "time_local": "str", + "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "str", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "str", "qualifier": "str", "method_type": "str", "method_code": "str", + "method_name": "str", "state_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", 
"poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", + "method_name", "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "hap_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.hap_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.hap_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.hap_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.hap_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.hap_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - hap_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", 
"local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "hap_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.hap_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: 
"files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.hap_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.hap_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.hap_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.hap_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - hap_hourly_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + 
"state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "lead_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.lead_daily_summary.source_url }}" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.lead_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.lead_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.lead_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.lead_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - lead_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + 
"latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "no2_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.no2_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.no2_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + 
YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.no2_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.no2_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.no2_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - no2_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", 
"aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "no2_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.no2_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.no2_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.no2_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.no2_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.no2_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - no2_hourly" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", 
"longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "nonoxnoy_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ 
var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.nonoxnoy_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - nonoxnoy_daily" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: 
"GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "nonoxnoy_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.nonoxnoy_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - nonoxnoy_hourly" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", 
"units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "ozone_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.ozone_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.ozone_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.ozone_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.ozone_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.ozone_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - ozone_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", 
"county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "ozone_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project 
}}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.source_url }}" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.ozone_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - ozone_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + 
OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm10_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm10_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm10_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm10_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm10_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm10_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm10_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", 
"arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm10_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ 
var.json.epa_historical_air_quality.pm10_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm10_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm10_hourly_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", 
"units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm25_frm_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.source_url }}" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm25_frm_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm25_frm_hourly_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": 
"int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm25_nonfrm_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.source_url }}" + START_YEAR: "1980" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.schema_path }}" + CHUNKSIZE: "{{ 
var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm25_nonfrm_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", 
"cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm25_nonfrm_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm25_nonfrm_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm25_nonfrm_hourly_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", 
"parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm25_speciation_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" 
+ TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm25_speciation_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm25_speciation_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: 
"GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pm25_speciation_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pm25_speciation_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pm25_speciation_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": 
"datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pressure_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pressure_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pressure_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pressure_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pressure_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pressure_daily_summary.target_gcs_path }}" + PIPELINE_NAME: 
"epa_historical_air_quality - pressure_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "pressure_hourly_summary" + 
startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.pressure_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - pressure_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", 
"method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "rh_and_dp_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.rh_and_dp_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - rh_and_dp_daily_summaries" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", 
"parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "rh_and_dp_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + 
image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.rh_and_dp_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - rh_and_dp_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", 
"county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "so2_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.so2_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.so2_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.so2_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.so2_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.so2_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - so2_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + 
"method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "so2_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.so2_hourly_summary.source_url }}" + START_YEAR: 
"1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.so2_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.so2_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.so2_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.so2_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - so2_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", 
"method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "temperature_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.temperature_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.temperature_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.temperature_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.temperature_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.temperature_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - temperature_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", 
"site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "temperature_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ 
var.json.epa_historical_air_quality.temperature_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.temperature_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - temperature_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" 
+ + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "voc_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.voc_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.voc_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.voc_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.voc_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.voc_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - voc_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + 
"pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "voc_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.voc_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.voc_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.voc_hourly_summary.schema_path }}" + 
CHUNKSIZE: "{{ var.json.epa_historical_air_quality.voc_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.voc_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - voc_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "wind_daily_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + 
location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.wind_daily_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.wind_daily_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.wind_daily_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.wind_daily_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.wind_daily_summary.target_gcs_path }}" + PIPELINE_NAME: "epa_historical_air_quality - wind_daily_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", + "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", + "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32",
"aqi": "str", + "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", + "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "sample_duration", + "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", + "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", + "method_code", "method_name", "local_site_name", "address", "state_name", + "county_name", "city_name", "cbsa_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEStartPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "wind_hourly_summary" + startup_timeout_seconds: 600 + name: "load_data" + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: epa-hist-air-quality + image_pull_policy: "Always" + image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_URL: "{{ var.json.epa_historical_air_quality.wind_hourly_summary.source_url }}" + START_YEAR: "1990" + SOURCE_FILE: "files/data.csv" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "{{ var.json.epa_historical_air_quality.dataset_id }}" + TABLE_ID: "{{ var.json.epa_historical_air_quality.wind_hourly_summary.table_id }}" + YEAR_FIELD_NAME: "date_local" + YEAR_FIELD_TYPE: "DATETIME" + SCHEMA_PATH: "{{ var.json.epa_historical_air_quality.wind_hourly_summary.schema_path }}" + CHUNKSIZE: "{{ var.json.epa_historical_air_quality.wind_hourly_summary.chunk_size }}" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "{{ var.json.epa_historical_air_quality.wind_hourly_summary.target_gcs_path }}" + PIPELINE_NAME: 
"epa_historical_air_quality - wind_hourly_summary" + INPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + DATA_DTYPES: >- + { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", + "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", + "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", + "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", + "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } + OUTPUT_CSV_HEADERS: >- + [ "state_code", "county_code", "site_num", "parameter_code", "poc", + "latitude", "longitude", "datum", "parameter_name", "date_local", + "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", + "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", + "state_name", "county_name", "date_of_last_change" ] + resources: + request_ephemeral_storage: "16G" + request_cpu: "1" + + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: epa-hist-air-quality + + graph_paths: + - "create_cluster >> [ annual_summaries, co_daily_summary, co_hourly_summary, hap_daily_summary, hap_hourly_summary, lead_daily_summary, no2_daily_summary, no2_hourly_summary, nonoxnoy_daily_summary, nonoxnoy_hourly_summary, ozone_daily_summary, ozone_hourly_summary, pm10_daily_summary, pm10_hourly_summary, 
pm25_frm_hourly_summary, pm25_nonfrm_daily_summary, pm25_nonfrm_hourly_summary, pm25_speciation_daily_summary, pm25_speciation_hourly_summary, pressure_daily_summary, pressure_hourly_summary, rh_and_dp_daily_summary, rh_and_dp_hourly_summary, so2_daily_summary, so2_hourly_summary, temperature_daily_summary, temperature_hourly_summary, voc_daily_summary, voc_hourly_summary, wind_daily_summary, wind_hourly_summary ] >> delete_cluster" diff --git a/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/hap_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/hap_daily_summary_dag.py deleted file mode 100644 index c78023b63..000000000 --- a/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/hap_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.hap_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 1 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="hap_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.hap_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/pipeline.yaml deleted file mode 100644 index 80b1c37bd..000000000 --- a/datasets/epa_historical_air_quality/pipelines/hap_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "hap_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: hap_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 1 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "hap_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_HAPS_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/hap_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.hap_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/hap_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/hap_hourly_summary_dag.py deleted file mode 100644 index aa2ed5983..000000000 --- a/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/hap_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.hap_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 2 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="hap_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip", - "START_YEAR": "1993", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", 
"qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.hap_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", 
- "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/pipeline.yaml deleted file mode 100644 index 5c21760f7..000000000 --- a/datasets/epa_historical_air_quality/pipelines/hap_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "hap_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: hap_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 2 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "hap_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_HAPS_YEAR_ITERATOR.zip" - START_YEAR: "1993" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", "county_name": 
"str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/hap_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.hap_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/lead_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/lead_daily_summary_dag.py deleted file mode 100644 index cb0d0c096..000000000 --- a/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/lead_daily_summary_dag.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.lead_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 2 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="lead_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={"request_memory": "4G", "request_cpu": "1"}, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.lead_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured 
in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/pipeline.yaml deleted file mode 100644 index 9ce5c1b05..000000000 --- a/datasets/epa_historical_air_quality/pipelines/lead_daily_summary/pipeline.yaml +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "lead_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: lead_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 2 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "lead_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_LEAD_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "4G" - request_cpu: "1" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/lead_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.lead_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/no2_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/no2_daily_summary_dag.py deleted file mode 100644 index e71d19d66..000000000 --- a/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/no2_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.no2_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 3 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="no2_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.no2_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/pipeline.yaml deleted file mode 100644 index 9b7cd9d73..000000000 --- a/datasets/epa_historical_air_quality/pipelines/no2_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "no2_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: no2_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 3 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "no2_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42602_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/no2_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.no2_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/no2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/no2_hourly_summary_dag.py deleted file mode 100644 index e89ef6d8e..000000000 --- a/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/no2_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.no2_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 3 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="no2_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.no2_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal 
degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/pipeline.yaml deleted file mode 100644 index d812df9dd..000000000 --- a/datasets/epa_historical_air_quality/pipelines/no2_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "no2_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: no2_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 3 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "no2_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42602_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", 
"county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/no2_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.no2_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py deleted file mode 100644 index e8885d56d..000000000 --- a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/nonoxnoy_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.nonoxnoy_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 4 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="nonoxnoy_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.nonoxnoy_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/pipeline.yaml deleted file mode 100644 index 5cae906cd..000000000 --- a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "nonoxnoy_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: nonoxnoy_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 4 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "nonoxnoy_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_NONOxNOy_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", 
"arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/nonoxnoy_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.nonoxnoy_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py deleted file mode 100644 index 6ee5047a8..000000000 --- a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/nonoxnoy_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.nonoxnoy_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 4 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="no2_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.nonoxnoy_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime 
meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/pipeline.yaml deleted file mode 100644 index 3478dd2b4..000000000 --- a/datasets/epa_historical_air_quality/pipelines/nonoxnoy_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "nonoxnoy_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: nonoxnoy_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 4 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "no2_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_NONOxNOy_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": 
"str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/nonoxnoy_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.nonoxnoy_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/ozone_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/ozone_daily_summary_dag.py deleted file mode 100644 index 952618f78..000000000 --- a/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/ozone_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.ozone_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 5 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="ozone_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.ozone_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring 
site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/pipeline.yaml deleted file mode 100644 index 0bf4aaf25..000000000 --- a/datasets/epa_historical_air_quality/pipelines/ozone_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "ozone_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: ozone_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 5 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "ozone_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_44201_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/ozone_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.ozone_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/ozone_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/ozone_hourly_summary_dag.py deleted file mode 100644 index aa9bb9d03..000000000 --- a/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/ozone_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.ozone_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 5 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="ozone_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.ozone_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in 
decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/pipeline.yaml deleted file mode 100644 index ff5018282..000000000 --- a/datasets/epa_historical_air_quality/pipelines/ozone_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "ozone_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: ozone_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 5 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "ozone_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", 
"county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/ozone_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.ozone_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pipeline.yaml deleted file mode 100644 index 4a54ce46f..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - table_id: "pm10_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm10_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 6 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm10_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: 
"data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm10_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code 
of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." 
- "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." 
- "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." - "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." 
- "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." - "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pm10_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pm10_daily_summary_dag.py deleted file mode 100644 index efcbdc591..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm10_daily_summary/pm10_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm10_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 6 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm10_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_81102_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm10_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm10_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pipeline.yaml deleted file mode 100644 index 72f5253ab..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm10_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm10_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 6 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm10_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", 
"county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm10_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pm10_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pm10_hourly_summary_dag.py deleted file mode 100644 index 97d2a86b8..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm10_hourly_summary/pm10_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm10_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 6 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm10_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_44201_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm10_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm10_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in 
decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pipeline.yaml deleted file mode 100644 index 26b6b3d4c..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm25_frm_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm25_frm_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 7 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm25_frm_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": 
"str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm25_frm_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py deleted file mode 100644 index 1126982ff..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_frm_hourly_summary/pm25_frm_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm25_frm_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 7 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm25_frm_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88101_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm25_frm_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm25_frm_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime 
meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pipeline.yaml deleted file mode 100644 index df2f29802..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm25_nonfrm_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm25_nonfrm_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 7 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm25_nonfrm_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": 
"float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm25_nonfrm_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py deleted file mode 100644 index d0382601b..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_daily_summary/pm25_nonfrm_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm25_nonfrm_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 7 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm25_nonfrm_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_88502_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm25_nonfrm_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm25_nonfrm_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pipeline.yaml deleted file mode 100644 index 98d8564be..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm25_nonfrm_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm25_nonfrm_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 8 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm25_nonfrm_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - 
"state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm25_nonfrm_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py deleted file mode 100644 index da3533490..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_nonfrm_hourly_summary/pm25_nonfrm_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm25_nonfrm_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 8 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm25_nonfrm_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_88502_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm25_nonfrm_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm25_nonfrm_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the 
prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pipeline.yaml deleted file mode 100644 index 4dfc565ec..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm25_speciation_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm25_speciation_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 8 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm25_speciation_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - 
"observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm25_speciation_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py deleted file mode 100644 index 87aa22f81..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_daily_summary/pm25_speciation_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm25_speciation_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 8 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm25_speciation_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_SPEC_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", 
"date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm25_speciation_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm25_speciation_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - 
"type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pipeline.yaml deleted file mode 100644 index 4f62088f4..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pm25_speciation_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pm25_speciation_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 9 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pm25_speciation_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", 
"method_name": "str", - "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pm25_speciation_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py deleted file mode 100644 index eb1fa7c82..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pm25_speciation_hourly_summary/pm25_speciation_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pm25_speciation_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 9 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pm25_speciation_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_SPEC_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": 
"float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pm25_speciation_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pm25_speciation_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular 
distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pipeline.yaml deleted file mode 100644 index ba81057e9..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pressure_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pressure_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 9 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pressure_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", 
"arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pressure_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pressure_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pressure_daily_summary_dag.py deleted file mode 100644 index 57b519dff..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pressure_daily_summary/pressure_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pressure_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 9 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pressure_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_PRESS_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pressure_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pressure_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pipeline.yaml deleted file mode 100644 index 4e9e1d214..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "pressure_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: pressure_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 10 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "pressure_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": 
"str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.pressure_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pressure_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pressure_hourly_summary_dag.py deleted file mode 100644 index cc0676430..000000000 --- a/datasets/epa_historical_air_quality/pipelines/pressure_hourly_summary/pressure_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.pressure_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 10 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="pressure_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_PRESS_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/pressure_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.pressure_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime 
meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/pipeline.yaml deleted file mode 100644 index e1dd45181..000000000 --- a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "rh_and_dp_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: rh_and_dp_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 10 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "rh_and_dp_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", 
"arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.rh_and_dp_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py deleted file mode 100644 index 005e5d9f4..000000000 --- a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_daily_summary/rh_and_dp_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.rh_and_dp_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 10 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="rh_and_dp_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_RH_DP_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/rh_and_dp_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.rh_and_dp_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/pipeline.yaml deleted file mode 100644 index b309050de..000000000 --- a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "rh_and_dp_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: rh_and_dp_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 11 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "rh_and_dp_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - 
"state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.rh_and_dp_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py deleted file mode 100644 index 100f89acf..000000000 --- a/datasets/epa_historical_air_quality/pipelines/rh_and_dp_hourly_summary/rh_and_dp_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.rh_and_dp_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 11 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="rh_and_dp_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_RH_DP_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/rh_and_dp_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.rh_and_dp_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime 
meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/pipeline.yaml deleted file mode 100644 index 88c3c3d5d..000000000 --- a/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "so2_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: so2_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 11 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "so2_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.so2_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/so2_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/so2_daily_summary_dag.py deleted file mode 100644 index b08928853..000000000 --- a/datasets/epa_historical_air_quality/pipelines/so2_daily_summary/so2_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.so2_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 11 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="so2_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_42401_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/so2_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.so2_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/pipeline.yaml deleted file mode 100644 index c7153273e..000000000 --- a/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "so2_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: so2_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 12 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "so2_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", 
"county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.so2_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/so2_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/so2_hourly_summary_dag.py deleted file mode 100644 index c0bf10101..000000000 --- a/datasets/epa_historical_air_quality/pipelines/so2_hourly_summary/so2_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.so2_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 12 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="so2_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_42401_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/so2_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.so2_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal 
degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/pipeline.yaml deleted file mode 100644 index 15b0b1c17..000000000 --- a/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "temperature_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: temperature_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 12 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "temperature_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": 
"float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.temperature_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/temperature_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/temperature_daily_summary_dag.py deleted file mode 100644 index d5d19b3fc..000000000 --- a/datasets/epa_historical_air_quality/pipelines/temperature_daily_summary/temperature_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.temperature_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 12 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="temperature_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_TEMP_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": 
"datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/temperature_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.temperature_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - 
"description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/pipeline.yaml deleted file mode 100644 index 7a224c293..000000000 --- a/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "temperature_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: temperature_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 13 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "temperature_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - 
"state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.temperature_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/temperature_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/temperature_hourly_summary_dag.py deleted file mode 100644 index 5fe48db6f..000000000 --- a/datasets/epa_historical_air_quality/pipelines/temperature_hourly_summary/temperature_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.temperature_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 13 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="temperature_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", 
"uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/temperature_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.temperature_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the 
prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/pipeline.yaml deleted file mode 100644 index 4ad68e602..000000000 --- a/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "voc_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: voc_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "30 13 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "voc_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.voc_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/voc_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/voc_daily_summary_dag.py deleted file mode 100644 index cba70d8f7..000000000 --- a/datasets/epa_historical_air_quality/pipelines/voc_daily_summary/voc_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.voc_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="30 13 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="voc_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_VOCS_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/voc_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.voc_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/pipeline.yaml deleted file mode 100644 index 2bccd8283..000000000 --- a/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "voc_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: voc_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 14 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "voc_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", "county_name": 
"str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.voc_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/voc_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/voc_hourly_summary_dag.py deleted file mode 100644 index abeedfb39..000000000 --- a/datasets/epa_historical_air_quality/pipelines/voc_hourly_summary/voc_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.voc_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 14 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="voc_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_VOCS_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": 
"float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/voc_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.voc_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal 
degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/pipeline.yaml deleted file mode 100644 index 8542472df..000000000 --- a/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/pipeline.yaml +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "wind_daily_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: wind_daily_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "0 15 * * *" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "wind_daily_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "750000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "sample_duration", - "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count", - "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi", - "method_code", "method_name", "local_site_name", "address", "state_name", - "county_name", "city_name", "cbsa_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str", - "pollutant_standard": "str", "date_local": "datetime64[ns]", "units_of_measure": "str", "event_type": "str", "observation_count": "int32", - "observation_percent": "float64", "arithmetic_mean": 
"float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str", - "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str", - "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.wind_daily_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." 
- "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." - "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "sample_duration" - "type": "STRING" - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour)." - "mode": "NULLABLE" - - "name": "pollutant_standard" - "type": "STRING" - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)" - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." 
- "mode": "NULLABLE" - - "name": "event_type" - "type": "STRING" - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor." - "mode": "NULLABLE" - - "name": "observation_count" - "type": "INTEGER" - "description": "The number of observations (samples) taken during the day." - "mode": "NULLABLE" - - "name": "observation_percent" - "type": "FLOAT" - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters)." - "mode": "NULLABLE" - - "name": "arithmetic_mean" - "type": "FLOAT" - "description": "The average (arithmetic mean) value for the day." - "mode": "NULLABLE" - - "name": "first_max_value" - "type": "FLOAT" - "description": "The highest value for the day." - "mode": "NULLABLE" - - "name": "first_max_hour" - "type": "INTEGER" - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken." - "mode": "NULLABLE" - - "name": "aqi" - "type": "INTEGER" - "description": "The Air Quality Index for the day for the pollutant, if applicable." 
- "mode": "NULLABLE" - - "name": "method_code" - "type": "INTEGER" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "local_site_name" - "type": "STRING" - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it." - "mode": "NULLABLE" - - "name": "address" - "type": "STRING" - "description": "The approximate street address of the monitoring site." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "city_name" - "type": "STRING" - "description": "The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas." - "mode": "NULLABLE" - - "name": "cbsa_name" - "type": "STRING" - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/wind_daily_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/wind_daily_summary_dag.py deleted file mode 100644 index 8d4193591..000000000 --- a/datasets/epa_historical_air_quality/pipelines/wind_daily_summary/wind_daily_summary_dag.py +++ /dev/null @@ -1,252 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.wind_daily_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="0 15 * * *", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="wind_daily_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/daily_WIND_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "750000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "sample_duration",\n "pollutant_standard", "date_local", "units_of_measure", "event_type", "observation_count",\n "observation_percent", "arithmetic_mean", "first_max_value", "first_max_hour", "aqi",\n "method_code", "method_name", "local_site_name", "address", "state_name",\n "county_name", "city_name", "cbsa_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "sample_duration": "str",\n "pollutant_standard": "str", "date_local": "datetime64[ns]", 
"units_of_measure": "str", "event_type": "str", "observation_count": "int32",\n "observation_percent": "float64", "arithmetic_mean": "float64", "first_max_value": "float64", "first_max_hour": "int32", "aqi": "str",\n "method_code": "str", "method_name": "str", "local_site_name": "str", "address": "str", "state_name": "str",\n "county_name": "str", "city_name": "str", "cbsa_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/wind_daily_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.wind_daily_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s 
angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "sample_duration", - "type": "STRING", - "description": "The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).", - "mode": "NULLABLE", - }, - { - "name": "pollutant_standard", - "type": "STRING", - "description": "A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. 
Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "event_type", - "type": "STRING", - "description": "Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.", - "mode": "NULLABLE", - }, - { - "name": "observation_count", - "type": "INTEGER", - "description": "The number of observations (samples) taken during the day.", - "mode": "NULLABLE", - }, - { - "name": "observation_percent", - "type": "FLOAT", - "description": "The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. 
This is only calculated for monitors where measurements are required (e.g., only certain parameters).", - "mode": "NULLABLE", - }, - { - "name": "arithmetic_mean", - "type": "FLOAT", - "description": "The average (arithmetic mean) value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_value", - "type": "FLOAT", - "description": "The highest value for the day.", - "mode": "NULLABLE", - }, - { - "name": "first_max_hour", - "type": "INTEGER", - "description": "The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.", - "mode": "NULLABLE", - }, - { - "name": "aqi", - "type": "INTEGER", - "description": "The Air Quality Index for the day for the pollutant, if applicable.", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "INTEGER", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "local_site_name", - "type": "STRING", - "description": "The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.", - "mode": "NULLABLE", - }, - { - "name": "address", - "type": "STRING", - "description": "The approximate street address of the monitoring site.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "city_name", - "type": "STRING", - "description": "The name of the city where the 
monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.", - "mode": "NULLABLE", - }, - { - "name": "cbsa_name", - "type": "STRING", - "description": "The name of the core bases statistical area (metropolitan area) where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/pipeline.yaml b/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/pipeline.yaml deleted file mode 100644 index 8a6e18a64..000000000 --- a/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/pipeline.yaml +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -resources: - - - type: bigquery_table - table_id: "wind_hourly_summary" - description: "epaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: wind_hourly_summary - default_args: - owner: "Google" - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - - operator: "KubernetesPodOperator" - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - name: "wind_hourly_summary" - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - image: "{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}" - env_vars: - SOURCE_URL: "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip" - START_YEAR: "1990" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - CHUNKSIZE: "2500000" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" - DATA_NAMES: >- - [ "state_code", "county_code", "site_num", "parameter_code", "poc", - "latitude", "longitude", "datum", "parameter_name", "date_local", - "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure", - "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name", - "state_name", "county_name", "date_of_last_change" ] - DATA_DTYPES: >- - { "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32", - "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]", - "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str", - "mdl": "float64", "uncertainty": "float64", "qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str", - "state_name": "str", "county_name": 
"str", "date_of_last_change": "datetime64[ns]" } - resources: - request_memory: "8G" - request_cpu: "3" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - bucket: "{{ var.value.composer_bucket }}" - source_objects: ["data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "{{ var.json.epa_historical_air_quality.destination_tables.wind_hourly_summary }}" - skip_leading_rows: 1 - allow_quoted_newlines: True - write_disposition: "WRITE_TRUNCATE" - schema_fields: - - "name": "state_code" - "type": "STRING" - "description": "The FIPS code of the state in which the monitor resides." - "mode": "NULLABLE" - - "name": "county_code" - "type": "STRING" - "description": "The FIPS code of the county in which the monitor resides." - "mode": "NULLABLE" - - "name": "site_num" - "type": "STRING" - "description": "A unique number within the county identifying the site." - "mode": "NULLABLE" - - "name": "parameter_code" - "type": "INTEGER" - "description": "The AQS code corresponding to the parameter measured by the monitor." - "mode": "NULLABLE" - - "name": "poc" - "type": "INTEGER" - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site." - "mode": "NULLABLE" - - "name": "latitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees." - "mode": "NULLABLE" - - "name": "longitude" - "type": "FLOAT" - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal degrees." - "mode": "NULLABLE" - - "name": "datum" - "type": "STRING" - "description": "The Datum associated with the Latitude and Longitude measures." 
- "mode": "NULLABLE" - - "name": "parameter_name" - "type": "STRING" - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants." - "mode": "NULLABLE" - - "name": "date_local" - "type": "TIMESTAMP" - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor." - "mode": "NULLABLE" - - "name": "time_local" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time." - "mode": "NULLABLE" - - "name": "date_gmt" - "type": "TIMESTAMP" - "description": "The calendar date of the sample in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "time_gmt" - "type": "STRING" - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time." - "mode": "NULLABLE" - - "name": "sample_measurement" - "type": "FLOAT" - "description": "The measured value in the standard units of measure for the parameter." - "mode": "NULLABLE" - - "name": "units_of_measure" - "type": "STRING" - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations." - "mode": "NULLABLE" - - "name": "mdl" - "type": "FLOAT" - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL." - "mode": "NULLABLE" - - "name": "uncertainty" - "type": "FLOAT" - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency." 
- "mode": "NULLABLE" - - "name": "qualifier" - "type": "STRING" - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field." - "mode": "NULLABLE" - - "name": "method_type" - "type": "STRING" - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method)." - "mode": "NULLABLE" - - "name": "method_code" - "type": "STRING" - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column." - "mode": "NULLABLE" - - "name": "method_name" - "type": "STRING" - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample." - "mode": "NULLABLE" - - "name": "state_name" - "type": "STRING" - "description": "The name of the state where the monitoring site is located." - "mode": "NULLABLE" - - "name": "county_name" - "type": "STRING" - "description": "The name of the county where the monitoring site is located." - "mode": "NULLABLE" - - "name": "date_of_last_change" - "type": "TIMESTAMP" - "description": "The date the last time any numeric values in this record were updated in the AQS data system." 
- "mode": "NULLABLE" - - graph_paths: - - "transform_csv >> load_to_bq" diff --git a/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/wind_hourly_summary_dag.py b/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/wind_hourly_summary_dag.py deleted file mode 100644 index 866090954..000000000 --- a/datasets/epa_historical_air_quality/pipelines/wind_hourly_summary/wind_hourly_summary_dag.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="epa_historical_air_quality.wind_hourly_summary", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="wind_hourly_summary", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.epa_historical_air_quality.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://aqs.epa.gov/aqsweb/airdata/hourly_WIND_YEAR_ITERATOR.zip", - "START_YEAR": "1990", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "CHUNKSIZE": "2500000", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv", - "DATA_NAMES": '[ "state_code", "county_code", "site_num", "parameter_code", "poc",\n "latitude", "longitude", "datum", "parameter_name", "date_local",\n "time_local", "date_gmt", "time_gmt", "sample_measurement", "units_of_measure",\n "mdl", "uncertainty", "qualifier", "method_type", "method_code", "method_name",\n "state_name", "county_name", "date_of_last_change" ]', - "DATA_DTYPES": '{ "state_code": "str", "county_code": "str", "site_num": "str", "parameter_code": "int32", "poc": "int32",\n "latitude": "float64", "longitude": "float64", "datum": "str", "parameter_name": "str", "date_local": "datetime64[ns]",\n "time_local": "str", "date_gmt": "datetime64[ns]", "time_gmt": "str", "sample_measurement": "float64", "units_of_measure": "str",\n "mdl": "float64", "uncertainty": "float64", 
"qualifier": "str", "method_type": "str", "method_code": "int32", "method_name": "str",\n "state_name": "str", "county_name": "str", "date_of_last_change": "datetime64[ns]" }', - }, - resources={ - "request_memory": "8G", - "request_cpu": "3", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/epa_historical_air_quality/wind_hourly_summary/files/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="{{ var.json.epa_historical_air_quality.destination_tables.wind_hourly_summary }}", - skip_leading_rows=1, - allow_quoted_newlines=True, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "state_code", - "type": "STRING", - "description": "The FIPS code of the state in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "county_code", - "type": "STRING", - "description": "The FIPS code of the county in which the monitor resides.", - "mode": "NULLABLE", - }, - { - "name": "site_num", - "type": "STRING", - "description": "A unique number within the county identifying the site.", - "mode": "NULLABLE", - }, - { - "name": "parameter_code", - "type": "INTEGER", - "description": "The AQS code corresponding to the parameter measured by the monitor.", - "mode": "NULLABLE", - }, - { - "name": "poc", - "type": "INTEGER", - "description": "This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.", - "mode": "NULLABLE", - }, - { - "name": "latitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance north of the equator measured in decimal degrees.", - "mode": "NULLABLE", - }, - { - "name": "longitude", - "type": "FLOAT", - "description": "The monitoring site’s angular distance east of the prime meridian measured in decimal 
degrees.", - "mode": "NULLABLE", - }, - { - "name": "datum", - "type": "STRING", - "description": "The Datum associated with the Latitude and Longitude measures.", - "mode": "NULLABLE", - }, - { - "name": "parameter_name", - "type": "STRING", - "description": "The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.", - "mode": "NULLABLE", - }, - { - "name": "date_local", - "type": "TIMESTAMP", - "description": "The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.", - "mode": "NULLABLE", - }, - { - "name": "time_local", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Local Standard Time.", - "mode": "NULLABLE", - }, - { - "name": "date_gmt", - "type": "TIMESTAMP", - "description": "The calendar date of the sample in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "time_gmt", - "type": "STRING", - "description": "The time of day that sampling began on a 24-hour clock in Greenwich Mean Time.", - "mode": "NULLABLE", - }, - { - "name": "sample_measurement", - "type": "FLOAT", - "description": "The measured value in the standard units of measure for the parameter.", - "mode": "NULLABLE", - }, - { - "name": "units_of_measure", - "type": "STRING", - "description": "The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.", - "mode": "NULLABLE", - }, - { - "name": "mdl", - "type": "FLOAT", - "description": "The Method Detection Limit. The minimum sample concentration detectable for the monitor and method. 
Note: if samples are reported below this level, they may have been replaced by 1/2 the MDL.", - "mode": "NULLABLE", - }, - { - "name": "uncertainty", - "type": "FLOAT", - "description": "The total measurement uncertainty associated with a reported measurement as indicated by the reporting agency.", - "mode": "NULLABLE", - }, - { - "name": "qualifier", - "type": "STRING", - "description": "Sample values may have qualifiers that indicate why they are missing or that they are out of the ordinary. Types of qualifiers are: null data, exceptional event, natural events, and quality assurance. The highest ranking qualifier, if any, is described in this field.", - "mode": "NULLABLE", - }, - { - "name": "method_type", - "type": "STRING", - "description": "An indication of whether the method used to collect the data is a federal reference method (FRM), equivalent to a federal reference method, an approved regional method, or none of the above (non-federal reference method).", - "mode": "NULLABLE", - }, - { - "name": "method_code", - "type": "STRING", - "description": "An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. 
The method name is in the next column.", - "mode": "NULLABLE", - }, - { - "name": "method_name", - "type": "STRING", - "description": "A short description of the processes, equipment, and protocols used in gathering and measuring the sample.", - "mode": "NULLABLE", - }, - { - "name": "state_name", - "type": "STRING", - "description": "The name of the state where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "county_name", - "type": "STRING", - "description": "The name of the county where the monitoring site is located.", - "mode": "NULLABLE", - }, - { - "name": "date_of_last_change", - "type": "TIMESTAMP", - "description": "The date the last time any numeric values in this record were updated in the AQS data system.", - "mode": "NULLABLE", - }, - ], - ) - - transform_csv >> load_to_bq