From 6817d77f70cab393d567cf62eee21e97f50b08d4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:35:13 +0000 Subject: [PATCH 1/9] Bump glob from 10.4.5 to 10.5.0 in /infra/bigquery-export Bumps [glob](https://github.com/isaacs/node-glob) from 10.4.5 to 10.5.0. - [Changelog](https://github.com/isaacs/node-glob/blob/main/changelog.md) - [Commits](https://github.com/isaacs/node-glob/compare/v10.4.5...v10.5.0) --- updated-dependencies: - dependency-name: glob dependency-version: 10.5.0 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- infra/bigquery-export/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infra/bigquery-export/package-lock.json b/infra/bigquery-export/package-lock.json index a0d3c67f..2414e97e 100644 --- a/infra/bigquery-export/package-lock.json +++ b/infra/bigquery-export/package-lock.json @@ -697,9 +697,9 @@ } }, "node_modules/glob": { - "version": "10.4.5", - "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", - "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.5.0.tgz", + "integrity": "sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==", "license": "ISC", "dependencies": { "foreground-child": "^3.1.0", From cb08a28fd62e3cdeef91e398a4f9f01c010842de Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 20 Nov 2025 00:34:48 +0100 Subject: [PATCH 2/9] Add SQLFluff configuration for BigQuery dialect --- .github/linters/.sqlfluff | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/linters/.sqlfluff diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff new file mode 100644 index 00000000..902989ce --- /dev/null +++ b/.github/linters/.sqlfluff @@ -0,0 +1,2 @@ +[sqlfluff] +dialect = bigquery From 2057ff0b58a3cd619433e46ad0b796adff4cfa34 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 20 Nov 2025 00:41:39 +0100 Subject: [PATCH 3/9] Refactor restore_data notebook to streamline SQL execution and remove unnecessary comments --- workspace/restore_data.ipynb | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/workspace/restore_data.ipynb b/workspace/restore_data.ipynb index 7acf6343..1f87a543 100644 --- a/workspace/restore_data.ipynb +++ b/workspace/restore_data.ipynb @@ -10,16 +10,14 @@ }, "outputs": [], "source": [ - "# sql_engine: bigquery\n", - "# output_variable: df\n", - "# start _sql\n", - "_sql = \"\"\"\n", + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client()\n", + "sql = \"\"\"\n", "## [Restore deleted dataset](https://docs.cloud.google.com/bigquery/docs/restore-deleted-datasets#restore_a_dataset)\n", "UNDROP SCHEMA httparchive.crawl;\n", - "\"\"\" # end _sql\n", - "from google.colab.sql import bigquery as _bqsqlcell\n", - "df = _bqsqlcell.run(_sql)\n", - "df" + "\"\"\"\n", + "client.query(sql).result()" ] }, { @@ -46,19 +44,17 @@ }, "outputs": [], "source": [ - "# sql_engine: bigquery\n", - "# output_variable: df\n", - "# start _sql\n", - "_sql = \"\"\"\n", + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client()\n", + "sql = \"\"\"\n", "## [Restore a table to a specific point in time](https://cloud.google.com/bigquery/docs/restore-tables#restoring_a_table_to_a_specific_point_in_time)\n", "CREATE TABLE httparchive.crawl_staging.pages_restored_20250804 AS\n", "SELECT *\n", "FROM httparchive.crawl.pages\n", " FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR);\n", - "\"\"\" # end _sql\n", - "from google.colab.sql import bigquery as _bqsqlcell\n", - "df = _bqsqlcell.run(_sql)\n", - "df" + "\"\"\"\n", + "client.query(sql).result()" ] } ], From c3938e95086ad21aea4238ac7f8e52d2ee23cb84 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 20 Nov 2025 00:50:54 +0100 Subject: [PATCH 4/9] lint --- .github/linters/.sqlfluff | 7 +++++++ workspace/project_options.sql | 8 +++++++- workspace/restore_data.ipynb | 27 ++++++++++++++++----------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff index 902989ce..f0b313c4 100644 --- a/.github/linters/.sqlfluff +++ b/.github/linters/.sqlfluff @@ -1,2 +1,9 @@ [sqlfluff] dialect = bigquery + +[sqlfluff:layout] +max_line_length = 200 +tab_space_size = 2 +comma_style = leading +indent_unit = spaces +indent_width = 2 diff --git a/workspace/project_options.sql b/workspace/project_options.sql index 0bace374..f17644e9 100644 --- a/workspace/project_options.sql +++ b/workspace/project_options.sql @@ -1,4 +1,10 @@ -SELECT * FROM `httparchive.region-us.INFORMATION_SCHEMA.EFFECTIVE_PROJECT_OPTIONS`; +SELECT + project_id, + option_name, + option_value, + option_type, + +FROM `httparchive.region-us.INFORMATION_SCHEMA.EFFECTIVE_PROJECT_OPTIONS`; ALTER PROJECT httparchive SET OPTIONS ( `region-us.default_sql_dialect_option` = 'only_google_sql', diff --git a/workspace/restore_data.ipynb b/workspace/restore_data.ipynb index 1f87a543..c1c51480 100644 --- a/workspace/restore_data.ipynb +++ b/workspace/restore_data.ipynb @@ -1,5 +1,16 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e6f1073c", + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "client = bigquery.Client()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -10,14 +21,11 @@ }, "outputs": [], "source": [ - "from google.cloud import bigquery\n", - "\n", - "client = bigquery.Client()\n", - "sql = \"\"\"\n", + "QUERY = \"\"\"\n", "## [Restore deleted dataset](https://docs.cloud.google.com/bigquery/docs/restore-deleted-datasets#restore_a_dataset)\n", "UNDROP SCHEMA httparchive.crawl;\n", "\"\"\"\n", - "client.query(sql).result()" + "client.query(QUERY).result()" ] }, { @@ -29,7 +37,7 @@ }, "outputs": [], "source": [ - "## [Restore deleted table](https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table)\n", + "# [Restore deleted table](https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table)\n", "!date -d '2025-08-04 16:00:00.000000Z' +%s000\n", "!bq cp httparchive.crawl.pages@1759670400000 httparchive.crawl_staging.pages_restored_20250804" ] @@ -44,17 +52,14 @@ }, "outputs": [], "source": [ - "from google.cloud import bigquery\n", - "\n", - "client = bigquery.Client()\n", - "sql = \"\"\"\n", + "QUERY = \"\"\"\n", "## [Restore a table to a specific point in time](https://cloud.google.com/bigquery/docs/restore-tables#restoring_a_table_to_a_specific_point_in_time)\n", "CREATE TABLE httparchive.crawl_staging.pages_restored_20250804 AS\n", "SELECT *\n", "FROM httparchive.crawl.pages\n", " FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR);\n", "\"\"\"\n", - "client.query(sql).result()" + "client.query(QUERY).result()" ] } ], From 084d58f2dc177f1657089e103a117a89d0fb2fd2 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 20 Nov 2025 01:12:30 +0100 Subject: [PATCH 5/9] Refactor SQLFluff configuration for BigQuery dialect to improve indentation settings and layout structure --- .github/linters/.sqlfluff | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff index f0b313c4..123ebc05 100644 --- a/.github/linters/.sqlfluff +++ b/.github/linters/.sqlfluff @@ -1,9 +1,10 @@ [sqlfluff] dialect = bigquery - -[sqlfluff:layout] max_line_length = 200 + +[sqlfluff:indentation] +indent_unit = space tab_space_size = 2 -comma_style = leading -indent_unit = spaces -indent_width = 2 + +[sqlfluff:layout:type:comma] +line_position = leading From d27d24e224cd46e0d1f687cbc3e4b37b7626af8f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Thu, 20 Nov 2025 01:14:16 +0100 Subject: [PATCH 6/9] Improve documentation and formatting in restore_data notebook --- workspace/restore_data.ipynb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workspace/restore_data.ipynb b/workspace/restore_data.ipynb index c1c51480..4c546998 100644 --- a/workspace/restore_data.ipynb +++ b/workspace/restore_data.ipynb @@ -7,8 +7,9 @@ "metadata": {}, "outputs": [], "source": [ + "\"\"\"Utilities for restoring deleted BigQuery datasets and tables.\"\"\"\n", "from google.cloud import bigquery\n", - "client = bigquery.Client()" + "client = bigquery.Client()\n" ] }, { @@ -37,9 +38,10 @@ }, "outputs": [], "source": [ - "# [Restore deleted table](https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table)\n", + "# [Restore deleted table]\n", + "# https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table\n", "!date -d '2025-08-04 16:00:00.000000Z' +%s000\n", - "!bq cp httparchive.crawl.pages@1759670400000 httparchive.crawl_staging.pages_restored_20250804" + "!bq cp httparchive.crawl.pages@1759670400000 httparchive.crawl_staging.pages_restored_20250804\n" ] }, { From 3dc391e790703c572db2d804b62b2ec0eac9b494 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 24 Nov 2025 23:38:56 +0100 Subject: [PATCH 7/9] Remove unnecessary line breaks in SQL and clean up SQLFluff configuration --- .github/linters/.sqlfluff | 3 --- workspace/project_options.sql | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff index 123ebc05..b8880835 100644 --- a/.github/linters/.sqlfluff +++ b/.github/linters/.sqlfluff @@ -5,6 +5,3 @@ max_line_length = 200 [sqlfluff:indentation] indent_unit = space tab_space_size = 2 - -[sqlfluff:layout:type:comma] -line_position = leading diff --git a/workspace/project_options.sql b/workspace/project_options.sql index f17644e9..356ebcd9 100644 --- a/workspace/project_options.sql +++ b/workspace/project_options.sql @@ -2,8 +2,8 @@ SELECT project_id, option_name, option_value, - option_type, - + option_type + FROM `httparchive.region-us.INFORMATION_SCHEMA.EFFECTIVE_PROJECT_OPTIONS`; ALTER PROJECT httparchive SET OPTIONS ( From 65518f198b7db557ee6647b3d602ad9f917acbd6 Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 24 Nov 2025 23:41:32 +0100 Subject: [PATCH 8/9] Update SQLFluff configuration for improved linting rules and layout --- .github/linters/.sqlfluff | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff index b8880835..6494da04 100644 --- a/.github/linters/.sqlfluff +++ b/.github/linters/.sqlfluff @@ -5,3 +5,20 @@ max_line_length = 200 [sqlfluff:indentation] indent_unit = space tab_space_size = 2 +indented_using_on = False +allow_implicit_indents = True + +[sqlfluff:layout:type:binary_operator] +line_position = trailing + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper + +[sqlfluff:rules:convention.count_rows] +prefer_count_0 = True + +[sqlfluff:rules:convention.quoted_literals] +preferred_quoted_literal_style = single_quotes From e0bc3cc90ffae7f6f04493144b6b581842a4155a Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Mon, 24 Nov 2025 23:45:41 +0100 Subject: [PATCH 9/9] Add .flake8 configuration for flake8 linting and clean up restore_data notebook --- .github/linters/.flake8 | 2 ++ workspace/restore_data.ipynb | 1 + 2 files changed, 3 insertions(+) create mode 100644 .github/linters/.flake8 diff --git a/.github/linters/.flake8 b/.github/linters/.flake8 new file mode 100644 index 00000000..a6578a30 --- /dev/null +++ b/.github/linters/.flake8 @@ -0,0 +1,2 @@ +[flake8] +extend-ignore = E501 diff --git a/workspace/restore_data.ipynb b/workspace/restore_data.ipynb index 4c546998..9dc5d268 100644 --- a/workspace/restore_data.ipynb +++ b/workspace/restore_data.ipynb @@ -9,6 +9,7 @@ "source": [ "\"\"\"Utilities for restoring deleted BigQuery datasets and tables.\"\"\"\n", "from google.cloud import bigquery\n", + "\n", "client = bigquery.Client()\n" ] },