diff --git a/.github/linters/.flake8 b/.github/linters/.flake8 new file mode 100644 index 00000000..a6578a30 --- /dev/null +++ b/.github/linters/.flake8 @@ -0,0 +1,2 @@ +[flake8] +extend-ignore = E501 diff --git a/.github/linters/.sqlfluff b/.github/linters/.sqlfluff new file mode 100644 index 00000000..6494da04 --- /dev/null +++ b/.github/linters/.sqlfluff @@ -0,0 +1,24 @@ +[sqlfluff] +dialect = bigquery +max_line_length = 200 + +[sqlfluff:indentation] +indent_unit = space +tab_space_size = 2 +indented_using_on = False +allow_implicit_indents = True + +[sqlfluff:layout:type:binary_operator] +line_position = trailing + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper + +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = upper + +[sqlfluff:rules:convention.count_rows] +prefer_count_0 = True + +[sqlfluff:rules:convention.quoted_literals] +preferred_quoted_literal_style = single_quotes diff --git a/infra/bigquery-export/package-lock.json b/infra/bigquery-export/package-lock.json index a0d3c67f..2414e97e 100644 --- a/infra/bigquery-export/package-lock.json +++ b/infra/bigquery-export/package-lock.json @@ -697,9 +697,9 @@ } }, "node_modules/glob": { - "version": "10.4.5", - "resolved": "https://registry.npmjs.org/glob/-/glob-10.4.5.tgz", - "integrity": "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==", + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-10.5.0.tgz", + "integrity": "sha512-DfXN8DfhJ7NH3Oe7cFmu3NCu1wKbkReJ8TorzSAFbSKrlNaQSKfIzqYqVY8zlbs2NLBbWpRiU52GX2PbaBVNkg==", "license": "ISC", "dependencies": { "foreground-child": "^3.1.0", diff --git a/workspace/project_options.sql b/workspace/project_options.sql index 0bace374..356ebcd9 100644 --- a/workspace/project_options.sql +++ b/workspace/project_options.sql @@ -1,4 +1,10 @@ -SELECT * FROM `httparchive.region-us.INFORMATION_SCHEMA.EFFECTIVE_PROJECT_OPTIONS`; +SELECT + project_id, + option_name, + option_value, + option_type + +FROM `httparchive.region-us.INFORMATION_SCHEMA.EFFECTIVE_PROJECT_OPTIONS`; ALTER PROJECT httparchive SET OPTIONS ( `region-us.default_sql_dialect_option` = 'only_google_sql', diff --git a/workspace/restore_data.ipynb b/workspace/restore_data.ipynb index 7acf6343..9dc5d268 100644 --- a/workspace/restore_data.ipynb +++ b/workspace/restore_data.ipynb @@ -1,5 +1,18 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e6f1073c", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Utilities for restoring deleted BigQuery datasets and tables.\"\"\"\n", + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client()\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -10,16 +23,11 @@ }, "outputs": [], "source": [ - "# sql_engine: bigquery\n", - "# output_variable: df\n", - "# start _sql\n", - "_sql = \"\"\"\n", + "QUERY = \"\"\"\n", "## [Restore deleted dataset](https://docs.cloud.google.com/bigquery/docs/restore-deleted-datasets#restore_a_dataset)\n", "UNDROP SCHEMA httparchive.crawl;\n", - "\"\"\" # end _sql\n", - "from google.colab.sql import bigquery as _bqsqlcell\n", - "df = _bqsqlcell.run(_sql)\n", - "df" + "\"\"\"\n", + "client.query(QUERY).result()" ] }, { @@ -31,9 +39,10 @@ }, "outputs": [], "source": [ - "## [Restore deleted table](https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table)\n", + "# [Restore deleted table]\n", + "# https://docs.cloud.google.com/bigquery/docs/restore-deleted-tables#restore_a_table\n", "!date -d '2025-08-04 16:00:00.000000Z' +%s000\n", - "!bq cp httparchive.crawl.pages@1759670400000 httparchive.crawl_staging.pages_restored_20250804" + "!bq cp httparchive.crawl.pages@1759670400000 httparchive.crawl_staging.pages_restored_20250804\n" ] }, { @@ -46,19 +55,14 @@ }, "outputs": [], "source": [ - "# sql_engine: bigquery\n", - "# output_variable: df\n", - "# start _sql\n", - "_sql = \"\"\"\n", + "QUERY = \"\"\"\n", "## [Restore a table to a specific point in time](https://cloud.google.com/bigquery/docs/restore-tables#restoring_a_table_to_a_specific_point_in_time)\n", "CREATE TABLE httparchive.crawl_staging.pages_restored_20250804 AS\n", "SELECT *\n", "FROM httparchive.crawl.pages\n", " FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR);\n", - "\"\"\" # end _sql\n", - "from google.colab.sql import bigquery as _bqsqlcell\n", - "df = _bqsqlcell.run(_sql)\n", - "df" + "\"\"\"\n", + "client.query(QUERY).result()" ] } ],