From 30d99c0f91c306667abdd4d1bb9ebf4bda6afc0f Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 9 May 2025 18:53:35 +0200 Subject: [PATCH 1/3] snapshots for f1 --- definitions/output/f1/pages_latest.js | 73 +++++++++++++++++++++ definitions/output/f1/requests_latest.js | 32 +++++++++ definitions/output/sample_data/pages_10k.js | 2 +- 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 definitions/output/f1/pages_latest.js create mode 100644 definitions/output/f1/requests_latest.js diff --git a/definitions/output/f1/pages_latest.js b/definitions/output/f1/pages_latest.js new file mode 100644 index 00000000..18a8bc00 --- /dev/null +++ b/definitions/output/f1/pages_latest.js @@ -0,0 +1,73 @@ +publish('pages_latest', { + type: 'table', + schema: 'f1', + description: 'The latest date from the crawl.pages table', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'page'], + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +SET @@RESERVATION='projects/httparchive/locations/US/reservations/enterprise'; +`).query(ctx => ` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + wptid, + TO_JSON_STRING(payload) AS payload, + TO_JSON_STRING(summary) AS summary, + STRUCT< + a11y STRING, + cms STRING, + cookies STRING, + css_variables STRING, + ecommerce STRING, + element_count STRING, + javascript STRING, + markup STRING, + media STRING, + origin_trials STRING, + performance STRING, + privacy STRING, + responsive_images STRING, + robots_txt STRING, + security STRING, + structured_data STRING, + third_parties STRING, + well_known STRING, + wpt_bodies STRING, + other STRING + > ( + TO_JSON_STRING(custom_metrics.a11y), + TO_JSON_STRING(custom_metrics.cms), + TO_JSON_STRING(custom_metrics.cookies), + TO_JSON_STRING(custom_metrics.css_variables), + TO_JSON_STRING(custom_metrics.ecommerce), + TO_JSON_STRING(custom_metrics.element_count), + TO_JSON_STRING(custom_metrics.javascript), + TO_JSON_STRING(custom_metrics.markup), + TO_JSON_STRING(custom_metrics.media), + TO_JSON_STRING(custom_metrics.origin_trials), + TO_JSON_STRING(custom_metrics.performance), + TO_JSON_STRING(custom_metrics.privacy), + TO_JSON_STRING(custom_metrics.responsive_images), + TO_JSON_STRING(custom_metrics.robots_txt), + TO_JSON_STRING(custom_metrics.security), + TO_JSON_STRING(custom_metrics.structured_data), + TO_JSON_STRING(custom_metrics.third_parties), + TO_JSON_STRING(custom_metrics.well_known), + TO_JSON_STRING(custom_metrics.wpt_bodies), + TO_JSON_STRING(custom_metrics.other) + ) AS custom_metrics, + TO_JSON_STRING(lighthouse) AS lighthouse, + features, + technologies, + TO_JSON_STRING(metadata) AS metadata +FROM ${ctx.ref('crawl', 'pages')} +WHERE + date = '${constants.currentMonth}' +`) diff --git a/definitions/output/f1/requests_latest.js b/definitions/output/f1/requests_latest.js new file mode 100644 index 00000000..ca60a107 --- /dev/null +++ b/definitions/output/f1/requests_latest.js @@ -0,0 +1,32 @@ +publish('requests_latest', { + type: 'table', + schema: 'f1', + description: 'The latest date from the crawl.requests table', + bigquery: { + partitionBy: 'date', + clusterBy: ['client', 'is_root_page', 'rank', 'type'], + }, + tags: ['crawl_complete'] +}).preOps(ctx => ` +SET @@RESERVATION='projects/httparchive/locations/US/reservations/enterprise'; +`).query(ctx => ` +SELECT + date, + client, + page, + is_root_page, + root_page, + rank, + url, + is_main_document, + type, + index, + TO_JSON_STRING(payload) AS payload, + TO_JSON_STRING(summary) AS summary, + request_headers, + response_headers, + response_body +FROM ${ctx.ref('crawl', 'requests')} +WHERE + date = '${constants.currentMonth}' +`) diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js index 8f2eab50..1d30eb16 100644 --- a/definitions/output/sample_data/pages_10k.js +++ b/definitions/output/sample_data/pages_10k.js @@ -3,7 +3,7 @@ publish('pages_10k', { schema: 'sample_data', bigquery: { partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'rank'] + clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, tags: ['crawl_complete'] }).query(ctx => ` From 2fd918aa2f1b3849448d81e172c6abc170abd9dc Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 9 May 2025 18:57:12 +0200 Subject: [PATCH 2/3] lint --- definitions/output/f1/pages_latest.js | 2 +- definitions/output/f1/requests_latest.js | 2 +- definitions/output/sample_data/pages_10k.js | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/definitions/output/f1/pages_latest.js b/definitions/output/f1/pages_latest.js index 18a8bc00..07a219b8 100644 --- a/definitions/output/f1/pages_latest.js +++ b/definitions/output/f1/pages_latest.js @@ -4,7 +4,7 @@ publish('pages_latest', { description: 'The latest date from the crawl.pages table', bigquery: { partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'rank', 'page'], + clusterBy: ['client', 'is_root_page', 'rank', 'page'] }, tags: ['crawl_complete'] }).preOps(ctx => ` diff --git a/definitions/output/f1/requests_latest.js b/definitions/output/f1/requests_latest.js index ca60a107..c9d12c74 100644 --- a/definitions/output/f1/requests_latest.js +++ b/definitions/output/f1/requests_latest.js @@ -4,7 +4,7 @@ publish('requests_latest', { description: 'The latest date from the crawl.requests table', bigquery: { partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'rank', 'type'], + clusterBy: ['client', 'is_root_page', 'rank', 'type'] }, tags: ['crawl_complete'] }).preOps(ctx => ` diff --git a/definitions/output/sample_data/pages_10k.js b/definitions/output/sample_data/pages_10k.js index 1d30eb16..0d8b67e6 100644 --- a/definitions/output/sample_data/pages_10k.js +++ b/definitions/output/sample_data/pages_10k.js @@ -9,6 +9,7 @@ publish('pages_10k', { }).query(ctx => ` SELECT * FROM ${ctx.ref('crawl', 'pages')} -WHERE date = '${constants.currentMonth}' AND - rank <= 10000 +WHERE + date = '${constants.currentMonth}' AND + rank <= 10000 `) From 97ffa12c62a1d91cc3a7324a60ecdf125a26e28e Mon Sep 17 00:00:00 2001 From: Max Ostapenko <1611259+max-ostapenko@users.noreply.github.com> Date: Fri, 9 May 2025 18:57:23 +0200 Subject: [PATCH 3/3] fix clusters --- definitions/output/sample_data/requests_10k.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/definitions/output/sample_data/requests_10k.js b/definitions/output/sample_data/requests_10k.js index c16e748b..0fcf1192 100644 --- a/definitions/output/sample_data/requests_10k.js +++ b/definitions/output/sample_data/requests_10k.js @@ -3,12 +3,13 @@ publish('requests_10k', { schema: 'sample_data', bigquery: { partitionBy: 'date', - clusterBy: ['client', 'is_root_page', 'is_main_document', 'type'] + clusterBy: ['client', 'is_root_page', 'rank', 'type'] }, tags: ['crawl_complete'] }).query(ctx => ` SELECT * FROM ${ctx.ref('crawl', 'requests')} -WHERE date = '${constants.currentMonth}' AND - rank <= 10000 +WHERE + date = '${constants.currentMonth}' AND + rank <= 10000 `)