<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Almanac chapter and edition this notebook run targets.
CHAPTER = "privacy"
YEAR = "2024"

# BigQuery project used for the queries.
GCP_PROJECT = "httparchive"

# Git branch holding the chapter's SQL, e.g. "privacy-sql-2024".
BRANCH_NAME = f"{CHAPTER}-sql-{YEAR}"

# Glob pattern (relative to the working directory) matching the chapter's
# .sql files inside the cloned repository.
folder = f"almanac.httparchive.org/sql/{YEAR}/{CHAPTER}/*.sql"

# Title used when a new Google Sheets spreadsheet has to be created.
spreadsheet_name = f"{CHAPTER.capitalize()} (Web Almanac {YEAR})"

# URL of an existing spreadsheet to write into, or `None` to create a new one.
existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0/edit'

In [2]:
# Download repo
!git clone -b $BRANCH_NAME https://github.com/HTTPArchive/almanac.httparchive.org.git

Cloning into 'almanac.httparchive.org'...
remote: Enumerating objects: 43942, done.
remote: Counting objects: 100% (5935/5935), done.
remote: Compressing objects: 100% (1535/1535), done.
remote: Total 43942 (delta 4709), reused 4950 (delta 4391), pack-reused 38007
Receiving objects: 100% (43942/43942), 384.14 MiB | 29.81 MiB/s, done.
Resolving deltas: 100% (29622/29622), done.
Updating files: 100% (5472/5472), done.


In [3]:
# Update local branch
!cd almanac.httparchive.org/ && git checkout $BRANCH_NAME && git pull

Already on 'privacy-sql-2024'
Your branch is up to date with 'origin/privacy-sql-2024'.
Already up to date.


In [4]:
# Authenticate
# Runs the Colab user authentication and builds the two API clients used by
# the later cells: `client` (BigQuery) and `gc` (Google Sheets via gspread).
import google.auth
import os
from google.colab import auth
from google.cloud import bigquery

import gspread
from gspread_dataframe import set_with_dataframe

# Export the project id before any credentials or clients are created.
# NOTE(review): presumably google.auth.default() and bigquery.Client() read the
# project from this environment variable — confirm.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT
auth.authenticate_user()
# `project` is not used anywhere else in the visible cells.
credentials, project = google.auth.default()
# No explicit project/credentials are passed here; the client relies on the
# environment configured above.
client = bigquery.Client()
# gspread reuses the credentials obtained from google.auth.default().
gc = gspread.authorize(credentials)

In [5]:
import glob
import re

# Build Sheets
# Open the spreadsheet at `existing_spreadsheet_url`; when no URL is configured
# (None/empty) or it cannot be opened, create a new spreadsheet named
# `spreadsheet_name` instead.
ss = None
if existing_spreadsheet_url:
    try:
        ss = gc.open_by_url(existing_spreadsheet_url)
        print('Using existing spreadsheet:', ss.url)
    except Exception as error:
        # Catch `Exception` rather than using a bare `except:` so that a
        # keyboard interrupt still stops the cell, and report why the open
        # failed (bad URL, missing permission, ...) before falling back.
        ss = None
        print('Could not open existing spreadsheet:', repr(error))

if ss is None:
    ss = gc.create(spreadsheet_name)
    print('Created a new spreadsheet:', spreadsheet_name, ss.url)

# Titles of the worksheets already present; used later to decide whether a
# result sheet would overwrite an existing one.
existing_sheets = [s.title for s in ss.worksheets()]

# Regex (used with re.search on the file name) selecting the query files to
# process. The dots are escaped so that ".sql" is matched literally instead of
# "any character followed by sql".
file_match_include = (
    r"number_of_websites_with_features_based_on_string_search\.sql"
    r"|number_of_websites_with_origin_trial_from_token\.sql"
)

# Regex selecting file names to skip even when they match the include pattern.
# "^$" only matches an empty name, i.e. nothing is excluded.
file_match_exclude = r"^$"

# When False, queries whose worksheet already exists are skipped; when True the
# existing worksheet is replaced.
overwrite = False
# When True, only the dry-run cost estimate is printed: no query is executed
# and no worksheet is written.
dry_run = True
# Queries whose dry-run estimate exceeds this many TiB are skipped.
tb_processed_limit = 0.1

# Find matching .sql queries in folder and save to google sheet.
#
# For every query file matched by `folder`:
#   1. skip it unless its name matches `file_match_include` and does not match
#      `file_match_exclude`;
#   2. estimate its cost with a BigQuery dry run and print the estimate;
#   3. stop there when `dry_run` is set, when the estimate exceeds
#      `tb_processed_limit`, or when the worksheet already exists and
#      `overwrite` is False;
#   4. otherwise run the query and write the result to a worksheet named after
#      the file.
#
# Paths are sorted so files are processed in the same order on every run
# (glob itself returns them in arbitrary order).
for filepath in sorted(glob.glob(folder)):
    filename = filepath.split('/')[-1]
    # "some_query-name.sql" -> "Some Query Name"
    sheet_title = re.sub(r"(\.sql|[^a-zA-Z0-9]+)", " ", filename).strip().title()

    is_included = re.search(file_match_include, filename)
    is_excluded = re.search(file_match_exclude, filename)
    if not is_included or is_excluded:
        print('Not Matched. Skipping:', sheet_title)
        continue

    print('Processing:', sheet_title)
    with open(filepath, encoding="utf-8") as f:
        query = f.read()

    # The estimate is always a dry run, independent of the `dry_run` option.
    response = client.query(
        query,
        job_config=bigquery.QueryJobConfig(dry_run=True)
    )

    # Bytes -> TiB (1024 ** 4 bytes).
    tb_processed = response.total_bytes_processed / 1024 ** 4
    print(f"Total Tb billed:{tb_processed:9.3f}")

    if dry_run:
        continue

    if tb_processed > tb_processed_limit:
        print('Data volume hit the limit. Skipping:', sheet_title)
        continue

    sheet_exists = sheet_title in existing_sheets
    if sheet_exists and not overwrite:
        print('Overwrite is False. Skipping:', sheet_title)
        continue

    # Run the query BEFORE touching the spreadsheet: if it fails, the
    # previously exported worksheet is left intact instead of being deleted.
    df = client.query(query).to_dataframe()
    rows, cols = df.shape

    if sheet_exists:
        ss.del_worksheet(ss.worksheet(sheet_title))

    # `rows` counts data rows only; add one for the header row that
    # set_with_dataframe writes by default, and never request an empty grid
    # (an empty result would otherwise ask for 0 rows / 0 columns).
    st = ss.add_worksheet(
        title=sheet_title,
        rows=max(rows + 1, 1),
        cols=max(cols, 1)
    )
    set_with_dataframe(st, df)

Using existing spreadsheet: https://docs.google.com/spreadsheets/d/1U6DTYxxhDWf-39Fr0o1Jq2r1RUVa4EbyxIZu-wqrso0
