<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# @title Configure the chapter
# Almanac edition and chapter whose SQL queries are exported.
year = 2024 #@param {type: "integer"}
chapter = "privacy" #@param {type: "string"}

# BigQuery project ID.
GCP_PROJECT = "httparchive" #@param {type: "string"}

# Git branch that holds this chapter's queries.
branch_name = f"{chapter.lower()}-sql-{year}"

# Glob pattern matching the chapter's SQL files inside the cloned repository.
sql_glob_template = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'
folder = sql_glob_template.format(year=year, chapter=chapter.lower())

# Title used when a new Google Sheets spreadsheet has to be created.
spreadsheet_name = f"{chapter.capitalize()} (Web Almanac {year})"

# URL of an existing spreadsheet to reuse, or `None` to create a new one.
existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: "string"}

In [None]:
# @title Download repo
# Clone the Almanac repository with the chapter branch checked out.
# IPython substitutes `$branch_name` with the value of the Python variable
# `branch_name` before handing the line to the shell.
!git clone -b $branch_name https://github.com/HTTPArchive/almanac.httparchive.org.git

In [12]:
# @title Update local branch (if new commits)
# Inside the existing clone, switch to the chapter branch and pull its latest
# commits. The `&&` chain stops at the first command that fails.
!cd almanac.httparchive.org/ && git checkout $branch_name && git pull

Already on 'privacy-sql-2024'
Your branch is up to date with 'origin/privacy-sql-2024'.
Already up to date.


In [13]:
# @title Authenticate
import google.auth
import os
from google.colab import auth
from google.cloud import bigquery

import gspread
from gspread_dataframe import set_with_dataframe

# Export the project ID before the credentials and the BigQuery client are
# created, then authenticate the Colab user and build both API clients.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT
auth.authenticate_user()
credentials, project = google.auth.default()
client = bigquery.Client()
gc = gspread.authorize(credentials)

# Reuse the configured spreadsheet when a URL is given; otherwise (or when it
# cannot be opened) create a new one. The failure reason is reported instead of
# being swallowed, and only `Exception` is caught so that interrupting the
# kernel is not mistaken for "spreadsheet not found".
ss = None
if existing_spreadsheet_url:
  try:
    ss = gc.open_by_url(existing_spreadsheet_url)
    print(f'Using existing spreadsheet: {ss.url}')
  except Exception as e:
    print(
      f'Could not open {existing_spreadsheet_url!r} '
      f'({type(e).__name__}: {e}); creating a new spreadsheet instead.'
    )
if ss is None:
  ss = gc.create(spreadsheet_name)
  print(f'Created a new spreadsheet: {spreadsheet_name}: {ss.url}')

# Titles of the worksheets already present in the spreadsheet.
existing_sheets = [s.title for s in ss.worksheets()]

Using existing spreadsheet: https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg


In [34]:
# @title Upload query results
import glob
import re
from tabulate import tabulate
from IPython.display import clear_output


# File-name filters: a file is processed only when its name matches the include
# pattern and does not match the exclude pattern.
filename_include = '^.+sql$' # @param {type: "raw", placeholder: "Enter regexp"}
filename_exclude = '^(common_ads_variables|top_direct_sellers).sql$' # @param {type: "raw", placeholder: "Enter regexp"}
filename_include_regexp = str(filename_include)
filename_exclude_regexp = str(filename_exclude)

# dry_run: only estimate the bytes each query would process; run nothing.
# overwrite_sheets: replace a worksheet whose title already exists.
# max_tb_billed: skip queries whose estimate reaches this many TB (`None` disables).
dry_run = True # @param {type: "boolean"}
overwrite_sheets = False # @param {type: "boolean"}
max_tb_billed = None # @param {type: "raw", placeholder: "Insert a number or `None` to disable"}

# Print formatted logs
queries_processed_log = []
def print_logs_table(log=None, append=True):
    """Redraw the processing log as a grid table in the cell output.

    Args:
        log: Optional row `[query name, TB, sheet name, skip reason]` to show.
        append: When True the row is stored in `queries_processed_log`; when
            False it is rendered once as a transient row and not stored.

    Calling with no `log` only redraws the stored rows. Stored rows are never
    removed, including when `append=False` is passed without a `log`.
    """
    rows = queries_processed_log
    if log:
        if append:
            queries_processed_log.append(log)
        else:
            # Transient row: render a copy so the stored log stays untouched.
            rows = queries_processed_log + [log]
    table = tabulate(rows, headers=['Query name', 'TB processes/billed', 'Sheet name', 'Skip reason'], tablefmt="grid")
    # Replace the previous table rather than stacking a new one below it.
    clear_output(wait=True)
    print(table)


# Find matching SQL queries and save results to Google Sheets.
# Each file ends up as exactly one row in the log table: either a skip/error
# reason, or the billed TB together with the worksheet that was written.
BYTES_PER_TB = 1024 ** 4

# Sorted so that the processing order (and the log table) is deterministic.
for filepath in sorted(glob.glob(folder)):
    filename = filepath.split('/')[-1]

    # Transient "in progress" row; it is displayed but not stored in the log.
    print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)

    name_matches = (
        re.search(filename_include_regexp, filename)
        and not re.search(filename_exclude_regexp, filename)
    )
    if not name_matches:
        print_logs_table([filename, None, None, 'Filename mismatch'])
        continue

    with open(filepath) as f:
        query = f.read()

    # A dry run validates the query and reports how many bytes it would process.
    try:
        response = client.query(
            query,
            job_config = bigquery.QueryJobConfig(dry_run = True)
        )
    except Exception as e:
        print_logs_table([filename, None, None, f'DryRun Query error:\n{e}'])
        continue

    tb_processed = response.total_bytes_processed / BYTES_PER_TB

    if dry_run:
        print_logs_table([filename, f'{tb_processed:.3f}', None, 'Dry run'])
        continue

    if max_tb_billed and max_tb_billed <= tb_processed:
        print_logs_table([filename, f'{tb_processed:.3f}', None, 'Data Volume'])
        continue

    sheet_title = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()
    if sheet_title in existing_sheets and not overwrite_sheets:
        print_logs_table([filename, f'{tb_processed:.3f}', None, 'Sheet already exists'])
        continue

    # Only set a billing cap when one is configured: with `max_tb_billed = None`
    # the multiplication would raise TypeError and fail every query.
    job_config = bigquery.QueryJobConfig()
    if max_tb_billed:
        job_config.maximum_bytes_billed = int(max_tb_billed * BYTES_PER_TB)

    # Fetching the rows is part of the guarded section because failures can
    # surface while waiting for results, not only when the job is submitted.
    try:
        response = client.query(query, job_config = job_config)
        df = response.to_dataframe()
    except Exception as e:
        print_logs_table([filename, None, None, f'Query error:\n{e}'])
        continue

    tb_billed = (response.total_bytes_billed or 0) / BYTES_PER_TB
    rows, cols = df.shape

    try:
        # Remove the old worksheet only now that fresh results are in hand, so
        # a failed query never costs the previously exported data.
        if sheet_title in existing_sheets:
            ss.del_worksheet(ss.worksheet(sheet_title))
            existing_sheets.remove(sheet_title)

        # Clamp to 1x1 for empty results.
        # NOTE(review): assumes Sheets rejects a 0-sized grid - confirm.
        st = ss.add_worksheet(title = sheet_title, rows = max(rows, 1), cols = max(cols, 1))
        # Keep the cached title list in sync for later files and cell re-runs.
        existing_sheets.append(sheet_title)
        set_with_dataframe(st, df)
    except Exception as e:
        print_logs_table([filename, f'{tb_billed:.3f}', None, f'Sheets error:\n{e}'])
        continue

    # Column order: query name, TB billed, sheet name, skip reason (none).
    print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])

+---------------------------------------------------------------------------+-----------------------+--------------+------------------------------------------------------------------------------------------------------------------------------------+
| Query name                                                                |   TB processes/billed | Sheet name   | Skip reason                                                                                                                        |
| number_of_privacy_sandbox_attested_domains.sql                            |                       |              | Query error:                                                                                                                       |
|                                                                           |                       |              | 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/httparchive/jobs?prettyPrint=false: Unrecognized name: rws at [23:5] |
