<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/main/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Configure the chapter to process
# The `#@param` annotations below are Colab form fields; the values can be edited
# either in the form or directly in this cell.

# Exported as the GOOGLE_CLOUD_PROJECT environment variable by the "Authenticate" cell.
GCP_PROJECT = 'httparchive' #@param {type: "string"}
# Year and chapter together select the git branch name and the folder of *.sql files to process.
almanac_year = 2025 #@param {type: "integer"}
# Lower-cased before use, so capitalisation here does not matter.
chapter_name = 'privacy' #@param {type: "string"}
# Spreadsheet that receives one worksheet per query; opened with gspread's open_by_url().
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: "string", placeholder:"Enter spreadsheet URL"}

In [None]:
# @title Download repo (skip when running locally)
# !git clone https://github.com/HTTPArchive/almanac.httparchive.org.git

In [38]:
# @title Update chapter branch (skip when running locally)
# The branch name is "<chapter>-sql-<year>", with the chapter name lower-cased.
branch_name = '-'.join([chapter_name.lower(), 'sql', str(almanac_year)])
print(f"Branch: {branch_name}")
# !cd almanac.httparchive.org/ && git checkout $branch_name && git pull

Branch: privacy-sql-2025


In [None]:
# Run to authenticate if in Colab (skip when running locally)
# from google.colab import auth
# auth.authenticate_user()

In [39]:
# Run to authenticate if not in Colab
# Prepare the environments as described in src/README.md
!pip install gspread gspread_dataframe tabulate -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [40]:
# @title Authenticate
import google.auth
import os
from google.cloud import bigquery

import gspread
from gspread_dataframe import set_with_dataframe

# Expose the configured project through the environment.
# NOTE(review): bigquery.Client() below is created without an explicit project,
# so it presumably resolves the project from this variable — confirm.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT

# Application Default Credentials are shared by BigQuery and gspread.
credentials, project = google.auth.default()
client = bigquery.Client()
gc = gspread.authorize(credentials)

# Open the target spreadsheet and remember which worksheet titles already exist.
# On failure, fall back to `ss = None` / no known sheets so that later cells can
# still run in dry-run mode.
try:
    ss = gc.open_by_url(spreadsheet_url)
    existing_sheets = [s.title for s in ss.worksheets()]
    print(f"Successfully connected to spreadsheet with {len(existing_sheets)} existing sheets")
except Exception as e:
    # Some exceptions stringify to an empty message, so always show the type as well.
    print(f'Spreadsheet authentication failed: {type(e).__name__}: {e}')
    print("Note: Make sure you have access to the spreadsheet and proper Google credentials")
    ss = None
    existing_sheets = []

Spreadsheet authentication failed: 
Note: Make sure you have access to the spreadsheet and proper Google credentials


In [41]:
# @title Upload query results

import glob
import re
from tabulate import tabulate
from IPython.display import clear_output
import os

# Only files whose base name matches this regular expression (re.search) are processed.
filename_match = '\\.sql$' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
# Files whose base name matches this regular expression (re.search) are skipped.
filename_match_exclude = '^$' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
# When True, each matching query is only cost-estimated and logged as 'Dry run'; nothing is uploaded.
dry_run = True # @param {type: "boolean"}
# When False, queries whose worksheet title already exists are skipped.
overwrite_sheets = False # @param {type: "boolean"}
# Billing cap per query in TB (converted to bytes with 1024^4); a falsy value disables the cap.
maximum_tb_billed = 0.5 # @param {type: "raw", placeholder: "Insert a number or empty to disable"}

# Render the form values as pattern strings for re.search().
filename_include_regexp = r'{}'.format(filename_match)
filename_exclude_regexp = r'{}'.format(filename_match_exclude)

# Glob pattern for the chapter's queries: <cwd>/../<year>/<chapter>/*.sql
folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')

print(f"Looking for SQL files in: {folder}")

# Print formatted logs
queries_processed_log = []
def print_logs_table(log=None, append=True):
    """Redraw the progress table, optionally adding one row.

    Args:
        log: A row of four values (query name, TB processed, sheet name,
            skip reason). When falsy, the table is redrawn unchanged.
        append: When True, `log` is stored permanently in
            `queries_processed_log`. When False, `log` is shown as a
            transient last row for this redraw only (e.g. a "Processing..."
            placeholder) and the stored log is left untouched.
    """
    rows = queries_processed_log
    if log:
        if append:
            queries_processed_log.append(log)
        else:
            # Build a temporary view instead of appending and deleting, so a call
            # without `log` can never remove a real entry.
            rows = queries_processed_log + [log]
    table = tabulate(rows, headers=['Query name', 'TB processed - estimate', 'Sheet name', 'Upload skipped reason'], tablefmt="grid")
    # Replace the previous table in the cell output rather than stacking tables.
    clear_output(wait=True)
    print(table)

# Find matching SQL queries and save results to Google Sheets.
#
# For every *.sql file matched by `folder`:
#   1. skip it unless its name passes the include/exclude regexps;
#   2. dry-run the query to estimate the bytes it would process;
#   3. skip it if its worksheet already exists and overwriting is disabled,
#      if `dry_run` is set, or if no spreadsheet is connected;
#   4. otherwise run the query and write the result into its worksheet.
# Each outcome is reported as one row via print_logs_table().
BYTES_PER_TB = 1024 ** 4

sql_files = sorted(glob.iglob(folder))
print(f"Found {len(sql_files)} SQL files")

if not sql_files:
    print("No SQL files found. Check the folder path.")
else:
    for filepath in sql_files:
        filename = os.path.basename(filepath)

        # Transient placeholder row while this file is being handled.
        print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)

        included = re.search(filename_include_regexp, filename)
        excluded = re.search(filename_exclude_regexp, filename)
        if not included or excluded:
            print_logs_table([filename, None, None, 'Filename mismatch'])
            continue

        # Read explicitly as UTF-8 so the result does not depend on the platform default.
        with open(filepath, encoding='utf-8') as f:
            query = f.read()

        # A dry run validates the query and reports the bytes it would scan.
        try:
            response = client.query(
                query,
                job_config = bigquery.QueryJobConfig(dry_run = True)
            )
        except Exception as e:
            print_logs_table([filename, None, None, f'Dry run query error:\n{e}'])
            continue

        tb_processed = response.total_bytes_processed / BYTES_PER_TB
        # "some_query-name.sql" -> "Some Query Name"
        sheet_title = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()

        sheet_exists = sheet_title in existing_sheets
        if sheet_exists and not overwrite_sheets:
            print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])
            continue

        if dry_run:
            print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])
            continue

        # Skip actual execution if no spreadsheet connection
        if ss is None:
            print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'No spreadsheet connection'])
            continue

        try:
            job_config = bigquery.QueryJobConfig()
            if maximum_tb_billed:
                # The limit is a byte count, so pass a whole number rather than a float.
                job_config.maximum_bytes_billed = int(maximum_tb_billed * BYTES_PER_TB)
            response = client.query(query, job_config = job_config)

            df = response.to_dataframe()

            # Resolve the target worksheet only after the query succeeded, and always
            # from the current spreadsheet rather than a handle left over from an
            # earlier iteration or kernel run.
            if sheet_exists:
                st = ss.worksheet(sheet_title)
                # resize=False never shrinks the sheet, so wipe the old values first;
                # otherwise rows/columns beyond the new result would linger.
                st.clear()
            else:
                st = ss.add_worksheet(sheet_title, rows = 1, cols = 1)
                # Keep the cached titles current so later files and re-runs of this
                # cell see the new sheet.
                existing_sheets.append(sheet_title)
            set_with_dataframe(st, df, resize=False)

            tb_billed = (response.total_bytes_billed or 0) / BYTES_PER_TB
            print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])

        except Exception as e:
            print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\n{e}'])
            continue

+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+
| Query name                                                                |   TB processed/billed | Sheet name                                                            | Upload skipped reason   |
| cookies_top_first_party_names.sql                                         |                 0.081 | Cookies Top First Party Names                                         | Dry run                 |
+---------------------------------------------------------------------------+-----------------------+-----------------------------------------------------------------------+-------------------------+
| cookies_top_third_party_domains.sql                                       |                 0.083 | Cookies Top Third Party Domains                                       | Dry run                 |
