<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/main/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# @title Configuration
# Shared settings read by the cells below.
# GCP_PROJECT is exported as GOOGLE_CLOUD_PROJECT when the clients are set up.
GCP_PROJECT = 'httparchive' #@param {type: "string"}
# almanac_year and chapter_name together determine the git branch
# (`<chapter>-sql-<year>`) and the SQL folder (`../<year>/<chapter>/*.sql`).
almanac_year = 2025 #@param {type: "integer"}
chapter_name = 'privacy' #@param {type: "string"}
# Spreadsheet that receives one worksheet per uploaded query result.
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1Svyw40Th7VbigX6lpR1lb1WXwTUVKZWrK7O2YELrml4/edit' #@param {type: "string", placeholder:"Enter spreadsheet URL"}

In [None]:
# @title Download repo (Colab only - skip when running locally)
!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git
!cd almanac.httparchive.org/

In [2]:
# @title Update chapter branch (Colab only - skip when running locally)
# Chapter branches are named `<chapter>-sql-<year>`, e.g. privacy-sql-2025
# with the configuration shown above.
branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'
print(f"Switching to branch: {branch_name}")
# IPython substitutes $branch_name before the line reaches the shell. git runs
# in the notebook's current working directory, and `git pull` only runs when
# the checkout succeeded (`&&`).
!git checkout $branch_name && git pull

Branch: privacy-sql-2025
M	sql/util/bq_to_sheets.ipynb
M	src/requirements.txt
Already on 'privacy-sql-2025'
Your branch is up to date with 'origin/privacy-sql-2025'.
Already up to date.


In [None]:
# @title Authenticate (Colab only - skip when running locally)
# google.colab is imported here rather than with the other imports because
# this cell is meant to be skipped outside Colab.
from google.colab import auth
# NOTE(review): presumably starts Colab's interactive sign-in so that the later
# google.auth.default() call can find user credentials - confirm.
auth.authenticate_user()

In [12]:
# @title Setup BigQuery and Google Sheets clients
import google.auth
import os
from google.cloud import bigquery
import gspread
from gspread_dataframe import set_with_dataframe

# Export the configured project ID through the GOOGLE_CLOUD_PROJECT
# environment variable; bigquery.Client below is created without an explicit
# project argument.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT

# Authenticate with required scopes for BigQuery and Google Sheets
# (Sheets + Drive for the spreadsheet, BigQuery for the queries).
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/bigquery'
]

# One set of credentials is shared by both clients. The `project` value
# returned here is not used anywhere in this cell.
credentials, project = google.auth.default(scopes=SCOPES)
client = bigquery.Client(credentials=credentials)
gc = gspread.authorize(credentials)

# Connect to spreadsheet
ss = gc.open_by_url(spreadsheet_url)
# Snapshot of worksheet titles taken once, at connection time; the upload cell
# consults and appends to this list instead of re-querying the spreadsheet.
existing_sheets = [s.title for s in ss.worksheets()]
print(f"✓ Connected to spreadsheet with {len(existing_sheets)} existing sheets")

✓ Connected to spreadsheet with 28 existing sheets


In [38]:
# @title Upload query results to Google Sheets
import glob
import re
from tabulate import tabulate
from IPython.display import clear_output, display, HTML

# Query filters and options
filename_match = 'most_common_cmps_for_iab_tcf_v2.sql' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
filename_match_exclude = '' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
dry_run = False # @param {type: "boolean"}
overwrite_sheets = True # @param {type: "boolean"}
maximum_tb_billed = 7 # @param {type: "raw", placeholder: "Max TB to bill per query"}

# Resolve the effective filters: an empty or '*' include filter accepts every
# file, and an empty exclude filter falls back to '^$', which can only match
# an empty name and therefore excludes nothing.
if filename_match and filename_match != '*':
    filename_include_regexp = filename_match
else:
    filename_include_regexp = r'.*'
filename_exclude_regexp = filename_match_exclude or r'^$'

# SQL files are expected in ../<year>/<chapter>/ relative to the working directory
sql_folder = os.path.join(os.getcwd(), '../', str(almanac_year), chapter_name.lower(), '*.sql')
print(f"Looking for SQL files in: {sql_folder}")
sql_files = glob.glob(sql_folder)
sql_files.sort()

# Nothing matched the glob: report it and do no further work. All processing
# (logging helper, per-file loop, summary) lives in the else branch.
if not sql_files:
    print("❌ No SQL files found. Check the folder path.")
else:
    print(f"Found {len(sql_files)} SQL files\n")

    # One row per file that has reached a final state (uploaded, skipped, failed)
    queries_processed_log = []

    def log_result(filename, tb_processed=None, sheet_name=None, skip_reason=None, preview=False):
        """Record a row for `filename` and redraw the progress table.

        With preview=True the row is rendered after the recorded rows but is
        not stored, so the next call replaces it on screen.
        """
        row = [filename, tb_processed, sheet_name, skip_reason]

        if preview:
            # Transient row: show it without touching the stored log
            rows_to_show = queries_processed_log + [row]
        else:
            queries_processed_log.append(row)
            rows_to_show = queries_processed_log

        rendered = tabulate(rows_to_show, headers=['Query', 'TB Billed', 'Sheet', 'Status/Skip Reason'], tablefmt="grid")
        # Replace the previous table in the cell output instead of stacking tables
        clear_output(wait=True)
        print(rendered)

    # Process each SQL file: filter -> dry-run estimate -> run -> upload.
    # Every file ends with exactly one stored log_result() row.
    for filepath in sql_files:
        filename = os.path.basename(filepath)

        # Show processing status (transient row, not stored in the log)
        log_result(filename, 'Processing...', 'Processing...', 'Processing...', preview=True)

        # Check if filename matches filters
        if not re.search(filename_include_regexp, filename) or re.search(filename_exclude_regexp, filename):
            log_result(filename, None, None, 'Filename filter mismatch')
            continue

        # Read query. The encoding is explicit so the result does not depend on
        # the platform's default encoding when the notebook is run locally.
        with open(filepath, encoding='utf-8') as f:
            query = f.read()

        # Estimate query cost (dry run); 1024**4 converts bytes to TB
        try:
            dry_run_response = client.query(query, job_config=bigquery.QueryJobConfig(dry_run=True))
            tb_processed = dry_run_response.total_bytes_processed / 1024**4
        except Exception as e:
            log_result(filename, None, None, f'Dry run error: {str(e)[:100]}...')
            continue

        # Generate sheet title from filename: '.sql' and every run of
        # non-alphanumeric characters become a space, then Title Case
        # (e.g. 'cookies_top_third_party_names.sql' -> 'Cookies Top Third Party Names')
        sheet_title = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()

        # Skip execution if dry run mode
        if dry_run:
            log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Dry run mode')
            continue

        # Check if sheet already exists
        if sheet_title in existing_sheets and not overwrite_sheets:
            log_result(filename, f'{tb_processed:.3f}', sheet_title, 'Sheet exists (set overwrite_sheets=True)')
            continue

        # Execute query and upload to Sheets.
        # Reset per file so the error handler below can tell whether a job was
        # ever created, and never reports a figure left over from another file.
        query_response = None
        try:
            # Run query with billing limit
            job_config = bigquery.QueryJobConfig()
            if maximum_tb_billed:
                job_config.maximum_bytes_billed = int(maximum_tb_billed * 1024**4)

            query_response = client.query(query, job_config=job_config)
            df = query_response.to_dataframe()

            # Get or create sheet
            if sheet_title in existing_sheets:
                sheet = ss.worksheet(sheet_title)
            else:
                sheet = ss.add_worksheet(sheet_title, rows=1, cols=1)
                existing_sheets.append(sheet_title)

            # Upload data
            # NOTE(review): with resize=False an existing sheet is not shrunk,
            # so overwriting with a shorter result may leave old trailing rows
            # behind - confirm whether the sheet should be cleared first.
            set_with_dataframe(sheet, df, resize=False)

            tb_billed = query_response.total_bytes_billed / 1024**4
            log_result(filename, f'{tb_billed:.3f}', sheet_title, '✓ Uploaded')

        except Exception as e:
            # The billed figure is only known if a job was created and reported
            # one. The failure may have happened before that, so fall back to
            # an empty cell rather than referencing an unset variable.
            billed_bytes = getattr(query_response, 'total_bytes_billed', None)
            tb_billed_text = f'{billed_bytes / 1024**4:.3f}' if billed_bytes is not None else None
            log_result(filename, tb_billed_text, None, f'Query error: {str(e)[:100]}...')

    print(f"\n✓ Processed {len(queries_processed_log)} queries")

+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+
| Query                                                                     |   TB Billed | Sheet                           | Status/Skip Reason       |
| cookies_top_first_party_names.sql                                         |             |                                 | Filename filter mismatch |
+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+
| cookies_top_third_party_domains.sql                                       |             |                                 | Filename filter mismatch |
+---------------------------------------------------------------------------+-------------+---------------------------------+--------------------------+
| cookies_top_third_party_names.sql                                         |     