<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/main/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Download repo
# Clone the Web Almanac repository into the current working directory.
# Later cells read the chapter SQL files from this checkout
# (almanac.httparchive.org/sql/<year>/<chapter>/).
!git clone https://github.com/HTTPArchive/almanac.httparchive.org.git

In [None]:
# @title Configure the chapter to process
# Google Cloud project; exported as GOOGLE_CLOUD_PROJECT before the BigQuery client is created.
GCP_PROJECT = 'httparchive' #@param {type: "string"}
# Year and chapter together select the git branch ("<chapter>-sql-<year>")
# and the SQL folder ("sql/<year>/<chapter>/") processed by the later cells.
almanac_year = 2024 #@param {type: "integer"}
chapter_name = 'privacy' #@param {type: "string"}
# Google Sheets document that receives the query results.
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: "string", placeholder:"Enter spreadsheet URL"}

In [None]:
# @title Update chapter branch
# Chapter branches are named "<chapter>-sql-<year>", with the chapter name lower-cased.
branch_name = f'{chapter_name.lower()}-sql-{almanac_year}'
# Check out that branch in the cloned repo and pull the latest commits.
# `$branch_name` is filled in from the Python variable above by IPython's shell-magic expansion.
!cd almanac.httparchive.org/ && git checkout $branch_name && git pull

In [None]:
# @title Authenticate
import google.auth
import os
from google.colab import auth
from google.cloud import bigquery

import gspread
from gspread_dataframe import set_with_dataframe


# Make the configured project the default one for the Google Cloud client libraries.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT

# Authenticate the Colab user, then reuse the same default credentials for
# both BigQuery and Google Sheets (gspread).
auth.authenticate_user()
credentials, project = google.auth.default()
client = bigquery.Client()
gc = gspread.authorize(credentials)

# Open the target spreadsheet. On failure the reason is reported and `ss` is
# reset to None, so a handle left over from a previous run of this cell can
# never be used by mistake for a different spreadsheet.
try:
    ss = gc.open_by_url(spreadsheet_url)
except Exception as e:
    ss = None
    print(f'Spreadsheet not found: {e}')

In [None]:
# @title Upload query results

import glob
import re
from tabulate import tabulate
from IPython.display import clear_output


# Form parameters for the upload run.
# The regexps are raw string literals so that `\.` reaches the regex engine
# unchanged without relying on Python tolerating an invalid string escape
# (which emits a SyntaxWarning on recent Python versions).
filename_match = r'(number_of_websites_with_related_origin_trials|most_common_cname_domains)\.sql' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
filename_match_exclude = r'(ads_and_sellers_graph)\.sql' # @param {type: "raw", placeholder: "Enter regexp wrapped in quotes"}
dry_run = True # @param {type: "boolean"}
overwrite_sheets = True # @param {type: "boolean"}
maximum_tb_billed = None # @param {type: "raw", placeholder: "Insert a number or empty to disable"}

# Normalise the form values to plain strings for use with `re.search`.
filename_include_regexp = r'{}'.format(filename_match)
filename_exclude_regexp = r'{}'.format(filename_match_exclude)

# Glob pattern matching every SQL file of the configured chapter and year.
folder = r'almanac.httparchive.org/sql/{year}/{chapter}/*.sql'.format(
    year=almanac_year,
    chapter=chapter_name.lower()
)

# Titles of the worksheets already present in the target spreadsheet.
existing_sheets = [s.title for s in ss.worksheets()]

# Print formatted logs
queries_processed_log = []
def print_logs_table(log=None, append=True):
    """Redraw the progress table, optionally adding one row.

    Args:
        log: Row to show, as a list of four values matching the table headers
            (query name, TB processed/billed, sheet name, skip reason).
            When falsy, the table is redrawn without adding anything.
        append: When True the row is stored permanently in
            ``queries_processed_log``. When False the row is shown only in
            this rendering (used for transient "Processing..." rows).
    """
    if log and append:
        queries_processed_log.append(log)
        rows = queries_processed_log
    elif log:
        # Transient row: render from a temporary copy so the permanent log is
        # never modified, even if rendering fails part-way.
        rows = queries_processed_log + [log]
    else:
        rows = queries_processed_log

    table = tabulate(rows, headers=['Query name', 'TB processed/billed', 'Sheet name', 'Upload skipped reason'], tablefmt="grid")
    # Replace the previous output so the table appears to update in place.
    clear_output(wait=True)
    print(table)

# Find matching SQL queries and save results to Google Sheets.
#
# For every SQL file of the chapter:
#   1. skip it unless its name matches the include regexp and not the exclude regexp;
#   2. dry-run the query to validate it and estimate the bytes processed;
#   3. skip it if the target sheet exists and overwriting is disabled;
#   4. stop there when `dry_run` is enabled;
#   5. otherwise run the query and write the result into the worksheet.
# Every outcome is reported through `print_logs_table`.
BYTES_PER_TB = 1024 ** 4

for filepath in sorted(glob.iglob(folder)):
    filename = filepath.split('/')[-1]

    # Transient row, replaced by the final status of this file below.
    print_logs_table([filename, 'Processing...', 'Processing...', 'Processing...'], append=False)

    included = re.search(filename_include_regexp, filename)
    # An empty exclude pattern would match every filename; treat it as "exclude nothing".
    excluded = bool(filename_exclude_regexp) and re.search(filename_exclude_regexp, filename)
    if not included or excluded:
        print_logs_table([filename, None, None, 'Filename mismatch'])
        continue

    with open(filepath) as f:
        query = f.read()

    # The dry run validates the query and reports the bytes it would process
    # without running it.
    try:
        response = client.query(
            query,
            job_config=bigquery.QueryJobConfig(dry_run=True)
        )
    except Exception as e:
        print_logs_table([filename, None, None, f'Dry run query error:\n{e}'])
        continue

    tb_processed = response.total_bytes_processed / BYTES_PER_TB

    # Sheet title: file name without ".sql", non-alphanumeric runs turned into
    # spaces, in Title Case.
    sheet_title = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()
    sheet_exists = sheet_title in existing_sheets

    if sheet_exists and not overwrite_sheets:
        print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Sheet already exists'])
        continue

    if dry_run:
        print_logs_table([filename, f'{tb_processed:.3f}', sheet_title, 'Dry run'])
        continue

    try:
        job_config = None
        if maximum_tb_billed:
            # The billing cap is a whole number of bytes; the form value may be
            # fractional (e.g. 0.5 TB), so convert explicitly.
            job_config = bigquery.QueryJobConfig(
                maximum_bytes_billed=int(maximum_tb_billed * BYTES_PER_TB)
            )
        response = client.query(query, job_config=job_config)
        df = response.to_dataframe()

        # Resolve the worksheet for this file explicitly instead of relying on
        # a variable left over from a previous iteration.
        if sheet_exists:
            st = ss.worksheet(sheet_title)
        else:
            st = ss.add_worksheet(sheet_title, rows=1, cols=1)
            # Remember the new sheet so that a later file mapping to the same
            # title follows the `overwrite_sheets` setting.
            existing_sheets.append(sheet_title)

        # NOTE(review): with resize=False an existing sheet is presumably not
        # shrunk or cleared first, so cells outside the new data range may keep
        # old values - confirm against gspread-dataframe.
        set_with_dataframe(st, df, resize=False)

        tb_billed = response.total_bytes_billed / BYTES_PER_TB
        print_logs_table([filename, f'{tb_billed:.3f}', sheet_title, None])

    except Exception as e:
        print_logs_table([filename, f'{tb_processed:.3f}', None, f'Query error:\n{e}'])
        continue