<a href="https://colab.research.google.com/github/HTTPArchive/almanac.httparchive.org/blob/fellow-vicuna/sql/util/bq_to_sheets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# @title Configure the chapter
year = 2024 #@param {type: "integer"}
chapter = "privacy" #@param {type: "string"}

# BigQuery project name.
GCP_PROJECT = "httparchive" #@param {type: "string"}

# Git branch name built from the chapter and year, e.g. "privacy-sql-2024".
branch_name = "-".join([chapter.lower(), "sql", str(year)])

# Glob pattern for the chapter's SQL files under the repository folder.
folder = f"almanac.httparchive.org/sql/{year}/{chapter.lower()}/*.sql"

# Google Sheets spreadsheet name, e.g. "Privacy (Web Almanac 2024)".
spreadsheet_name = "{} (Web Almanac {})".format(chapter.capitalize(), year)

# URL of an existing spreadsheet to use; set to `None` to create a new one.
existing_spreadsheet_url = 'https://docs.google.com/spreadsheets/d/18r8cT6x9lPdM-rXvXjsqx84W7ZDdTDYGD59xr0UGOwg/edit' #@param {type: "string"}

In [None]:
# @title Download repo
# Shell escape: clones the almanac repository with the branch given by the
# Python variable `branch_name` checked out (`-b`). IPython substitutes
# `$branch_name` before the command is passed to the shell.
!git clone -b $branch_name https://github.com/HTTPArchive/almanac.httparchive.org.git

In [None]:
# @title Update local branch (if new commits)
# Shell escape: inside the `almanac.httparchive.org/` directory, check out
# `branch_name` and pull. Each step runs only if the previous one
# succeeded (`&&`).
!cd almanac.httparchive.org/ && git checkout $branch_name && git pull

In [None]:
# @title Authenticate
import google.auth
import os
from google.colab import auth
from google.cloud import bigquery

import gspread
from gspread_dataframe import set_with_dataframe

# Export the configured project name as GOOGLE_CLOUD_PROJECT before the
# Google clients are created.
os.environ["GOOGLE_CLOUD_PROJECT"] = GCP_PROJECT
auth.authenticate_user()
credentials, project = google.auth.default()
client = bigquery.Client()
gc = gspread.authorize(credentials)

# Use the configured spreadsheet when a URL is set and it can be opened;
# otherwise create a new spreadsheet named `spreadsheet_name`.
ss = None
if existing_spreadsheet_url:
  try:
    ss = gc.open_by_url(existing_spreadsheet_url)
    print(f'Using existing spreadsheet: {ss.url}')
  except Exception as err:
    # `except Exception` instead of a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate. The reason is printed so that a wrong URL
    # or missing permission is visible before falling back to a new
    # spreadsheet.
    print(f'Could not open {existing_spreadsheet_url}: {err!r}')

if ss is None:
  ss = gc.create(spreadsheet_name)
  print(f'Created a new spreadsheet: {spreadsheet_name}: {ss.url}')

# Titles of the worksheets present in the spreadsheet at this point.
existing_sheets = [s.title for s in ss.worksheets()]

In [None]:
# @title Upload query results
import glob
import re
from tabulate import tabulate
from IPython.display import clear_output


# File names are tested with `re.search`: a file is processed only when it
# matches the include pattern and does not match the exclude pattern.
include_regexp = '^.+sql$' # @param {type: "string"}
exclude_regexp = "^$" # @param {type: "string"}
file_match_include = f'{include_regexp}'
file_match_exclude = f'{exclude_regexp}'

# overwrite: replace a worksheet whose title already exists instead of skipping it.
# dry_run: only log the estimated data volume; do not run queries or write sheets.
# tb_processed_limit: skip queries whose estimate reaches this value; 0 disables the check.
overwrite = False # @param {type: "boolean"}
dry_run = False # @param {type: "boolean"}
tb_processed_limit = 0 # @param {type: "number"}

# Log rows, one per handled file, in the column order used by the table below.
queries_processed_log = []
def print_logs_table():
    """Replace the cell output with a grid table of `queries_processed_log`."""
    column_titles = ['Query name', 'Skip reason', 'Total Tb billed', 'Sheet name']
    rendered = tabulate(
        queries_processed_log,
        headers=column_titles,
        tablefmt="grid",
    )
    # wait=True: the old output is cleared only once new output is available.
    clear_output(wait=True)
    print(rendered)

# Find matching SQL queries and save results to Google Sheets.
#
# For every file matched by `folder`:
#   1. skip it unless its name matches `file_match_include` and does not
#      match `file_match_exclude`;
#   2. dry-run the query to get the data volume it would process;
#   3. stop there when `dry_run` is set, when the volume reaches
#      `tb_processed_limit`, or when the target sheet already exists and
#      `overwrite` is off;
#   4. otherwise run the query and write the result to a worksheet whose
#      title is derived from the file name.
# Every outcome is appended to `queries_processed_log`.
#
# Files are sorted so the processing order (and the log table) is the same on
# every run; `glob.iglob` alone yields them in arbitrary order.
for filepath in sorted(glob.iglob(folder)):
    filename = filepath.split('/')[-1]

    # Show a temporary "Processing..." row while this file is handled; it is
    # removed right away and replaced by the real outcome further down.
    queries_processed_log.append([filename, 'Processing...', 'Processing...', 'Processing...'])
    print_logs_table()
    del queries_processed_log[-1]

    if not re.search(file_match_include, filename) \
    or re.search(file_match_exclude, filename):
        queries_processed_log.append([filename, 'Regex match', None, None])
        print_logs_table()
        continue

    with open(filepath, encoding='utf-8') as f:
        query = f.read()

    # Dry run: submitted only to read `total_bytes_processed`.
    response = client.query(
        query,
        job_config = bigquery.QueryJobConfig(dry_run = True)
    )

    # Bytes -> units of 1024**4 bytes.
    tb_processed = response.total_bytes_processed/1024/1024/1024/1024
    tb_label = f'{tb_processed:.3f}'

    if dry_run:
        queries_processed_log.append([filename, 'Dry run', tb_label, None])
        continue

    # A limit of 0 is falsy and therefore disables the volume check.
    if tb_processed_limit and tb_processed_limit <= tb_processed:
        queries_processed_log.append([filename, 'Data Volume', tb_label, None])
        continue

    # "some_query-name.sql" -> "Some Query Name"
    sheet_title = re.sub(r'(\.sql|[^a-zA-Z0-9]+)', ' ', filename).strip().title()
    sheet_exists = sheet_title in existing_sheets
    if sheet_exists and not overwrite:
        queries_processed_log.append([filename, 'No overwrite', tb_label, None])
        continue

    # Run the query BEFORE touching the spreadsheet, so that a failing query
    # does not leave the previous worksheet already deleted.
    df = client.query(query).to_dataframe()
    rows, cols = df.shape

    if sheet_exists:
        ss.del_worksheet(ss.worksheet(sheet_title))

    st = ss.add_worksheet(title = sheet_title, rows = rows, cols = cols)
    if not sheet_exists:
        # Keep the title list in sync with the spreadsheet; otherwise a re-run
        # of this cell would try to add a worksheet with a duplicate title.
        existing_sheets.append(sheet_title)
    set_with_dataframe(st, df)
    queries_processed_log.append([filename, None, tb_label, sheet_title])

    print_logs_table()

print_logs_table()