# Build and Optimize Data Warehouses with BigQuery

## Working with JSON, Arrays, and Structs in BigQuery

In [1]:
%%bash

pip install -U pip
pip install -U wheel setuptools
pip install -U numpy pandas matplotlib seaborn pyarrow tqdm
pip install -U google-cloud-bigquery google-cloud-bigquery-storage



In [2]:
import os
from google.cloud import bigquery

# Tell the Google Cloud client libraries which service-account key file to use.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already present
# in the environment, so the notebook also runs on machines where credentials are
# configured externally; the machine-specific path below is only the fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/meng/work/.GCP_SA/mlee-claritas-bigdata-poc.json",
)

In [3]:
# Load the BigQuery IPython extension; the query cells in this notebook rely on
# the %%bigquery cell magic.
%load_ext google.cloud.bigquery

### Arrays

In [4]:
%%bigquery
-- Return a single row whose only column is an array literal of fruit names.
SELECT
  [
    'raspberry',
    'blackberry',
    'strawberry',
    'cherry'
  ] AS fruit_array

Query complete after 0.01s: 100%|██████████| 1/1 [00:00<00:00, 510.07query/s]                          
Downloading: 100%|██████████| 1/1 [00:03<00:00,  3.57s/rows]


Unnamed: 0,fruit_array
0,"[raspberry, blackberry, strawberry, cherry]"


In [5]:
%%bigquery
-- For one visit, collapse the session rows into one row per visitor and date,
-- gathering the product names and page titles of that day into arrays.
SELECT
  fullVisitorId,
  date,
  ARRAY_AGG(v2ProductName) AS products_viewed,
  ARRAY_AGG(pageTitle)     AS pages_viewed
FROM
  `data-to-insights.ecommerce.all_sessions`
WHERE
  visitId = 1501570398
GROUP BY
  fullVisitorId,
  date
ORDER BY
  date

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1168.98query/s]                        
Downloading: 100%|██████████| 2/2 [00:02<00:00,  1.45s/rows]


Unnamed: 0,fullVisitorId,date,products_viewed,pages_viewed
0,5710379250208908569,20170731,[Google Women's Lightweight Microfleece Jacket...,"[Google Snapback Hat Black, Google RFID Journal]"
1,5710379250208908569,20170801,"[Rocket Flashlight, Rubber Grip Ballpoint Pen ...","[Electronics | Google Merchandise Store, Writi..."


In [6]:
%%bigquery
-- Same per-visitor/per-date aggregation as an ARRAY_AGG rollup, plus the
-- element count of each aggregated array (duplicates are counted).
SELECT
  fullVisitorId,
  date,
  ARRAY_AGG(v2ProductName)               AS products_viewed,
  ARRAY_LENGTH(ARRAY_AGG(v2ProductName)) AS num_products_viewed,
  ARRAY_AGG(pageTitle)                   AS pages_viewed,
  ARRAY_LENGTH(ARRAY_AGG(pageTitle))     AS num_pages_viewed
FROM
  `data-to-insights.ecommerce.all_sessions`
WHERE
  visitId = 1501570398
GROUP BY
  fullVisitorId,
  date
ORDER BY
  date

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1589.76query/s]                        
Downloading: 100%|██████████| 2/2 [00:03<00:00,  1.76s/rows]


Unnamed: 0,fullVisitorId,date,products_viewed,num_products_viewed,pages_viewed,num_pages_viewed
0,5710379250208908569,20170731,[Google Women's Lightweight Microfleece Jacket...,2,"[Google Snapback Hat Black, Google RFID Journal]",2
1,5710379250208908569,20170801,"[Google 4400mAh Power Bank, Google Device Hold...",109,"[Electronics | Google Merchandise Store, Elect...",109


In [7]:
%%bigquery
-- Per visitor and date: arrays of the DISTINCT product names and page titles,
-- with ARRAY_LENGTH giving the number of unique values in each array.
SELECT
  fullVisitorId,
  date,
  ARRAY_AGG(DISTINCT v2ProductName)               AS products_viewed,
  ARRAY_LENGTH(ARRAY_AGG(DISTINCT v2ProductName)) AS distinct_products_viewed,
  ARRAY_AGG(DISTINCT pageTitle)                   AS pages_viewed,
  ARRAY_LENGTH(ARRAY_AGG(DISTINCT pageTitle))     AS distinct_pages_viewed
FROM
  `data-to-insights.ecommerce.all_sessions`
WHERE
  visitId = 1501570398
GROUP BY
  fullVisitorId,
  date
ORDER BY
  date

Query complete after 0.00s: 100%|██████████| 5/5 [00:00<00:00, 2549.11query/s]                        
Downloading: 100%|██████████| 2/2 [00:03<00:00,  1.75s/rows]


Unnamed: 0,fullVisitorId,date,products_viewed,distinct_products_viewed,pages_viewed,distinct_pages_viewed
0,5710379250208908569,20170731,[Google Women's Lightweight Microfleece Jacket...,2,"[Google Snapback Hat Black, Google RFID Journal]",2
1,5710379250208908569,20170801,"[Android Wool Heather Cap Heather/Black, Andro...",61,"[Shop by Brand | Google Merchandise Store, Off...",8


In [9]:
%%bigquery
-- Flatten the repeated `hits` field so each hit becomes its own row, then list
-- the distinct page titles recorded for one visit.
SELECT DISTINCT
  visitId,
  h.page.pageTitle
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
CROSS JOIN
  UNNEST(hits) AS h
WHERE
  visitId = 1501570398
LIMIT
  10

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1845.81query/s]                        
Downloading: 100%|██████████| 9/9 [00:03<00:00,  2.41rows/s]


Unnamed: 0,visitId,pageTitle
0,1501570398,Fun | Accessories | Google Merchandise Store
1,1501570398,Home
2,1501570398,Shop by Brand | Google Merchandise Store
3,1501570398,Office | Google Merchandise Store
4,1501570398,Other | Office | Google Merchandise Store
5,1501570398,Writing Instruments | Office | Google Merchand...
6,1501570398,Accessories | Google Merchandise Store
7,1501570398,Electronics | Google Merchandise Store
8,1501570398,Apparel | Google Merchandise Store


### STRUCTs

A STRUCT behaves like a separate table that is already pre-joined into the main table.

A STRUCT can have:
- one or many fields in it
- the same or different data types for each field
- its own alias

In [11]:
%%bigquery df
-- Expand every field of the `totals` and `device` STRUCT columns into top-level
-- columns for one visit; the result is stored in the DataFrame `df`.
SELECT
  visitId,
  totals.*,
  device.*
FROM
  `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
WHERE
  visitId = 1501570398
LIMIT
  10

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 787.37query/s] 
Downloading: 100%|██████████| 1/1 [00:04<00:00,  4.39s/rows]


In [12]:
# Summarize the query result held in `df`: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   visitId                    1 non-null      int64  
 1   visits                     1 non-null      int64  
 2   hits                       1 non-null      int64  
 3   pageviews                  1 non-null      int64  
 4   timeOnSite                 1 non-null      int64  
 5   bounces                    0 non-null      float64
 6   transactions               0 non-null      float64
 7   transactionRevenue         0 non-null      float64
 8   newVisits                  1 non-null      int64  
 9   screenviews                0 non-null      float64
 10  uniqueScreenviews          0 non-null      float64
 11  timeOnScreen               0 non-null      float64
 12  totalTransactionRevenue    0 non-null      float64
 13  sessionQualityDim          1 non-null      int64  
 14

In [13]:
%%bigquery
-- Build a STRUCT literal with two named fields and return it as column `runner`.
SELECT
  STRUCT(
    "Rudisha" AS name,
    23.4      AS split
  ) AS runner

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 532.27query/s]                          
Downloading: 100%|██████████| 1/1 [00:03<00:00,  3.56s/rows]


Unnamed: 0,runner
0,"{'name': 'Rudisha', 'split': 23.4}"


In [14]:
%%bigquery
-- Build a STRUCT literal whose `splits` field is itself an array of numbers.
SELECT
  STRUCT(
    "Rudisha"                AS name,
    [23.4, 26.3, 26.4, 26.1] AS splits
  ) AS runner

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 739.74query/s]                          
Downloading: 100%|██████████| 1/1 [00:03<00:00,  3.46s/rows]


Unnamed: 0,runner
0,"{'name': 'Rudisha', 'splits': [23.4, 26.3, 26...."


In [None]:
%%bigquery
-- List each race together with the name of every participant.
-- The repeated `participants` field is referenced through the table name (no
-- alias is declared) and cross-joined, giving one row per participant.
-- NOTE: assumes a `racing.race_results` table exists in the default project.
SELECT race, participants.name
FROM racing.race_results
CROSS JOIN
race_results.participants # full STRUCT name

In [None]:
%%bigquery
-- Same listing of races and participant names, written with a table alias and
-- the comma (cross join) shorthand against the repeated `participants` field.
-- NOTE: assumes a `racing.race_results` table exists in the default project.
SELECT race, participants.name
FROM racing.race_results AS r, r.participants

In [None]:
%%bigquery
-- Count the participants across all races by unnesting the repeated
-- `participants` field into one row per participant.
-- NOTE: assumes a `racing.race_results` table exists in the default project.
SELECT COUNT(p.name) AS racer_count
FROM racing.race_results AS r, UNNEST(r.participants) AS p

In [None]:
%%bigquery
-- Total race time per participant whose name starts with 'R':
-- unnest participants, then unnest each participant's `splits` array so every
-- split becomes a row that SUM can add up; fastest total first.
-- NOTE: assumes a `racing.race_results` table exists in the default project.
SELECT
  p.name,
  SUM(split_times) as total_race_time
FROM racing.race_results AS r
, UNNEST(r.participants) AS p
, UNNEST(p.splits) AS split_times
WHERE p.name LIKE 'R%'
GROUP BY p.name
ORDER BY total_race_time ASC;

In [None]:
%%bigquery
-- Find which participant ran a split of exactly 23.2 by flattening
-- participants and their `splits` arrays into one row per split.
-- NOTE: assumes a `racing.race_results` table exists in the default project.
SELECT
  p.name,
  split_time
FROM racing.race_results AS r
, UNNEST(r.participants) AS p
, UNNEST(p.splits) AS split_time
WHERE split_time = 23.2;