-
Notifications
You must be signed in to change notification settings - Fork 70
/
sp_demo_script.sql
209 lines (166 loc) · 9.77 KB
/
sp_demo_script.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
YouTube:
- https://youtu.be/Yru78Pk1jMM
Use Cases:
- Analytical Lakehouse
Description:
- This is the Main Demo script for the entire demo
Show:
- The entire demo and architecture slide
References:
-
Clean up / Reset script:
DROP ALL ROW ACCESS POLICIES ON `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.rideshare_plus_rides`;
DROP EXTERNAL TABLE IF EXISTS `${project_id}.${aws_omni_biglake_dataset_name}.rideshare_plus_rides`;
*/
--************************************************************************************************************
-- RAW Zone
--************************************************************************************************************
--------------------------------------------------------------------------------------------------------------
-- BigLake Row Level Security and Data Masking
--------------------------------------------------------------------------------------------------------------
-- Select parquet data
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_trip_parquet` LIMIT 1000;
-- Show CSV table (all data)
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`;
-- Query: Create an access policy so the admin (you) can only see Manhattan data
CREATE OR REPLACE ROW ACCESS POLICY rls_biglake_rideshare_zone_csv
ON `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`
GRANT TO ("user:${gcp_account_name}")
FILTER USING (borough = 'Manhattan');
-- See just the data you are allowed to see
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`;
-- Edit the table and show Data Masking on the service zone
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`;
-- Drop the policy (do not break things in future demo steps)
DROP ALL ROW ACCESS POLICIES ON `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_zone_csv`;
--------------------------------------------------------------------------------------------------------------
-- OMNI (Bring in data from other clouds)
--------------------------------------------------------------------------------------------------------------
-- Create a table on data in AWS (S3)
CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${aws_omni_biglake_dataset_name}.rideshare_plus_rides`
WITH PARTITION COLUMNS (
year INTEGER, -- column order must match the external path
month INTEGER
)
WITH CONNECTION `${shared_demo_project_id}.${aws_omni_biglake_dataset_region}.${aws_omni_biglake_connection}`
OPTIONS (
format = "PARQUET",
hive_partition_uri_prefix = "s3://${aws_omni_biglake_s3_bucket}/taxi-data/yellow/trips_table/parquet/",
uris = ['s3://${aws_omni_biglake_s3_bucket}/taxi-data/yellow/trips_table/parquet/*.parquet']
);
-- Retrieve some aggregate data from AWS
-- Use CTAS to query and directy load into a local BigQuery table
CREATE OR REPLACE TABLE `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.rideshare_plus_rides` AS
SELECT CAST(Pickup_DateTime AS DATE) AS PickupDate,
CAST(Dropoff_DateTime AS DATE) AS DropoffDate,
PULocationID AS PickupLocationId,
DOLocationID AS DropoffLocationId,
AVG(Passenger_Count) AS AvgPassengerCnt,
AVG(Tip_Amount) AS AvgTipAmt,
AVG(Total_Amount) AS AvgTotalAmt
FROM `${project_id}.${aws_omni_biglake_dataset_name}.rideshare_plus_rides`
WHERE year=2022
AND month=1
GROUP BY 1, 2, 3, 4;
-- We now have data in Google BigQuery to used for our analysis
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.rideshare_plus_rides`;
--------------------------------------------------------------------------------------------------------------
-- Unstructured Data (Object table)
--------------------------------------------------------------------------------------------------------------
-- Create a table over GCS (we can now see our data lake in a table)
-- Only the connection need access to the lake, not individual users which becomes unmanageable with billions/trillions of files
/*
CREATE OR REPLACE EXTERNAL TABLE `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_images`
WITH CONNECTION `${project_id}.${bigquery_region}.biglake-connection`
OPTIONS (
object_metadata="DIRECTORY",
uris = ['gs://${gcs_rideshare_lakehouse_raw_bucket}/rideshare_images/*.jpg',
'gs://${gcs_rideshare_lakehouse_raw_bucket}/rideshare_images/*.jpeg'],
max_staleness=INTERVAL 30 MINUTE,
metadata_cache_mode="AUTOMATIC"
);
*/
-- Show our objects in GCS / Data Lake
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_images`;
-- Metadata values are recorded as to where the image was taken
SELECT *
FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_images`
WHERE ARRAY_LENGTH(metadata) > 0;
--------------------------------------------------------------------------------------------------------------
-- DataFlow (streaming ingestion)
--------------------------------------------------------------------------------------------------------------
-- Query raw data
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.bigquery_streaming_rideshare_trips` LIMIT 100;
-- Show streaming base table: taxi_dataset.taxi_trips_streaming
--************************************************************************************************************
-- Enriched Zone
--************************************************************************************************************
--------------------------------------------------------------------------------------------------------------
-- Unstructured Data Analysis
--------------------------------------------------------------------------------------------------------------
-- Process the images via our object table and show the results
WITH UnstructuredData AS
(
-- get the image from the oject table
SELECT *
FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_images`
WHERE uri = (SELECT uri FROM `${project_id}.${bigquery_rideshare_lakehouse_raw_dataset}.biglake_rideshare_images` LIMIT 1)
)
, ScoreAI AS
(
-- call a remote function
SELECT `${project_id}.${bigquery_rideshare_lakehouse_enriched_dataset}.ext_udf_ai_localize_objects`(UnstructuredData.uri) AS json_result
FROM UnstructuredData
)
SELECT item.name,
item.score,
ScoreAI.json_result
FROM ScoreAI, UNNEST(JSON_QUERY_ARRAY(ScoreAI.json_result.localized_object_annotations)) AS item;
-- Show all the images processed (object detection, labels, landmarks, logos)
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_enriched_dataset}.bigquery_rideshare_images_ml_score` LIMIT 100;
--------------------------------------------------------------------------------------------------------------
-- Iceberg
--------------------------------------------------------------------------------------------------------------
-- Payment Data:
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_enriched_dataset}.biglake_rideshare_payment_type_iceberg` LIMIT 1000;
-- Trip Data:
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_enriched_dataset}.biglake_rideshare_trip_iceberg` LIMIT 1000;
--************************************************************************************************************
-- Curated Zone
--************************************************************************************************************
--------------------------------------------------------------------------------------------------------------
-- Unstructured Data Analysis (structured for consumption)
--------------------------------------------------------------------------------------------------------------
-- BigSearch
-- How many records are we searching
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip`;
-- Find the credit card 6701-1287-5578-5710 (NOTE: You might need to change this since this is generated data)
-- The table is not partitioned or clustered on this data
-- Click on Job Information and show "Index Usage Mode: FULLY_USED"
SELECT *
FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip`
WHERE SEARCH(credit_card_number,'6701-1287-5578-5710', analyzer=>'NO_OP_ANALYZER');
-- Search even if we do not know the field name
SELECT *
FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_trip` AS bigquery_rideshare_trip
WHERE SEARCH(bigquery_rideshare_trip,'6701-1287-5578-5710', analyzer=>'NO_OP_ANALYZER');
-- See curated image data
SELECT * FROM `${project_id}.${bigquery_rideshare_lakehouse_curated_dataset}.bigquery_rideshare_images_ml_detection` LIMIT 1000;